Message ID | 20230127175916.65389-3-alexandru.elisei@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | arm: Add PSCI CPU_OFF test | expand |
On Fri, Jan 27, 2023 at 05:59:16PM +0000, Alexandru Elisei wrote: > From: Nikita Venkatesh <Nikita.Venkatesh@arm.com> > > The test uses the following method. > > The primary CPU brings up all the secondary CPUs, which are held in a wait > loop. Once the primary releases the CPUs, each of the secondary CPUs > proceed to issue CPU_OFF. > > The primary CPU then checks for the status of the individual CPU_OFF > request. There is a chance that some CPUs might return from the CPU_OFF > function call after the primary CPU has finished the scan. There is no > foolproof method to handle this, but the test tries its best to > eliminate these false positives by introducing an extra delay if all the > CPUs are reported offline after the initial scan. > > Signed-off-by: Nikita Venkatesh <Nikita.Venkatesh@arm.com> > [ Alex E: Skip CPU_OFF test if CPU_ON failed, drop cpu_off_success in > favour of checking AFFINITY_INFO, commit message tweaking ] > Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com> > --- > > Decided to drop Drew's Reviewed-by tag because the changes are not trivial > from the previous version. > > arm/psci.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 75 insertions(+), 5 deletions(-) > > diff --git a/arm/psci.c b/arm/psci.c > index f7238f8e0bbd..7034d8ebe6e1 100644 > --- a/arm/psci.c > +++ b/arm/psci.c > @@ -72,8 +72,9 @@ static bool psci_affinity_info_off(void) > } > > static int cpu_on_ret[NR_CPUS]; > -static cpumask_t cpu_on_ready, cpu_on_done; > +static cpumask_t cpu_on_ready, cpu_on_done, cpu_off_done; > static volatile int cpu_on_start; > +static volatile int cpu_off_start; > > extern void secondary_entry(void); > static void cpu_on_do_wake_target(void) > @@ -171,9 +172,71 @@ static bool psci_cpu_on_test(void) > return !failed; > } > > -int main(void) > +static void cpu_off_secondary_entry(void *data) > +{ > + int cpu = smp_processor_id(); > + > + while (!cpu_off_start) > + cpu_relax(); > + cpumask_set_cpu(cpu, &cpu_off_done); > + cpu_psci_cpu_die(); > +} > + > +static bool psci_cpu_off_test(void) > +{ > + bool failed = false; > + int i, count, cpu; > + > + for_each_present_cpu(cpu) { > + if (cpu == 0) > + continue; > + on_cpu_async(cpu, cpu_off_secondary_entry, NULL); > + } > + > + cpumask_set_cpu(0, &cpu_off_done); > + > + cpu_off_start = 1; > + report_info("waiting for the CPUs to be offlined..."); > + while (!cpumask_full(&cpu_off_done)) > + cpu_relax(); > + > + /* Allow all the other CPUs to complete the operation */ > + for (i = 0; i < 100; i++) { > + mdelay(10); > + > + count = 0; > + for_each_present_cpu(cpu) { > + if (cpu == 0) > + continue; > + if (psci_affinity_info(cpus[cpu], 0) != PSCI_0_2_AFFINITY_LEVEL_OFF) > + count++; > + } > + if (count > 0) > + continue; This should be if (count == 0) break; otherwise we never leave the loop early. > + } > + > + /* Try to catch CPUs that return from CPU_OFF. */ > + if (count == 0) > + mdelay(100); > + > + for_each_present_cpu(cpu) { > + if (cpu == 0) > + continue; > + if (cpu_idle(cpu)) { > + report_info("CPU%d failed to be offlined", cpu); > + if (psci_affinity_info(cpus[cpu], 0) == PSCI_0_2_AFFINITY_LEVEL_OFF) > + report_info("AFFINITY_INFO incorrectly reports CPU%d as offline", cpu); > + failed = true; > + } > + } > + > + return !failed; > +} > + > +int main(int argc, char **argv) > { > int ver = psci_invoke(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); > + bool cpu_on_success = true; > > report_prefix_push("psci"); > > @@ -188,10 +251,17 @@ int main(void) > report(psci_affinity_info_on(), "affinity-info-on"); > report(psci_affinity_info_off(), "affinity-info-off"); > > - if (ERRATA(6c7a5dce22b3)) > - report(psci_cpu_on_test(), "cpu-on"); > - else > + if (ERRATA(6c7a5dce22b3)) { > + cpu_on_success = psci_cpu_on_test(); > + report(cpu_on_success, "cpu-on"); > + } else { > report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); > + } > + > + if (!cpu_on_success) > + report_skip("Skipping cpu-off test because the cpu-on test failed"); We should output "was skipped" when the cpu-on test was skipped, rather than always reporting "failed". We need two booleans, try_cpu_on_test and cpu_on_success. > + else > + report(psci_cpu_off_test(), "cpu-off"); > > done: > #if 0 > -- > 2.39.0 > Thanks, drew
Hi Drew, On Tue, Jan 31, 2023 at 07:56:23AM +0100, Andrew Jones wrote: > On Fri, Jan 27, 2023 at 05:59:16PM +0000, Alexandru Elisei wrote: > > From: Nikita Venkatesh <Nikita.Venkatesh@arm.com> > > > > The test uses the following method. > > > > The primary CPU brings up all the secondary CPUs, which are held in a wait > > loop. Once the primary releases the CPUs, each of the secondary CPUs > > proceed to issue CPU_OFF. > > > > The primary CPU then checks for the status of the individual CPU_OFF > > request. There is a chance that some CPUs might return from the CPU_OFF > > function call after the primary CPU has finished the scan. There is no > > foolproof method to handle this, but the test tries its best to > > eliminate these false positives by introducing an extra delay if all the > > CPUs are reported offline after the initial scan. > > > > Signed-off-by: Nikita Venkatesh <Nikita.Venkatesh@arm.com> > > [ Alex E: Skip CPU_OFF test if CPU_ON failed, drop cpu_off_success in > > favour of checking AFFINITY_INFO, commit message tweaking ] > > Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com> > > --- > > > > Decided to drop Drew's Reviewed-by tag because the changes are not trivial > > from the previous version. > > > > arm/psci.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++---- > > 1 file changed, 75 insertions(+), 5 deletions(-) > > > > diff --git a/arm/psci.c b/arm/psci.c > > index f7238f8e0bbd..7034d8ebe6e1 100644 > > --- a/arm/psci.c > > +++ b/arm/psci.c > > @@ -72,8 +72,9 @@ static bool psci_affinity_info_off(void) > > } > > > > static int cpu_on_ret[NR_CPUS]; > > -static cpumask_t cpu_on_ready, cpu_on_done; > > +static cpumask_t cpu_on_ready, cpu_on_done, cpu_off_done; > > static volatile int cpu_on_start; > > +static volatile int cpu_off_start; > > > > extern void secondary_entry(void); > > static void cpu_on_do_wake_target(void) > > @@ -171,9 +172,71 @@ static bool psci_cpu_on_test(void) > > return !failed; > > } > > > > -int main(void) > > +static void cpu_off_secondary_entry(void *data) > > +{ > > + int cpu = smp_processor_id(); > > + > > + while (!cpu_off_start) > > + cpu_relax(); > > + cpumask_set_cpu(cpu, &cpu_off_done); > > + cpu_psci_cpu_die(); > > +} > > + > > +static bool psci_cpu_off_test(void) > > +{ > > + bool failed = false; > > + int i, count, cpu; > > + > > + for_each_present_cpu(cpu) { > > + if (cpu == 0) > > + continue; > > + on_cpu_async(cpu, cpu_off_secondary_entry, NULL); > > + } > > + > > + cpumask_set_cpu(0, &cpu_off_done); > > + > > + cpu_off_start = 1; > > + report_info("waiting for the CPUs to be offlined..."); > > + while (!cpumask_full(&cpu_off_done)) > > + cpu_relax(); > > + > > + /* Allow all the other CPUs to complete the operation */ > > + for (i = 0; i < 100; i++) { > > + mdelay(10); > > + > > + count = 0; > > + for_each_present_cpu(cpu) { > > + if (cpu == 0) > > + continue; > > + if (psci_affinity_info(cpus[cpu], 0) != PSCI_0_2_AFFINITY_LEVEL_OFF) > > + count++; > > + } > > + if (count > 0) > > + continue; > > This should be > > if (count == 0) > break; > > otherwise we never leave the loop early. Duh, don't know what I was thinking. Thanks for noticing it. > > > + } > > + > > + /* Try to catch CPUs that return from CPU_OFF. */ > > + if (count == 0) > > + mdelay(100); > > + > > + for_each_present_cpu(cpu) { > > + if (cpu == 0) > > + continue; > > + if (cpu_idle(cpu)) { > > + report_info("CPU%d failed to be offlined", cpu); > > + if (psci_affinity_info(cpus[cpu], 0) == PSCI_0_2_AFFINITY_LEVEL_OFF) > > + report_info("AFFINITY_INFO incorrectly reports CPU%d as offline", cpu); > > + failed = true; > > + } > > + } > > + > > + return !failed; > > +} > > + > > +int main(int argc, char **argv) > > { > > int ver = psci_invoke(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); > > + bool cpu_on_success = true; > > > > report_prefix_push("psci"); > > > > @@ -188,10 +251,17 @@ int main(void) > > report(psci_affinity_info_on(), "affinity-info-on"); > > report(psci_affinity_info_off(), "affinity-info-off"); > > > > - if (ERRATA(6c7a5dce22b3)) > > - report(psci_cpu_on_test(), "cpu-on"); > > - else > > + if (ERRATA(6c7a5dce22b3)) { > > + cpu_on_success = psci_cpu_on_test(); > > + report(cpu_on_success, "cpu-on"); > > + } else { > > report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); > > + } > > + > > + if (!cpu_on_success) > > + report_skip("Skipping cpu-off test because the cpu-on test failed"); > > We should output "was skipped" when the cpu-on test was skipped, rather > than always reporting "failed". We need two booleans, try_cpu_on_test and > cpu_on_success. This is not about cpu-on being a precondition for cpu-off. cpu-off makes only one assumption, and that is that all secondaries can be onlined successfully. Even if cpu-on is never run, cpu-off calls on_cpu_async, which will online a secondary. This is safe even if the errata is not present, because the errata is about concurrent CPU_ON calls for the same VCPU, not for different VCPUs. The cpu-off test is skipped here because it can hang indefinitely if onlining CPU1 was not successful: [..] for_each_present_cpu(cpu) { if (cpu == 0) continue; on_cpu_async(cpu, cpu_off_secondary_entry, NULL); } cpumask_set_cpu(0, &cpu_off_done); cpu_off_start = 1; report_info("waiting for the CPUs to be offlined..."); while (!cpumask_full(&cpu_off_done)) // infinite loop if CPU1 cpu_relax(); // cannot be onlined. Does that make sense? Should I add a comment to make it clear why cpu-off is skipped when cpu-on fails? Thanks, Alex > > > + else > > + report(psci_cpu_off_test(), "cpu-off"); > > > > done: > > #if 0 > > -- > > 2.39.0 > > > > Thanks, > drew
On Tue, Jan 31, 2023 at 09:52:56AM +0000, Alexandru Elisei wrote: > Hi Drew, > > On Tue, Jan 31, 2023 at 07:56:23AM +0100, Andrew Jones wrote: > > On Fri, Jan 27, 2023 at 05:59:16PM +0000, Alexandru Elisei wrote: > > > From: Nikita Venkatesh <Nikita.Venkatesh@arm.com> > > > > > > The test uses the following method. > > > > > > The primary CPU brings up all the secondary CPUs, which are held in a wait > > > loop. Once the primary releases the CPUs, each of the secondary CPUs > > > proceed to issue CPU_OFF. > > > > > > The primary CPU then checks for the status of the individual CPU_OFF > > > request. There is a chance that some CPUs might return from the CPU_OFF > > > function call after the primary CPU has finished the scan. There is no > > > foolproof method to handle this, but the test tries its best to > > > eliminate these false positives by introducing an extra delay if all the > > > CPUs are reported offline after the initial scan. > > > > > > Signed-off-by: Nikita Venkatesh <Nikita.Venkatesh@arm.com> > > > [ Alex E: Skip CPU_OFF test if CPU_ON failed, drop cpu_off_success in > > > favour of checking AFFINITY_INFO, commit message tweaking ] > > > Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com> > > > --- > > > > > > Decided to drop Drew's Reviewed-by tag because the changes are not trivial > > > from the previous version. > > > > > > arm/psci.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++---- > > > 1 file changed, 75 insertions(+), 5 deletions(-) > > > > > > diff --git a/arm/psci.c b/arm/psci.c > > > index f7238f8e0bbd..7034d8ebe6e1 100644 > > > --- a/arm/psci.c > > > +++ b/arm/psci.c > > > @@ -72,8 +72,9 @@ static bool psci_affinity_info_off(void) > > > } > > > > > > static int cpu_on_ret[NR_CPUS]; > > > -static cpumask_t cpu_on_ready, cpu_on_done; > > > +static cpumask_t cpu_on_ready, cpu_on_done, cpu_off_done; > > > static volatile int cpu_on_start; > > > +static volatile int cpu_off_start; > > > > > > extern void secondary_entry(void); > > > static void cpu_on_do_wake_target(void) > > > @@ -171,9 +172,71 @@ static bool psci_cpu_on_test(void) > > > return !failed; > > > } > > > > > > -int main(void) > > > +static void cpu_off_secondary_entry(void *data) > > > +{ > > > + int cpu = smp_processor_id(); > > > + > > > + while (!cpu_off_start) > > > + cpu_relax(); > > > + cpumask_set_cpu(cpu, &cpu_off_done); > > > + cpu_psci_cpu_die(); > > > +} > > > + > > > +static bool psci_cpu_off_test(void) > > > +{ > > > + bool failed = false; > > > + int i, count, cpu; > > > + > > > + for_each_present_cpu(cpu) { > > > + if (cpu == 0) > > > + continue; > > > + on_cpu_async(cpu, cpu_off_secondary_entry, NULL); > > > + } > > > + > > > + cpumask_set_cpu(0, &cpu_off_done); > > > + > > > + cpu_off_start = 1; > > > + report_info("waiting for the CPUs to be offlined..."); > > > + while (!cpumask_full(&cpu_off_done)) > > > + cpu_relax(); > > > + > > > + /* Allow all the other CPUs to complete the operation */ > > > + for (i = 0; i < 100; i++) { > > > + mdelay(10); > > > + > > > + count = 0; > > > + for_each_present_cpu(cpu) { > > > + if (cpu == 0) > > > + continue; > > > + if (psci_affinity_info(cpus[cpu], 0) != PSCI_0_2_AFFINITY_LEVEL_OFF) > > > + count++; > > > + } > > > + if (count > 0) > > > + continue; > > > > This should be > > > > if (count == 0) > > break; > > > > otherwise we never leave the loop early. > > Duh, don't know what I was thinking. Thanks for noticing it. > > > > > > + } > > > + > > > + /* Try to catch CPUs that return from CPU_OFF. */ > > > + if (count == 0) > > > + mdelay(100); > > > + > > > + for_each_present_cpu(cpu) { > > > + if (cpu == 0) > > > + continue; > > > + if (cpu_idle(cpu)) { > > > + report_info("CPU%d failed to be offlined", cpu); > > > + if (psci_affinity_info(cpus[cpu], 0) == PSCI_0_2_AFFINITY_LEVEL_OFF) > > > + report_info("AFFINITY_INFO incorrectly reports CPU%d as offline", cpu); > > > + failed = true; > > > + } > > > + } > > > + > > > + return !failed; > > > +} > > > + > > > +int main(int argc, char **argv) I just noticed we're adding argc,argv in this patch, but not using them. > > > { > > > int ver = psci_invoke(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); > > > + bool cpu_on_success = true; > > > > > > report_prefix_push("psci"); > > > > > > @@ -188,10 +251,17 @@ int main(void) > > > report(psci_affinity_info_on(), "affinity-info-on"); > > > report(psci_affinity_info_off(), "affinity-info-off"); > > > > > > - if (ERRATA(6c7a5dce22b3)) > > > - report(psci_cpu_on_test(), "cpu-on"); > > > - else > > > + if (ERRATA(6c7a5dce22b3)) { > > > + cpu_on_success = psci_cpu_on_test(); > > > + report(cpu_on_success, "cpu-on"); > > > + } else { > > > report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); > > > + } > > > + > > > + if (!cpu_on_success) > > > + report_skip("Skipping cpu-off test because the cpu-on test failed"); > > > > We should output "was skipped" when the cpu-on test was skipped, rather > > than always reporting "failed". We need two booleans, try_cpu_on_test and > > cpu_on_success. > > This is not about cpu-on being a precondition for cpu-off. cpu-off makes > only one assumption, and that is that all secondaries can be onlined > successfully. Even if cpu-on is never run, cpu-off calls on_cpu_async, > which will online a secondary. This is safe even if the errata is not > present, because the errata is about concurrent CPU_ON calls for the same > VCPU, not for different VCPUs. > > The cpu-off test is skipped here because it can hang indefinitely if > onlining CPU1 was not successful: > > [..] > for_each_present_cpu(cpu) { > if (cpu == 0) > continue; > on_cpu_async(cpu, cpu_off_secondary_entry, NULL); > } > > cpumask_set_cpu(0, &cpu_off_done); > > cpu_off_start = 1; > report_info("waiting for the CPUs to be offlined..."); > while (!cpumask_full(&cpu_off_done)) // infinite loop if CPU1 > cpu_relax(); // cannot be onlined. > > Does that make sense? Should I add a comment to make it clear why cpu-off > is skipped when cpu-on fails? I missed that cpu_on_success was initialized to true. Seeing that now, I understand how the only time it's false is if the cpu-on test failed. When I thought it was initialized to false it had two ways to be false, failure or skip. I think it's a bit confusing to set a 'success' variable to true when the test is skipped. Also, we can relax the condition as to whether or not we try cpu-off by simply checking that all cpus, other than cpu0, are in idle. How about if (ERRATA(6c7a5dce22b3)) report(psci_cpu_on_test(), "cpu-on"); else report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); assert(!cpu_idle(0)); if (!ERRATA(6c7a5dce22b3) || cpumask_weight(&cpu_idle_mask) == nr_cpus - 1) report(psci_cpu_off_test(), "cpu-off"); else report_skip("Skipping cpu-off test because the cpu-on test failed"); Thanks, drew > > Thanks, > Alex > > > > > > + else > > > + report(psci_cpu_off_test(), "cpu-off"); > > > > > > done: > > > #if 0 > > > -- > > > 2.39.0 > > > > > > > Thanks, > > drew
Hi Drew, On Tue, Jan 31, 2023 at 11:46:10AM +0100, Andrew Jones wrote: > On Tue, Jan 31, 2023 at 09:52:56AM +0000, Alexandru Elisei wrote: > > Hi Drew, > > > > On Tue, Jan 31, 2023 at 07:56:23AM +0100, Andrew Jones wrote: > > > On Fri, Jan 27, 2023 at 05:59:16PM +0000, Alexandru Elisei wrote: > > > > From: Nikita Venkatesh <Nikita.Venkatesh@arm.com> > > > > > > > > The test uses the following method. > > > > > > > > The primary CPU brings up all the secondary CPUs, which are held in a wait > > > > loop. Once the primary releases the CPUs, each of the secondary CPUs > > > > proceed to issue CPU_OFF. > > > > > > > > The primary CPU then checks for the status of the individual CPU_OFF > > > > request. There is a chance that some CPUs might return from the CPU_OFF > > > > function call after the primary CPU has finished the scan. There is no > > > > foolproof method to handle this, but the test tries its best to > > > > eliminate these false positives by introducing an extra delay if all the > > > > CPUs are reported offline after the initial scan. > > > > > > > > Signed-off-by: Nikita Venkatesh <Nikita.Venkatesh@arm.com> > > > > [ Alex E: Skip CPU_OFF test if CPU_ON failed, drop cpu_off_success in > > > > favour of checking AFFINITY_INFO, commit message tweaking ] > > > > Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com> > > > > --- > > > > > > > > Decided to drop Drew's Reviewed-by tag because the changes are not trivial > > > > from the previous version. > > > > > > > > arm/psci.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++---- > > > > 1 file changed, 75 insertions(+), 5 deletions(-) > > > > > > > > diff --git a/arm/psci.c b/arm/psci.c > > > > index f7238f8e0bbd..7034d8ebe6e1 100644 > > > > --- a/arm/psci.c > > > > +++ b/arm/psci.c > > > > @@ -72,8 +72,9 @@ static bool psci_affinity_info_off(void) > > > > } > > > > > > > > static int cpu_on_ret[NR_CPUS]; > > > > -static cpumask_t cpu_on_ready, cpu_on_done; > > > > +static cpumask_t cpu_on_ready, cpu_on_done, cpu_off_done; > > > > static volatile int cpu_on_start; > > > > +static volatile int cpu_off_start; > > > > > > > > extern void secondary_entry(void); > > > > static void cpu_on_do_wake_target(void) > > > > @@ -171,9 +172,71 @@ static bool psci_cpu_on_test(void) > > > > return !failed; > > > > } > > > > > > > > -int main(void) > > > > +static void cpu_off_secondary_entry(void *data) > > > > +{ > > > > + int cpu = smp_processor_id(); > > > > + > > > > + while (!cpu_off_start) > > > > + cpu_relax(); > > > > + cpumask_set_cpu(cpu, &cpu_off_done); > > > > + cpu_psci_cpu_die(); > > > > +} > > > > + > > > > +static bool psci_cpu_off_test(void) > > > > +{ > > > > + bool failed = false; > > > > + int i, count, cpu; > > > > + > > > > + for_each_present_cpu(cpu) { > > > > + if (cpu == 0) > > > > + continue; > > > > + on_cpu_async(cpu, cpu_off_secondary_entry, NULL); > > > > + } > > > > + > > > > + cpumask_set_cpu(0, &cpu_off_done); > > > > + > > > > + cpu_off_start = 1; > > > > + report_info("waiting for the CPUs to be offlined..."); > > > > + while (!cpumask_full(&cpu_off_done)) > > > > + cpu_relax(); > > > > + > > > > + /* Allow all the other CPUs to complete the operation */ > > > > + for (i = 0; i < 100; i++) { > > > > + mdelay(10); > > > > + > > > > + count = 0; > > > > + for_each_present_cpu(cpu) { > > > > + if (cpu == 0) > > > > + continue; > > > > + if (psci_affinity_info(cpus[cpu], 0) != PSCI_0_2_AFFINITY_LEVEL_OFF) > > > > + count++; > > > > + } > > > > + if (count > 0) > > > > + continue; > > > > > > This should be > > > > > > if (count == 0) > > > break; > > > > > > otherwise we never leave the loop early. > > > > Duh, don't know what I was thinking. Thanks for noticing it. > > > > > > > > > + } > > > > + > > > > + /* Try to catch CPUs that return from CPU_OFF. */ > > > > + if (count == 0) > > > > + mdelay(100); > > > > + > > > > + for_each_present_cpu(cpu) { > > > > + if (cpu == 0) > > > > + continue; > > > > + if (cpu_idle(cpu)) { > > > > + report_info("CPU%d failed to be offlined", cpu); > > > > + if (psci_affinity_info(cpus[cpu], 0) == PSCI_0_2_AFFINITY_LEVEL_OFF) > > > > + report_info("AFFINITY_INFO incorrectly reports CPU%d as offline", cpu); > > > > + failed = true; > > > > + } > > > > + } > > > > + > > > > + return !failed; > > > > +} > > > > + > > > > +int main(int argc, char **argv) > > I just noticed we're adding argc,argv in this patch, but not using them. Will remove that. It's a leftoever from a previous version where the cpu-off test was a separate test selected via the command line. > > > > > { > > > > int ver = psci_invoke(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); > > > > + bool cpu_on_success = true; > > > > > > > > report_prefix_push("psci"); > > > > > > > > @@ -188,10 +251,17 @@ int main(void) > > > > report(psci_affinity_info_on(), "affinity-info-on"); > > > > report(psci_affinity_info_off(), "affinity-info-off"); > > > > > > > > - if (ERRATA(6c7a5dce22b3)) > > > > - report(psci_cpu_on_test(), "cpu-on"); > > > > - else > > > > + if (ERRATA(6c7a5dce22b3)) { > > > > + cpu_on_success = psci_cpu_on_test(); > > > > + report(cpu_on_success, "cpu-on"); > > > > + } else { > > > > report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); > > > > + } > > > > + > > > > + if (!cpu_on_success) > > > > + report_skip("Skipping cpu-off test because the cpu-on test failed"); > > > > > > We should output "was skipped" when the cpu-on test was skipped, rather > > > than always reporting "failed". We need two booleans, try_cpu_on_test and > > > cpu_on_success. > > > > This is not about cpu-on being a precondition for cpu-off. cpu-off makes > > only one assumption, and that is that all secondaries can be onlined > > successfully. Even if cpu-on is never run, cpu-off calls on_cpu_async, > > which will online a secondary. This is safe even if the errata is not > > present, because the errata is about concurrent CPU_ON calls for the same > > VCPU, not for different VCPUs. > > > > The cpu-off test is skipped here because it can hang indefinitely if > > onlining CPU1 was not successful: > > > > [..] > > for_each_present_cpu(cpu) { > > if (cpu == 0) > > continue; > > on_cpu_async(cpu, cpu_off_secondary_entry, NULL); > > } > > > > cpumask_set_cpu(0, &cpu_off_done); > > > > cpu_off_start = 1; > > report_info("waiting for the CPUs to be offlined..."); > > while (!cpumask_full(&cpu_off_done)) // infinite loop if CPU1 > > cpu_relax(); // cannot be onlined. > > > > Does that make sense? Should I add a comment to make it clear why cpu-off > > is skipped when cpu-on fails? > > I missed that cpu_on_success was initialized to true. Seeing that now, I > understand how the only time it's false is if the cpu-on test failed. When > I thought it was initialized to false it had two ways to be false, failure > or skip. I think it's a bit confusing to set a 'success' variable to true > when the test is skipped. Also, we can relax the condition as to whether > or not we try cpu-off by simply checking that all cpus, other than cpu0, > are in idle. How about > > if (ERRATA(6c7a5dce22b3)) > report(psci_cpu_on_test(), "cpu-on"); > else > report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); > > assert(!cpu_idle(0)); cpu0 is the boot CPU, I don't see how cpu0 can execute this line of code and be in idle at the same time. Unless this is done for documenting purposes, to explain why we compare the number of cpus in idle to nr_cpus -1 below. But I still find it confusing, especially considering (almost) the same assert is in smp.c: void on_cpu_async(int cpu, void (*func)(void *data), void *data) { [..] assert_msg(cpu != 0 || cpu0_calls_idle, "Waiting on CPU0, which is unlikely to idle. " "If this is intended set cpu0_calls_idle=1"); I know, it's a different scenario, but the point I'm trying to make is that kvm-unit-tests really doesn't expect cpu0 to be in idle. I would prefer not to have the assert here. > > if (!ERRATA(6c7a5dce22b3) || cpumask_weight(&cpu_idle_mask) == nr_cpus - 1) > report(psci_cpu_off_test(), "cpu-off"); > else > report_skip("Skipping cpu-off test because the cpu-on test failed"); Looks good, will do it this way. Thanks, Alex > > > Thanks, > drew > > > > > > Thanks, > > Alex > > > > > > > > > + else > > > > + report(psci_cpu_off_test(), "cpu-off"); > > > > > > > > done: > > > > #if 0 > > > > -- > > > > 2.39.0 > > > > > > > > > > Thanks, > > > drew
On Tue, Jan 31, 2023 at 11:16:22AM +0000, Alexandru Elisei wrote: ... > > > Does that make sense? Should I add a comment to make it clear why cpu-off > > > is skipped when cpu-on fails? > > > > I missed that cpu_on_success was initialized to true. Seeing that now, I > > understand how the only time it's false is if the cpu-on test failed. When > > I thought it was initialized to false it had two ways to be false, failure > > or skip. I think it's a bit confusing to set a 'success' variable to true > > when the test is skipped. Also, we can relax the condition as to whether > > or not we try cpu-off by simply checking that all cpus, other than cpu0, > > are in idle. How about > > > > if (ERRATA(6c7a5dce22b3)) > > report(psci_cpu_on_test(), "cpu-on"); > > else > > report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); > > > > assert(!cpu_idle(0)); > > cpu0 is the boot CPU, I don't see how cpu0 can execute this line of code > and be in idle at the same time. That's why it's an assert and not an if, i.e. it should never happen. It could happen if things are messed up in the lib code, a previous test mucked with cpu_idle_mask, or a previous test idled cpu0 and manipulated another cpu into executing this line. > Unless this is done for documenting > purposes, to explain why we compare the number of cpus in idle to nr_cpus > -1 below. Exactly, and furthermore that we expect the missing cpu to be cpu0. > But I still find it confusing, especially considering (almost) > the same assert is in smp.c: > > void on_cpu_async(int cpu, void (*func)(void *data), void *data) > { > [..] > assert_msg(cpu != 0 || cpu0_calls_idle, "Waiting on CPU0, which is unlikely to idle. " > "If this is intended set cpu0_calls_idle=1"); > > I know, it's a different scenario, but the point I'm trying to make is that > kvm-unit-tests really doesn't expect cpu0 to be in idle. I would prefer not > to have the assert here. asserts are for things we assume, but also want to ensure, as other code depends on the assumptions. Please keep the assert. It doesn't hurt :-) Thanks, drew
Hi Drew, On Tue, Jan 31, 2023 at 12:45:49PM +0100, Andrew Jones wrote: > On Tue, Jan 31, 2023 at 11:16:22AM +0000, Alexandru Elisei wrote: > ... > > > > Does that make sense? Should I add a comment to make it clear why cpu-off > > > > is skipped when cpu-on fails? > > > > > > I missed that cpu_on_success was initialized to true. Seeing that now, I > > > understand how the only time it's false is if the cpu-on test failed. When > > > I thought it was initialized to false it had two ways to be false, failure > > > or skip. I think it's a bit confusing to set a 'success' variable to true > > > when the test is skipped. Also, we can relax the condition as to whether > > > or not we try cpu-off by simply checking that all cpus, other than cpu0, > > > are in idle. How about > > > > > > if (ERRATA(6c7a5dce22b3)) > > > report(psci_cpu_on_test(), "cpu-on"); > > > else > > > report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); > > > > > > assert(!cpu_idle(0)); > > > > cpu0 is the boot CPU, I don't see how cpu0 can execute this line of code > > and be in idle at the same time. > > That's why it's an assert and not an if, i.e. it should never happen. It > could happen if things are messed up in the lib code, a previous test > mucked with cpu_idle_mask, or a previous test idled cpu0 and manipulated > another cpu into executing this line. > > > Unless this is done for documenting > > purposes, to explain why we compare the number of cpus in idle to nr_cpus > > -1 below. > > Exactly, and furthermore that we expect the missing cpu to be cpu0. > > > But I still find it confusing, especially considering (almost) > > the same assert is in smp.c: > > > > void on_cpu_async(int cpu, void (*func)(void *data), void *data) > > { > > [..] > > assert_msg(cpu != 0 || cpu0_calls_idle, "Waiting on CPU0, which is unlikely to idle. " > > "If this is intended set cpu0_calls_idle=1"); > > > > I know, it's a different scenario, but the point I'm trying to make is that > > kvm-unit-tests really doesn't expect cpu0 to be in idle. I would prefer not > > to have the assert here. > > asserts are for things we assume, but also want to ensure, as other code > depends on the assumptions. Please keep the assert. It doesn't hurt :-) I'm keeping the assert then :) Thanks, Alex > > Thanks, > drew
diff --git a/arm/psci.c b/arm/psci.c index f7238f8e0bbd..7034d8ebe6e1 100644 --- a/arm/psci.c +++ b/arm/psci.c @@ -72,8 +72,9 @@ static bool psci_affinity_info_off(void) } static int cpu_on_ret[NR_CPUS]; -static cpumask_t cpu_on_ready, cpu_on_done; +static cpumask_t cpu_on_ready, cpu_on_done, cpu_off_done; static volatile int cpu_on_start; +static volatile int cpu_off_start; extern void secondary_entry(void); static void cpu_on_do_wake_target(void) @@ -171,9 +172,71 @@ static bool psci_cpu_on_test(void) return !failed; } -int main(void) +static void cpu_off_secondary_entry(void *data) +{ + int cpu = smp_processor_id(); + + while (!cpu_off_start) + cpu_relax(); + cpumask_set_cpu(cpu, &cpu_off_done); + cpu_psci_cpu_die(); +} + +static bool psci_cpu_off_test(void) +{ + bool failed = false; + int i, count, cpu; + + for_each_present_cpu(cpu) { + if (cpu == 0) + continue; + on_cpu_async(cpu, cpu_off_secondary_entry, NULL); + } + + cpumask_set_cpu(0, &cpu_off_done); + + cpu_off_start = 1; + report_info("waiting for the CPUs to be offlined..."); + while (!cpumask_full(&cpu_off_done)) + cpu_relax(); + + /* Allow all the other CPUs to complete the operation */ + for (i = 0; i < 100; i++) { + mdelay(10); + + count = 0; + for_each_present_cpu(cpu) { + if (cpu == 0) + continue; + if (psci_affinity_info(cpus[cpu], 0) != PSCI_0_2_AFFINITY_LEVEL_OFF) + count++; + } + if (count > 0) + continue; + } + + /* Try to catch CPUs that return from CPU_OFF. */ + if (count == 0) + mdelay(100); + + for_each_present_cpu(cpu) { + if (cpu == 0) + continue; + if (cpu_idle(cpu)) { + report_info("CPU%d failed to be offlined", cpu); + if (psci_affinity_info(cpus[cpu], 0) == PSCI_0_2_AFFINITY_LEVEL_OFF) + report_info("AFFINITY_INFO incorrectly reports CPU%d as offline", cpu); + failed = true; + } + } + + return !failed; +} + +int main(int argc, char **argv) { int ver = psci_invoke(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); + bool cpu_on_success = true; report_prefix_push("psci"); @@ -188,10 +251,17 @@ int main(void) report(psci_affinity_info_on(), "affinity-info-on"); report(psci_affinity_info_off(), "affinity-info-off"); - if (ERRATA(6c7a5dce22b3)) - report(psci_cpu_on_test(), "cpu-on"); - else + if (ERRATA(6c7a5dce22b3)) { + cpu_on_success = psci_cpu_on_test(); + report(cpu_on_success, "cpu-on"); + } else { report_skip("Skipping unsafe cpu-on test. Set ERRATA_6c7a5dce22b3=y to enable."); + } + + if (!cpu_on_success) + report_skip("Skipping cpu-off test because the cpu-on test failed"); + else + report(psci_cpu_off_test(), "cpu-off"); done: #if 0