diff mbox series

[kvm-unit-tests,GIT,PULL,4/4] s390x: add CMM test during migration

Message ID 20230105121538.52008-5-imbrenda@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series s390x: storage key and CMM concurrent tests | expand

Commit Message

Claudio Imbrenda Jan. 5, 2023, 12:15 p.m. UTC
From: Nico Boehr <nrb@linux.ibm.com>

Add a test which modifies CMM page states while migration is in
progress.

This is added to the existing migration-cmm test, which gets a new
command line argument for the sequential and parallel variants.

Signed-off-by: Nico Boehr <nrb@linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Link: https://lore.kernel.org/r/20221221090953.341247-2-nrb@linux.ibm.com
Message-Id: <20221221090953.341247-2-nrb@linux.ibm.com>
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
 s390x/migration-cmm.c | 258 +++++++++++++++++++++++++++++++++++++-----
 s390x/unittests.cfg   |  15 ++-
 2 files changed, 240 insertions(+), 33 deletions(-)

Comments

Thomas Huth Jan. 5, 2023, 12:24 p.m. UTC | #1
On 05/01/2023 13.15, Claudio Imbrenda wrote:
> From: Nico Boehr <nrb@linux.ibm.com>
> 
> Add a test which modifies CMM page states while migration is in
> progress.
> 
> This is added to the existing migration-cmm test, which gets a new
> command line argument for the sequential and parallel variants.
> 
> Signed-off-by: Nico Boehr <nrb@linux.ibm.com>
> Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Link: https://lore.kernel.org/r/20221221090953.341247-2-nrb@linux.ibm.com
> Message-Id: <20221221090953.341247-2-nrb@linux.ibm.com>
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> ---
>   s390x/migration-cmm.c | 258 +++++++++++++++++++++++++++++++++++++-----
>   s390x/unittests.cfg   |  15 ++-
>   2 files changed, 240 insertions(+), 33 deletions(-)

  Hi!

While this works fine on my z15 LPAR, I'm getting a failure when running 
this test on my z13 LPAR:

$ cat logs/migration-cmm-parallel.log
run_migration timeout -k 1s --foreground 90s /usr/local/bin/qemu-kvm 
-nodefaults -nographic -machine s390-ccw-virtio,accel=kvm -chardev 
stdio,id=con0 -device sclpconsole,chardev=con0 -kernel 
s390x/migration-cmm.elf -smp 2 -append --parallel # -initrd /tmp/tmp.YKFTGTHnwt
SMP: Initializing, found 2 cpus
Now migrate the VM, then press a key to continue...
INFO: migration-cmm: parallel: Migration complete
INFO: migration-cmm: parallel: thread completed 65308 iterations
FAIL: migration-cmm: parallel: during migration: page state mismatch: first 
page idx = 0, addr = 28000, expected_mask = 0x1, actual_mask = 0x2
FAIL: migration-cmm: parallel: after migration: page state mismatch: first 
page idx = 0, addr = 28000, expected_mask = 0x1, actual_mask = 0x2
SUMMARY: 2 tests, 2 unexpected failures

EXIT: STATUS=3

Could you please fix that first?

  Thanks,
   Thomas
Nico Boehr Jan. 5, 2023, 4:39 p.m. UTC | #2
Quoting Thomas Huth (2023-01-05 13:24:03)
> On 05/01/2023 13.15, Claudio Imbrenda wrote:
> > From: Nico Boehr <nrb@linux.ibm.com>
> > 
> > Add a test which modifies CMM page states while migration is in
> > progress.
> > 
> > This is added to the existing migration-cmm test, which gets a new
> > command line argument for the sequential and parallel variants.
> > 
> > Signed-off-by: Nico Boehr <nrb@linux.ibm.com>
> > Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> > Link: https://lore.kernel.org/r/20221221090953.341247-2-nrb@linux.ibm.com
> > Message-Id: <20221221090953.341247-2-nrb@linux.ibm.com>
> > Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> > ---
> >   s390x/migration-cmm.c | 258 +++++++++++++++++++++++++++++++++++++-----
> >   s390x/unittests.cfg   |  15 ++-
> >   2 files changed, 240 insertions(+), 33 deletions(-)
> 
>   Hi!
> 
> While this works fine on my z15 LPAR, I'm getting a failure when running 
> this test on my z13 LPAR:

I can _sometimes_ reproduce this on z16, z13 and misc older machines. The older the machine, the more often it seems to happen.

I think we may have a bug somewhere. While I investigate, feel free to leave out this patch if you prefer.
diff mbox series

Patch

diff --git a/s390x/migration-cmm.c b/s390x/migration-cmm.c
index 43673f18..2d46c6be 100644
--- a/s390x/migration-cmm.c
+++ b/s390x/migration-cmm.c
@@ -2,6 +2,12 @@ 
 /*
  * CMM migration tests (ESSA)
  *
+ * There are two variants of this test:
+ * - sequential: sets CMM page states, then migrates the VM and - after
+ *   migration has finished - verifies that page states have been preserved.
+ * - parallel: migrates the VM and - while migration is in progress - changes
+ *   page states and verifies that they are preserved.
+ *
  * Copyright IBM Corp. 2022
  *
  * Authors:
@@ -13,55 +19,249 @@ 
 #include <asm/interrupt.h>
 #include <asm/page.h>
 #include <asm/cmm.h>
+#include <asm/barrier.h>
 #include <bitops.h>
+#include <smp.h>
+
+struct verify_result {
+	bool verify_failed;
+	char expected_mask;
+	char actual_mask;
+	unsigned long page_mismatch_idx;
+	unsigned long page_mismatch_addr;
+};
+
+static enum {
+	TEST_INVLALID,
+	TEST_SEQUENTIAL,
+	TEST_PARALLEL
+} arg_test_to_run;
 
 #define NUM_PAGES 128
-static uint8_t pagebuf[NUM_PAGES][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
 
-static void test_migration(void)
+/*
+ * Allocate 3 pages more than we need so we can start at different offsets.
+ * For the parallel test, this ensures page states change on every loop iteration.
+ */
+static uint8_t pagebuf[(NUM_PAGES + 3) * PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+
+static struct verify_result result;
+
+static unsigned int thread_iters;
+static int thread_should_exit;
+static int thread_exited;
+
+/*
+ * Maps ESSA actions to states the page is allowed to be in after the
+ * respective action was executed.
+ */
+static const unsigned long allowed_essa_state_masks[4] = {
+	BIT(ESSA_USAGE_STABLE),					/* ESSA_SET_STABLE */
+	BIT(ESSA_USAGE_UNUSED),					/* ESSA_SET_UNUSED */
+	BIT(ESSA_USAGE_VOLATILE),				/* ESSA_SET_VOLATILE */
+	BIT(ESSA_USAGE_VOLATILE) | BIT(ESSA_USAGE_POT_VOLATILE) /* ESSA_SET_POT_VOLATILE */
+};
+
+/*
+ * Set CMM page state test pattern on pagebuf.
+ * pagebuf must point to page_count consecutive pages.
+ * page_count must be a multiple of 4.
+ */
+static void set_test_pattern(uint8_t *pagebuf, size_t page_count)
+{
+	unsigned long addr = (unsigned long)pagebuf;
+	size_t i;
+
+	assert(page_count % 4 == 0);
+	for (i = 0; i < page_count; i += 4) {
+		essa(ESSA_SET_STABLE, addr + i * PAGE_SIZE);
+		essa(ESSA_SET_UNUSED, addr + (i + 1) * PAGE_SIZE);
+		essa(ESSA_SET_VOLATILE, addr + (i + 2) * PAGE_SIZE);
+		essa(ESSA_SET_POT_VOLATILE, addr + (i + 3) * PAGE_SIZE);
+	}
+}
+
+/*
+ * Verify CMM page states on pagebuf.
+ * Page states must have been set by set_test_pattern on pagebuf before.
+ * page_count must be a multiple of 4.
+ *
+ * If page states match the expected result, will return a verify_result
+ * with verify_failed false. All other fields are then invalid.
+ * If there is a mismatch, the returned struct will have verify_failed true
+ * and will be filled with details on the first mismatch encountered.
+ */
+static struct verify_result verify_page_states(uint8_t *pagebuf, size_t page_count)
+{
+	unsigned long expected_mask, actual_mask;
+	struct verify_result result = {
+		.verify_failed = true
+	};
+	unsigned long addr;
+	size_t i;
+
+	assert(page_count % 4 == 0);
+	for (i = 0; i < page_count; i++) {
+		addr = (unsigned long)(pagebuf + i * PAGE_SIZE);
+		actual_mask = essa(ESSA_GET_STATE, addr);
+		/* usage state in bits 60 and 61 */
+		actual_mask = BIT((actual_mask >> 2) & 0x3);
+		expected_mask = allowed_essa_state_masks[i % ARRAY_SIZE(allowed_essa_state_masks)];
+		if (!(actual_mask & expected_mask)) {
+			result.page_mismatch_idx = i;
+			result.page_mismatch_addr = addr;
+			result.expected_mask = expected_mask;
+			result.actual_mask = actual_mask;
+			return result;
+		}
+	}
+
+	result.verify_failed = false;
+	return result;
+}
+
+static void report_verify_result(const struct verify_result *result)
+{
+	if (result->verify_failed)
+		report_fail("page state mismatch: first page idx = %lu, addr = %lx, "
+			    "expected_mask = 0x%x, actual_mask = 0x%x",
+			    result->page_mismatch_idx, result->page_mismatch_addr,
+			    result->expected_mask, result->actual_mask);
+	else
+		report_pass("page states match");
+}
+
+static void test_cmm_migration_sequential(void)
+{
+	report_prefix_push("sequential");
+
+	set_test_pattern(pagebuf, NUM_PAGES);
+
+	migrate_once();
+
+	result = verify_page_states(pagebuf, NUM_PAGES);
+	report_verify_result(&result);
+
+	report_prefix_pop();
+}
+
+static void set_cmm_thread(void)
 {
-	int i, state_mask, actual_state;
+	uint8_t *pagebuf_start;
 	/*
-	 * Maps ESSA actions to states the page is allowed to be in after the
-	 * respective action was executed.
+	 * The second CPU must not print to the console, otherwise it will race with
+	 * the primary CPU on the SCLP buffer.
 	 */
-	int allowed_essa_state_masks[4] = {
-		BIT(ESSA_USAGE_STABLE),					/* ESSA_SET_STABLE */
-		BIT(ESSA_USAGE_UNUSED),					/* ESSA_SET_UNUSED */
-		BIT(ESSA_USAGE_VOLATILE),				/* ESSA_SET_VOLATILE */
-		BIT(ESSA_USAGE_VOLATILE) | BIT(ESSA_USAGE_POT_VOLATILE) /* ESSA_SET_POT_VOLATILE */
-	};
+	while (!READ_ONCE(thread_should_exit)) {
+		/*
+		 * Start at an offset different from the last iteration so page states change with
+		 * every iteration. This is why pagebuf has 3 extra pages.
+		 */
+		pagebuf_start = pagebuf + (thread_iters % 4) * PAGE_SIZE;
+		set_test_pattern(pagebuf_start, NUM_PAGES);
+
+		/*
+		 * Always increment even if the verify fails. This ensures the primary CPU knows where
+		 * we left off and can do an additional verify round after migration has finished.
+		 */
+		thread_iters++;
 
-	assert(NUM_PAGES % 4 == 0);
-	for (i = 0; i < NUM_PAGES; i += 4) {
-		essa(ESSA_SET_STABLE, (unsigned long)pagebuf[i]);
-		essa(ESSA_SET_UNUSED, (unsigned long)pagebuf[i + 1]);
-		essa(ESSA_SET_VOLATILE, (unsigned long)pagebuf[i + 2]);
-		essa(ESSA_SET_POT_VOLATILE, (unsigned long)pagebuf[i + 3]);
+		result = verify_page_states(pagebuf_start, NUM_PAGES);
+		if (result.verify_failed)
+			break;
 	}
 
+	WRITE_ONCE(thread_exited, 1);
+}
+
+static void test_cmm_migration_parallel(void)
+{
+	report_prefix_push("parallel");
+
+	if (smp_query_num_cpus() == 1) {
+		report_skip("need at least 2 cpus for this test");
+		goto error;
+	}
+
+	smp_cpu_setup(1, PSW_WITH_CUR_MASK(set_cmm_thread));
+
 	migrate_once();
 
-	for (i = 0; i < NUM_PAGES; i++) {
-		actual_state = essa(ESSA_GET_STATE, (unsigned long)pagebuf[i]);
-		/* extract the usage state in bits 60 and 61 */
-		actual_state = (actual_state >> 2) & 0x3;
-		state_mask = allowed_essa_state_masks[i % ARRAY_SIZE(allowed_essa_state_masks)];
-		report(BIT(actual_state) & state_mask, "page %d state: expected_mask=0x%x actual_mask=0x%lx", i, state_mask, BIT(actual_state));
+	WRITE_ONCE(thread_should_exit, 1);
+
+	while (!READ_ONCE(thread_exited))
+		;
+
+	/* Ensure thread_iters and result below are read from memory after the thread has completed */
+	mb();
+
+	report_info("thread completed %u iterations", thread_iters);
+
+	report_prefix_push("during migration");
+	report_verify_result(&result);
+	report_prefix_pop();
+
+	/*
+	 * Verification of page states occurs on the thread. We don't know if we
+	 * were still migrating during the verification.
+	 * To be sure, make another verification round after the migration
+	 * finished to catch page states which might not have been migrated
+	 * correctly.
+	 */
+	report_prefix_push("after migration");
+	assert(thread_iters > 0);
+	result = verify_page_states(pagebuf + ((thread_iters - 1) % 4) * PAGE_SIZE, NUM_PAGES);
+	report_verify_result(&result);
+	report_prefix_pop();
+
+error:
+	report_prefix_pop();
+}
+
+static void print_usage(void)
+{
+	report_info("Usage: migration-cmm [--parallel|--sequential]");
+}
+
+static void parse_args(int argc, char **argv)
+{
+	if (argc < 2) {
+		/* default to sequential since it only needs one CPU */
+		arg_test_to_run = TEST_SEQUENTIAL;
+		return;
 	}
+
+	if (!strcmp("--parallel", argv[1]))
+		arg_test_to_run = TEST_PARALLEL;
+	else if (!strcmp("--sequential", argv[1]))
+		arg_test_to_run = TEST_SEQUENTIAL;
+	else
+		arg_test_to_run = TEST_INVLALID;
 }
 
-int main(void)
+int main(int argc, char **argv)
 {
 	report_prefix_push("migration-cmm");
-
-	if (!check_essa_available())
+	if (!check_essa_available()) {
 		report_skip("ESSA is not available");
-	else
-		test_migration();
+		goto error;
+	}
 
-	migrate_once();
+	parse_args(argc, argv);
+
+	switch (arg_test_to_run) {
+	case TEST_SEQUENTIAL:
+		test_cmm_migration_sequential();
+		break;
+	case TEST_PARALLEL:
+		test_cmm_migration_parallel();
+		break;
+	default:
+		print_usage();
+	}
 
+error:
+	migrate_once();
 	report_prefix_pop();
 	return report_summary();
 }
diff --git a/s390x/unittests.cfg b/s390x/unittests.cfg
index d97eb5e9..96977f2b 100644
--- a/s390x/unittests.cfg
+++ b/s390x/unittests.cfg
@@ -181,10 +181,6 @@  file = migration.elf
 groups = migration
 smp = 2
 
-[migration-cmm]
-file = migration-cmm.elf
-groups = migration
-
 [panic-loop-extint]
 file = panic-loop-extint.elf
 groups = panic
@@ -215,3 +211,14 @@  file = migration-skey.elf
 smp = 2
 groups = migration
 extra_params = -append '--parallel'
+
+[migration-cmm-sequential]
+file = migration-cmm.elf
+groups = migration
+extra_params = -append '--sequential'
+
+[migration-cmm-parallel]
+file = migration-cmm.elf
+groups = migration
+smp = 2
+extra_params = -append '--parallel'