diff mbox series

[POC,2/2] add test exposing AHCI reset issue

Message ID 20230824133831.617833-2-f.ebner@proxmox.com (mailing list archive)
State New, archived
Headers show
Series [1/2] hw/ide: reset: cancel async DMA operation before reseting state | expand

Commit Message

Fiona Ebner Aug. 24, 2023, 1:38 p.m. UTC
Fails without the previous commit "hw/ide: reset: cancel async DMA
operation before reseting state".

I haven't ever written such a test before, but I wanted something to
expose the problem more easily. It hardcodes the behavior that the
pending write actually is done during reset, which might not be ideal.
It could just check that the first sector is still intact instead.

If I should make this a proper test, I'd be happy about some guidance,
but not sure if required for such a specific one-off issue. After all,
a different variation of the bug might have written to some other
sector not covered by this test.

Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
 tests/qtest/ahci-test.c | 81 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

Comments

Philippe Mathieu-Daudé Aug. 24, 2023, 3:09 p.m. UTC | #1
On 24/8/23 15:38, Fiona Ebner wrote:
> Fails without the previous commit "hw/ide: reset: cancel async DMA
> operation before reseting state".
> 
> I haven't ever written such a test before, but I wanted something to
> expose the problem more easily. It hardcodes the behavior that the
> pending write actually is done during reset, which might not be ideal.
> It could just check that the first sector is still intact instead.
> 
> If I should make this a proper test, I'd be happy about some guidance,
> but not sure if required for such a specific one-off issue. After all,
> a different variation of the bug might have written to some other
> sector not covered by this test.
> 
> Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
> ---
>   tests/qtest/ahci-test.c | 81 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 81 insertions(+)

Per our style, variables are allocated on function prologue,
otherwise the test LGTM:

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Thomas Huth Aug. 24, 2023, 3:52 p.m. UTC | #2
On 24/08/2023 15.38, Fiona Ebner wrote:
> Fails without the previous commit "hw/ide: reset: cancel async DMA
> operation before reseting state".
> 
> I haven't ever written such a test before, but I wanted something to
> expose the problem more easily. It hardcodes the behavior that the
> pending write actually is done during reset, which might not be ideal.
> It could just check that the first sector is still intact instead.
> 
> If I should make this a proper test, I'd be happy about some guidance,
> but not sure if required for such a specific one-off issue. After all,
> a different variation of the bug might have written to some other
> sector not covered by this test.
> 
> Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
> ---
>   tests/qtest/ahci-test.c | 81 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 81 insertions(+)
> 
> diff --git a/tests/qtest/ahci-test.c b/tests/qtest/ahci-test.c
> index abab761c26..3ebeb4e255 100644
> --- a/tests/qtest/ahci-test.c
> +++ b/tests/qtest/ahci-test.c
> @@ -1401,6 +1401,84 @@ static void test_max(void)
>       ahci_shutdown(ahci);
>   }
>   
> +static void test_reset_with_pending_callback(void)
> +{
> +    AHCIQState *ahci;
> +
> +    ahci = ahci_boot(NULL);
> +    ahci_test_pci_spec(ahci);
> +    ahci_pci_enable(ahci);
> +
> +    int bufsize = 512 * 1024;
> +    int offset1 = 0;
> +    int offset2 = bufsize / AHCI_SECTOR_SIZE;
> +
> +    ahci_test_hba_spec(ahci);
> +    ahci_hba_enable(ahci);
> +    ahci_test_identify(ahci);
> +
> +    uint8_t port = ahci_port_select(ahci);
> +    ahci_port_clear(ahci, port);
> +
> +    unsigned char *tx1 = g_malloc(bufsize);
> +    unsigned char *tx2 = g_malloc(bufsize);
> +    unsigned char *rx1 = g_malloc0(bufsize);
> +    unsigned char *rx2 = g_malloc0(bufsize);
> +    uint64_t ptr1 = ahci_alloc(ahci, bufsize);
> +    uint64_t ptr2 = ahci_alloc(ahci, bufsize);

As Philippe already mentioned, please declare variables at the beginning of 
the functions to match our coding style.

I'd also oike to suggest to use g_autofree for the buffers that you 
malloced, so you can drop the g_frees at the end of the function.

> +    g_assert(ptr1 && ptr2);
> +
> +    /* Need two different patterns. */
> +    do {
> +        generate_pattern(tx1, bufsize, AHCI_SECTOR_SIZE);
> +        generate_pattern(tx2, bufsize, AHCI_SECTOR_SIZE);
> +    } while (memcmp(tx1, tx2, bufsize) == 0);
> +
> +    qtest_bufwrite(ahci->parent->qts, ptr1, tx1, bufsize);
> +    qtest_bufwrite(ahci->parent->qts, ptr2, tx2, bufsize);
> +
> +    /* Write to beginning of disk to check it wasn't overwritten later. */
> +    ahci_guest_io(ahci, port, CMD_WRITE_DMA_EXT, ptr1, bufsize, offset1);
> +
> +    /* Issue asynchronously to get a pending callback during reset. */
> +    AHCICommand *cmd = ahci_command_create(CMD_WRITE_DMA_EXT);
> +    ahci_command_adjust(cmd, offset2, ptr2, bufsize, 0);
> +    ahci_command_commit(ahci, cmd, port);
> +    ahci_command_issue_async(ahci, cmd);
> +
> +    ahci_set(ahci, AHCI_GHC, AHCI_GHC_HR);
> +
> +    ahci_command_free(cmd);
> +
> +    /* Start again. */
> +    ahci_clean_mem(ahci);
> +    ahci_pci_enable(ahci);
> +    ahci_hba_enable(ahci);
> +    port = ahci_port_select(ahci);
> +    ahci_port_clear(ahci, port);
> +
> +    /* Read and verify. */
> +    ahci_guest_io(ahci, port, CMD_READ_DMA_EXT, ptr1, bufsize, offset1);
> +    qtest_bufread(ahci->parent->qts, ptr1, rx1, bufsize);
> +    g_assert_cmphex(memcmp(tx1, rx1, bufsize), ==, 0);
> +
> +    ahci_guest_io(ahci, port, CMD_READ_DMA_EXT, ptr2, bufsize, offset2);
> +    qtest_bufread(ahci->parent->qts, ptr2, rx2, bufsize);
> +    g_assert_cmphex(memcmp(tx2, rx2, bufsize), ==, 0);
> +
> +    ahci_free(ahci, ptr1);
> +    ahci_free(ahci, ptr2);
> +    g_free(tx1);
> +    g_free(tx2);
> +    g_free(rx1);
> +    g_free(rx2);
> +
> +    ahci_clean_mem(ahci);
> +
> +    ahci_shutdown(ahci);
> +}
> +
>   static void test_reset(void)
>   {
>       AHCIQState *ahci;
> @@ -1915,6 +1993,9 @@ int main(int argc, char **argv)
>       g_assert(fd >= 0);
>       close(fd);
>   
> +    qtest_add_func("/ahci/reset_with_pending_callback",
> +                   test_reset_with_pending_callback);

It would make more sense to put this below the "Run the tests" comment below.

>       /* Run the tests */
>       qtest_add_func("/ahci/sanity",     test_sanity);
>       qtest_add_func("/ahci/pci_spec",   test_pci_spec);

  Thomas
Fiona Ebner Aug. 25, 2023, 10:17 a.m. UTC | #3
Am 24.08.23 um 15:38 schrieb Fiona Ebner:
> Fails without the previous commit "hw/ide: reset: cancel async DMA
> operation before reseting state".
> 
> I haven't ever written such a test before, but I wanted something to
> expose the problem more easily. It hardcodes the behavior that the
> pending write actually is done during reset, which might not be ideal.
> It could just check that the first sector is still intact instead.
> 
> If I should make this a proper test, I'd be happy about some guidance,
> but not sure if required for such a specific one-off issue. After all,
> a different variation of the bug might have written to some other
> sector not covered by this test.
> 

While trying to turn it into a proper test with Philippe's and Thomas's
suggestions, I wanted to add a comment about the buffer size. So I tried
figuring out what the "magic" value is. At the very beginning, I had
tried 4 KiB, but then the callback wouldn't be pending, so I just picked
512 KiB for my proof-of-concept. It turns out to be racy though, and
with a buffer size of 64 KiB, it is flaky whether or not the callback is
still pending on my system. Should I just pick a large enough buffer
size (maybe 4 MiB) and hope for the best?

Best Regards,
Fiona
Kevin Wolf Sept. 4, 2023, 9:26 a.m. UTC | #4
Am 25.08.2023 um 12:17 hat Fiona Ebner geschrieben:
> Am 24.08.23 um 15:38 schrieb Fiona Ebner:
> > Fails without the previous commit "hw/ide: reset: cancel async DMA
> > operation before reseting state".
> > 
> > I haven't ever written such a test before, but I wanted something to
> > expose the problem more easily. It hardcodes the behavior that the
> > pending write actually is done during reset, which might not be ideal.
> > It could just check that the first sector is still intact instead.
> > 
> > If I should make this a proper test, I'd be happy about some guidance,
> > but not sure if required for such a specific one-off issue. After all,
> > a different variation of the bug might have written to some other
> > sector not covered by this test.
> > 
> 
> While trying to turn it into a proper test with Philippe's and Thomas's
> suggestions, I wanted to add a comment about the buffer size. So I tried
> figuring out what the "magic" value is. At the very beginning, I had
> tried 4 KiB, but then the callback wouldn't be pending, so I just picked
> 512 KiB for my proof-of-concept. It turns out to be racy though, and
> with a buffer size of 64 KiB, it is flaky whether or not the callback is
> still pending on my system. Should I just pick a large enough buffer
> size (maybe 4 MiB) and hope for the best?

If the problem is that the request may complete too fast, have you tried
using I/O throttling? This is a common approach in qemu-iotests.

Note however that a single big request won't be throttled. If you exceed
the limit, it's only the next request that has to wait until we made up
for the previous one. So you'll want to set the limit below the request
size of a first request (so that we do get some delay), but not much
lower (to avoid having to wait for too long), and then send the second
request that should be delayed for a bit.

Kevin
diff mbox series

Patch

diff --git a/tests/qtest/ahci-test.c b/tests/qtest/ahci-test.c
index abab761c26..3ebeb4e255 100644
--- a/tests/qtest/ahci-test.c
+++ b/tests/qtest/ahci-test.c
@@ -1401,6 +1401,84 @@  static void test_max(void)
     ahci_shutdown(ahci);
 }
 
+static void test_reset_with_pending_callback(void)
+{
+    AHCIQState *ahci;
+
+    ahci = ahci_boot(NULL);
+    ahci_test_pci_spec(ahci);
+    ahci_pci_enable(ahci);
+
+    int bufsize = 512 * 1024;
+    int offset1 = 0;
+    int offset2 = bufsize / AHCI_SECTOR_SIZE;
+
+    ahci_test_hba_spec(ahci);
+    ahci_hba_enable(ahci);
+    ahci_test_identify(ahci);
+
+    uint8_t port = ahci_port_select(ahci);
+    ahci_port_clear(ahci, port);
+
+    unsigned char *tx1 = g_malloc(bufsize);
+    unsigned char *tx2 = g_malloc(bufsize);
+    unsigned char *rx1 = g_malloc0(bufsize);
+    unsigned char *rx2 = g_malloc0(bufsize);
+    uint64_t ptr1 = ahci_alloc(ahci, bufsize);
+    uint64_t ptr2 = ahci_alloc(ahci, bufsize);
+
+    g_assert(ptr1 && ptr2);
+
+    /* Need two different patterns. */
+    do {
+        generate_pattern(tx1, bufsize, AHCI_SECTOR_SIZE);
+        generate_pattern(tx2, bufsize, AHCI_SECTOR_SIZE);
+    } while (memcmp(tx1, tx2, bufsize) == 0);
+
+    qtest_bufwrite(ahci->parent->qts, ptr1, tx1, bufsize);
+    qtest_bufwrite(ahci->parent->qts, ptr2, tx2, bufsize);
+
+    /* Write to beginning of disk to check it wasn't overwritten later. */
+    ahci_guest_io(ahci, port, CMD_WRITE_DMA_EXT, ptr1, bufsize, offset1);
+
+    /* Issue asynchronously to get a pending callback during reset. */
+    AHCICommand *cmd = ahci_command_create(CMD_WRITE_DMA_EXT);
+    ahci_command_adjust(cmd, offset2, ptr2, bufsize, 0);
+    ahci_command_commit(ahci, cmd, port);
+    ahci_command_issue_async(ahci, cmd);
+
+    ahci_set(ahci, AHCI_GHC, AHCI_GHC_HR);
+
+    ahci_command_free(cmd);
+
+    /* Start again. */
+    ahci_clean_mem(ahci);
+    ahci_pci_enable(ahci);
+    ahci_hba_enable(ahci);
+    port = ahci_port_select(ahci);
+    ahci_port_clear(ahci, port);
+
+    /* Read and verify. */
+    ahci_guest_io(ahci, port, CMD_READ_DMA_EXT, ptr1, bufsize, offset1);
+    qtest_bufread(ahci->parent->qts, ptr1, rx1, bufsize);
+    g_assert_cmphex(memcmp(tx1, rx1, bufsize), ==, 0);
+
+    ahci_guest_io(ahci, port, CMD_READ_DMA_EXT, ptr2, bufsize, offset2);
+    qtest_bufread(ahci->parent->qts, ptr2, rx2, bufsize);
+    g_assert_cmphex(memcmp(tx2, rx2, bufsize), ==, 0);
+
+    ahci_free(ahci, ptr1);
+    ahci_free(ahci, ptr2);
+    g_free(tx1);
+    g_free(tx2);
+    g_free(rx1);
+    g_free(rx2);
+
+    ahci_clean_mem(ahci);
+
+    ahci_shutdown(ahci);
+}
+
 static void test_reset(void)
 {
     AHCIQState *ahci;
@@ -1915,6 +1993,9 @@  int main(int argc, char **argv)
     g_assert(fd >= 0);
     close(fd);
 
+    qtest_add_func("/ahci/reset_with_pending_callback",
+                   test_reset_with_pending_callback);
+
     /* Run the tests */
     qtest_add_func("/ahci/sanity",     test_sanity);
     qtest_add_func("/ahci/pci_spec",   test_pci_spec);