diff mbox

[V1,1/1] tests: Add migration test for aarch64

Message ID 20180124212246.2352-1-wei@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wei Huang Jan. 24, 2018, 9:22 p.m. UTC
This patch adds the migration test support for aarch64. The test code,
which implements the same functionality as x86, is compiled into a binary
and booted as a kernel in qemu. Here are the design ideas:

 * We choose this -kernel design because aarch64 QEMU doesn't provide a
   built-in fw like x86 does. So instead of relying on a boot loader, we
   use -kernel approach for aarch64.
 * The serial output is sent to PL011 directly.
 * The physical memory base for mach-virt machine is 0x40000000. We change
   the start_address and end_address for aarch64.

RFC->V1:
 * aarch64 kernel blob is defined as a uint32_t array
 * The test code is re-written to address a data caching issue under KVM.
   Tests passed under both x86 and aarch64.
 * Re-use init_bootfile_x86() for both x86 and aarch64
 * Other minor fixes

Note that the test code is as the following:

.section .text

        .globl  start
        
start:
        /* disable MMU to use phys mem address */
        mrs     x0, sctlr_el1
        bic     x0, x0, #(1<<0)
        msr     sctlr_el1, x0
        isb

        /* output char 'A' to PL011 */
        mov     w4, #65
        mov     x5, #0x9000000
        strb    w4, [x5]

        /* w6 keeps a counter so we can limit the output speed */
        mov     w6, #0

        /* phys mem base addr = 0x40000000 */
        mov     x3, #(0x40000000 + 100 *1024*1024) /* traverse 1M-100M */
        mov     x2, #(0x40000000 + 1 * 1024*1024)

        /* clean up memory first */
        mov     w1, #0  
clean:
        strb    w1, [x2]
        add     x2, x2, #(4096)
        cmp     x2, x3
        ble     clean

        /* main body */
mainloop:
        mov     x2, #(0x40000000 + 1 * 1024*1024)
        
innerloop:
        /* clean cache because el2 might still cache guest data under KVM */
        dc      civac, x2
        ldrb    w1, [x2]
        add     w1, w1, #1
        and     w1, w1, #(0xff)
        strb    w1, [x2]

        add     x2, x2, #(4096)
        cmp     x2, x3
        blt     innerloop

        add     w6, w6, #1
        and     w6, w6, #(0xff)
        cmp     w6, #0
        bne     mainloop
        
        /* output char 'B' to PL011 */
        mov     w4, #66
        mov     x5, #0x9000000
        strb    w4, [x5]

        bl      mainloop

The code is compiled with the following commands:
 > gcc -c -o fill.o fill.s
 > gcc -O2 -o fill.elf -Wl,-T,/tmp/flat.lds,--build-id=none,-Ttext=40080000 \
   -nostdlib fill.o
 > objcopy -O binary fill.elf fill.flat
 > truncate -c -s 144 ./fill.flat
 > xxd -g 4 -c 24 -e fill.flat | awk '{print "0x"$2 ", " "0x"$3 ", " "0x"C$4 ", " "0x"C$5 ", " "0x"$6 ", " "0x"C$7 "," }'

The linker file (borrowed from KVM unit test) is defined as:

SECTIONS
{
    .text : { *(.init) *(.text) *(.text.*) }
    . = ALIGN(64K);
    etext = .;
    .data : {
        *(.data)
    }
    . = ALIGN(16);
    .rodata : { *(.rodata) }
    . = ALIGN(16);
    .bss : { *(.bss) }
    . = ALIGN(64K);
    edata = .;
    . += 64K;
    . = ALIGN(64K);
    /*
     * stack depth is 16K for arm and PAGE_SIZE for arm64, see THREAD_SIZE
     * sp must be 16 byte aligned for arm64, and 8 byte aligned for arm
     * sp must always be strictly less than the true stacktop
     */
    stackptr = . - 16;
    stacktop = .;
}

ENTRY(start)

Signed-off-by: Wei Huang <wei@redhat.com>
---
 tests/Makefile.include |  1 +
 tests/migration-test.c | 37 +++++++++++++++++++++++++++++++------
 2 files changed, 32 insertions(+), 6 deletions(-)

Comments

Dr. David Alan Gilbert Jan. 25, 2018, 8:05 p.m. UTC | #1
* Wei Huang (wei@redhat.com) wrote:
> This patch adds the migration test support for aarch64. The test code,
> which implements the same functionality as x86, is compiled into a binary
> and booted as a kernel in qemu. Here are the design ideas:
> 
>  * We choose this -kernel design because aarch64 QEMU doesn't provide a
>    built-in fw like x86 does. So instead of relying on a boot loader, we
>    use -kernel approach for aarch64.
>  * The serial output is sent to PL011 directly.
>  * The physical memory base for mach-virt machine is 0x40000000. We change
>    the start_address and end_address for aarch64.
> 
> RFC->V1:
>  * aarch64 kernel blob is defined as a uint32_t array
>  * The test code is re-written to address a data caching issue under KVM.
>    Tests passed under both x86 and aarch64.
>  * Re-use init_bootfile_x86() for both x86 and aarch64
>  * Other minor fixes
> 
> Note that the test code is as the following:
> 
> .section .text
> 
>         .globl  start
>         
> start:
>         /* disable MMU to use phys mem address */
>         mrs     x0, sctlr_el1
>         bic     x0, x0, #(1<<0)
>         msr     sctlr_el1, x0
>         isb
> 
>         /* output char 'A' to PL011 */
>         mov     w4, #65
>         mov     x5, #0x9000000
>         strb    w4, [x5]
> 
>         /* w6 keeps a counter so we can limit the output speed */
>         mov     w6, #0
> 
>         /* phys mem base addr = 0x40000000 */
>         mov     x3, #(0x40000000 + 100 *1024*1024) /* traverse 1M-100M */
>         mov     x2, #(0x40000000 + 1 * 1024*1024)
> 
>         /* clean up memory first */
>         mov     w1, #0  
> clean:
>         strb    w1, [x2]
>         add     x2, x2, #(4096)
>         cmp     x2, x3
>         ble     clean
> 
>         /* main body */
> mainloop:
>         mov     x2, #(0x40000000 + 1 * 1024*1024)
>         
> innerloop:
>         /* clean cache because el2 might still cache guest data under KVM */
>         dc      civac, x2

Can you explain a bit more about that please;  if it's guest
visible across migration, doesn't that suggest we need the cache clear
in KVM or QEMU?

Dave

>         ldrb    w1, [x2]
>         add     w1, w1, #1
>         and     w1, w1, #(0xff)
>         strb    w1, [x2]
> 
>         add     x2, x2, #(4096)
>         cmp     x2, x3
>         blt     innerloop
> 
>         add     w6, w6, #1
>         and     w6, w6, #(0xff)
>         cmp     w6, #0
>         bne     mainloop
>         
>         /* output char 'B' to PL011 */
>         mov     w4, #66
>         mov     x5, #0x9000000
>         strb    w4, [x5]
> 
>         bl      mainloop
> 
> The code is compiled with the following commands:
>  > gcc -c -o fill.o fill.s
>  > gcc -O2 -o fill.elf -Wl,-T,/tmp/flat.lds,--build-id=none,-Ttext=40080000 \
>    -nostdlib fill.o
>  > objcopy -O binary fill.elf fill.flat
>  > truncate -c -s 144 ./fill.flat
>  > xxd -g 4 -c 24 -e fill.flat | awk '{print "0x"$2 ", " "0x"$3 ", " "0x"C$4 ", " "0x"C$5 ", " "0x"$6 ", " "0x"C$7 "," }'
> 
> The linker file (borrowed from KVM unit test) is defined as:
> 
> SECTIONS
> {
>     .text : { *(.init) *(.text) *(.text.*) }
>     . = ALIGN(64K);
>     etext = .;
>     .data : {
>         *(.data)
>     }
>     . = ALIGN(16);
>     .rodata : { *(.rodata) }
>     . = ALIGN(16);
>     .bss : { *(.bss) }
>     . = ALIGN(64K);
>     edata = .;
>     . += 64K;
>     . = ALIGN(64K);
>     /*
>      * stack depth is 16K for arm and PAGE_SIZE for arm64, see THREAD_SIZE
>      * sp must be 16 byte aligned for arm64, and 8 byte aligned for arm
>      * sp must always be strictly less than the true stacktop
>      */
>     stackptr = . - 16;
>     stacktop = .;
> }
> 
> ENTRY(start)
> 
> Signed-off-by: Wei Huang <wei@redhat.com>
> ---
>  tests/Makefile.include |  1 +
>  tests/migration-test.c | 37 +++++++++++++++++++++++++++++++------
>  2 files changed, 32 insertions(+), 6 deletions(-)
> 
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index b4bcc872f2..2a520e53ab 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -357,6 +357,7 @@ check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF)
>  gcov-files-arm-y += hw/timer/arm_mptimer.c
>  
>  check-qtest-aarch64-y = tests/numa-test$(EXESUF)
> +check-qtest-aarch64-y += tests/migration-test$(EXESUF)
>  
>  check-qtest-microblazeel-y = $(check-qtest-microblaze-y)
>  
> diff --git a/tests/migration-test.c b/tests/migration-test.c
> index be598d3257..3237fe93b2 100644
> --- a/tests/migration-test.c
> +++ b/tests/migration-test.c
> @@ -22,8 +22,8 @@
>  
>  #define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */
>  
> -const unsigned start_address = 1024 * 1024;
> -const unsigned end_address = 100 * 1024 * 1024;
> +unsigned start_address = 1024 * 1024;
> +unsigned end_address = 100 * 1024 * 1024;
>  bool got_stop;
>  
>  #if defined(__linux__)
> @@ -79,7 +79,7 @@ static const char *tmpfs;
>  /* A simple PC boot sector that modifies memory (1-100MB) quickly
>   * outputing a 'B' every so often if it's still running.
>   */
> -unsigned char bootsect[] = {
> +unsigned char x86_bootsect[] = {
>    0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00,
>    0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00,
>    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02,
> @@ -125,11 +125,20 @@ unsigned char bootsect[] = {
>    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa
>  };
>  
> -static void init_bootfile_x86(const char *bootpath)
> +uint32_t aarch64_kernel[] = {
> +    0xd5381000, 0x927ff800, 0xd5181000, 0xd5033fdf, 0x52800824, 0xd2a12005,
> +    0x390000a4, 0x52800006, 0xd2a8c803, 0xd2a80202, 0x52800001, 0x39000041,
> +    0x91400442, 0xeb03005f, 0x54ffffad, 0xd2a80202, 0xd50b7e22, 0x39400041,
> +    0x11000421, 0x12001c21, 0x39000041, 0x91400442, 0xeb03005f, 0x54ffff2b,
> +    0x110004c6, 0x12001cc6, 0x710000df, 0x54fffe81, 0x52800844, 0xd2a12005,
> +    0x390000a4, 0x97fffff0, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
> +};
> +
> +static void init_bootfile(const char *bootpath, void *content)
>  {
>      FILE *bootfile = fopen(bootpath, "wb");
>  
> -    g_assert_cmpint(fwrite(bootsect, 512, 1, bootfile), ==, 1);
> +    g_assert_cmpint(fwrite(content, 512, 1, bootfile), ==, 1);
>      fclose(bootfile);
>  }
>  
> @@ -442,7 +451,7 @@ static void test_migrate_start(QTestState **from, QTestState **to,
>      got_stop = false;
>  
>      if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
> -        init_bootfile_x86(bootpath);
> +        init_bootfile(bootpath, x86_bootsect);
>          cmd_src = g_strdup_printf("-machine accel=kvm:tcg -m 150M"
>                                    " -name pcsource,debug-threads=on"
>                                    " -serial file:%s/src_serial"
> @@ -470,6 +479,22 @@ static void test_migrate_start(QTestState **from, QTestState **to,
>                                    " -serial file:%s/dest_serial"
>                                    " -incoming %s",
>                                    accel, tmpfs, uri);
> +    } else if (strcmp(arch, "aarch64") == 0) {
> +        init_bootfile(bootpath, aarch64_kernel);
> +        cmd_src = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M "
> +                                  "-name vmsource,debug-threads=on -cpu host "
> +                                  "-serial file:%s/src_serial "
> +                                  "-kernel %s ",
> +                                  tmpfs, bootpath);
> +        cmd_dst = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M "
> +                                  "-name vmdest,debug-threads=on -cpu host "
> +                                  "-serial file:%s/dest_serial "
> +                                  "-kernel %s "
> +                                  "-incoming %s ",
> +                                  tmpfs, bootpath, uri);
> +        /* aarch64 virt machine physical mem started from 0x40000000 */
> +        start_address += 0x40000000;
> +        end_address += 0x40000000;
>      } else {
>          g_assert_not_reached();
>      }
> -- 
> 2.14.3
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Wei Huang Jan. 26, 2018, 3:47 p.m. UTC | #2
On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote:
> * Wei Huang (wei@redhat.com) wrote:
>> This patch adds the migration test support for aarch64. The test code,
>> which implements the same functionality as x86, is compiled into a binary
>> and booted as a kernel in qemu. Here are the design ideas:
>>
>>  * We choose this -kernel design because aarch64 QEMU doesn't provide a
>>    built-in fw like x86 does. So instead of relying on a boot loader, we
>>    use -kernel approach for aarch64.
>>  * The serial output is sent to PL011 directly.
>>  * The physical memory base for mach-virt machine is 0x40000000. We change
>>    the start_address and end_address for aarch64.
>>
>> RFC->V1:
>>  * aarch64 kernel blob is defined as a uint32_t array
>>  * The test code is re-written to address a data caching issue under KVM.
>>    Tests passed under both x86 and aarch64.
>>  * Re-use init_bootfile_x86() for both x86 and aarch64
>>  * Other minor fixes
>>
>> Note that the test code is as the following:
>>
>> .section .text
>>
>>         .globl  start
>>         
>> start:
>>         /* disable MMU to use phys mem address */
>>         mrs     x0, sctlr_el1
>>         bic     x0, x0, #(1<<0)
>>         msr     sctlr_el1, x0
>>         isb
>>
>>         /* output char 'A' to PL011 */
>>         mov     w4, #65
>>         mov     x5, #0x9000000
>>         strb    w4, [x5]
>>
>>         /* w6 keeps a counter so we can limit the output speed */
>>         mov     w6, #0
>>
>>         /* phys mem base addr = 0x40000000 */
>>         mov     x3, #(0x40000000 + 100 *1024*1024) /* traverse 1M-100M */
>>         mov     x2, #(0x40000000 + 1 * 1024*1024)
>>
>>         /* clean up memory first */
>>         mov     w1, #0  
>> clean:
>>         strb    w1, [x2]
>>         add     x2, x2, #(4096)
>>         cmp     x2, x3
>>         ble     clean
>>
>>         /* main body */
>> mainloop:
>>         mov     x2, #(0x40000000 + 1 * 1024*1024)
>>         
>> innerloop:
>>         /* clean cache because el2 might still cache guest data under KVM */
>>         dc      civac, x2
> 
> Can you explain a bit more about that please;  if it's guest
> visible across migration, doesn't that suggest we need the cache clear
> in KVM or QEMU?

I think this is ARM specific. This is caused by the inconsistency
between guest VM's data accesses and userspace's accesses (in
check_guest_ram) at the destination:

1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So
the data accesses from guest VM go directly into memory.
2) QEMU user space will use the cacheable version. So it is possible
that data can come from cache, instead of RAM. The difference between
(1) and (2) obviously can cause check_guest_ram() to fail sometimes.

x86's EPT has the capability to ignore guest-provided memory type. So it
is possible to have consistent data views between guest and QEMU user-space.


> 
> Dave
> 
>>         ldrb    w1, [x2]
>>         add     w1, w1, #1
>>         and     w1, w1, #(0xff)
>>         strb    w1, [x2]
>>
>>         add     x2, x2, #(4096)
>>         cmp     x2, x3
>>         blt     innerloop
>>
>>         add     w6, w6, #1
>>         and     w6, w6, #(0xff)
>>         cmp     w6, #0
>>         bne     mainloop
>>         
>>         /* output char 'B' to PL011 */
>>         mov     w4, #66
>>         mov     x5, #0x9000000
>>         strb    w4, [x5]
>>
>>         bl      mainloop
>>
>> The code is compiled with the following commands:
>>  > gcc -c -o fill.o fill.s
>>  > gcc -O2 -o fill.elf -Wl,-T,/tmp/flat.lds,--build-id=none,-Ttext=40080000 \
>>    -nostdlib fill.o
>>  > objcopy -O binary fill.elf fill.flat
>>  > truncate -c -s 144 ./fill.flat
>>  > xxd -g 4 -c 24 -e fill.flat | awk '{print "0x"$2 ", " "0x"$3 ", " "0x"C$4 ", " "0x"C$5 ", " "0x"$6 ", " "0x"C$7 "," }'
>>
>> The linker file (borrowed from KVM unit test) is defined as:
>>
>> SECTIONS
>> {
>>     .text : { *(.init) *(.text) *(.text.*) }
>>     . = ALIGN(64K);
>>     etext = .;
>>     .data : {
>>         *(.data)
>>     }
>>     . = ALIGN(16);
>>     .rodata : { *(.rodata) }
>>     . = ALIGN(16);
>>     .bss : { *(.bss) }
>>     . = ALIGN(64K);
>>     edata = .;
>>     . += 64K;
>>     . = ALIGN(64K);
>>     /*
>>      * stack depth is 16K for arm and PAGE_SIZE for arm64, see THREAD_SIZE
>>      * sp must be 16 byte aligned for arm64, and 8 byte aligned for arm
>>      * sp must always be strictly less than the true stacktop
>>      */
>>     stackptr = . - 16;
>>     stacktop = .;
>> }
>>
>> ENTRY(start)
>>
>> Signed-off-by: Wei Huang <wei@redhat.com>
>> ---
>>  tests/Makefile.include |  1 +
>>  tests/migration-test.c | 37 +++++++++++++++++++++++++++++++------
>>  2 files changed, 32 insertions(+), 6 deletions(-)
>>
>> diff --git a/tests/Makefile.include b/tests/Makefile.include
>> index b4bcc872f2..2a520e53ab 100644
>> --- a/tests/Makefile.include
>> +++ b/tests/Makefile.include
>> @@ -357,6 +357,7 @@ check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF)
>>  gcov-files-arm-y += hw/timer/arm_mptimer.c
>>  
>>  check-qtest-aarch64-y = tests/numa-test$(EXESUF)
>> +check-qtest-aarch64-y += tests/migration-test$(EXESUF)
>>  
>>  check-qtest-microblazeel-y = $(check-qtest-microblaze-y)
>>  
>> diff --git a/tests/migration-test.c b/tests/migration-test.c
>> index be598d3257..3237fe93b2 100644
>> --- a/tests/migration-test.c
>> +++ b/tests/migration-test.c
>> @@ -22,8 +22,8 @@
>>  
>>  #define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */
>>  
>> -const unsigned start_address = 1024 * 1024;
>> -const unsigned end_address = 100 * 1024 * 1024;
>> +unsigned start_address = 1024 * 1024;
>> +unsigned end_address = 100 * 1024 * 1024;
>>  bool got_stop;
>>  
>>  #if defined(__linux__)
>> @@ -79,7 +79,7 @@ static const char *tmpfs;
>>  /* A simple PC boot sector that modifies memory (1-100MB) quickly
>>   * outputing a 'B' every so often if it's still running.
>>   */
>> -unsigned char bootsect[] = {
>> +unsigned char x86_bootsect[] = {
>>    0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00,
>>    0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00,
>>    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02,
>> @@ -125,11 +125,20 @@ unsigned char bootsect[] = {
>>    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa
>>  };
>>  
>> -static void init_bootfile_x86(const char *bootpath)
>> +uint32_t aarch64_kernel[] = {
>> +    0xd5381000, 0x927ff800, 0xd5181000, 0xd5033fdf, 0x52800824, 0xd2a12005,
>> +    0x390000a4, 0x52800006, 0xd2a8c803, 0xd2a80202, 0x52800001, 0x39000041,
>> +    0x91400442, 0xeb03005f, 0x54ffffad, 0xd2a80202, 0xd50b7e22, 0x39400041,
>> +    0x11000421, 0x12001c21, 0x39000041, 0x91400442, 0xeb03005f, 0x54ffff2b,
>> +    0x110004c6, 0x12001cc6, 0x710000df, 0x54fffe81, 0x52800844, 0xd2a12005,
>> +    0x390000a4, 0x97fffff0, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
>> +};
>> +
>> +static void init_bootfile(const char *bootpath, void *content)
>>  {
>>      FILE *bootfile = fopen(bootpath, "wb");
>>  
>> -    g_assert_cmpint(fwrite(bootsect, 512, 1, bootfile), ==, 1);
>> +    g_assert_cmpint(fwrite(content, 512, 1, bootfile), ==, 1);
>>      fclose(bootfile);
>>  }
>>  
>> @@ -442,7 +451,7 @@ static void test_migrate_start(QTestState **from, QTestState **to,
>>      got_stop = false;
>>  
>>      if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
>> -        init_bootfile_x86(bootpath);
>> +        init_bootfile(bootpath, x86_bootsect);
>>          cmd_src = g_strdup_printf("-machine accel=kvm:tcg -m 150M"
>>                                    " -name pcsource,debug-threads=on"
>>                                    " -serial file:%s/src_serial"
>> @@ -470,6 +479,22 @@ static void test_migrate_start(QTestState **from, QTestState **to,
>>                                    " -serial file:%s/dest_serial"
>>                                    " -incoming %s",
>>                                    accel, tmpfs, uri);
>> +    } else if (strcmp(arch, "aarch64") == 0) {
>> +        init_bootfile(bootpath, aarch64_kernel);
>> +        cmd_src = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M "
>> +                                  "-name vmsource,debug-threads=on -cpu host "
>> +                                  "-serial file:%s/src_serial "
>> +                                  "-kernel %s ",
>> +                                  tmpfs, bootpath);
>> +        cmd_dst = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M "
>> +                                  "-name vmdest,debug-threads=on -cpu host "
>> +                                  "-serial file:%s/dest_serial "
>> +                                  "-kernel %s "
>> +                                  "-incoming %s ",
>> +                                  tmpfs, bootpath, uri);
>> +        /* aarch64 virt machine physical mem started from 0x40000000 */
>> +        start_address += 0x40000000;
>> +        end_address += 0x40000000;
>>      } else {
>>          g_assert_not_reached();
>>      }
>> -- 
>> 2.14.3
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Peter Maydell Jan. 26, 2018, 4:39 p.m. UTC | #3
On 26 January 2018 at 15:47, Wei Huang <wei@redhat.com> wrote:
>
>
> On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote:
>> * Wei Huang (wei@redhat.com) wrote:
>>> innerloop:
>>>         /* clean cache because el2 might still cache guest data under KVM */
>>>         dc      civac, x2
>>
>> Can you explain a bit more about that please;  if it's guest
>> visible across migration, doesn't that suggest we need the cache clear
>> in KVM or QEMU?
>
> I think this is ARM specific. This is caused by the inconsistency
> between guest VM's data accesses and userspace's accesses (in
> check_guest_ram) at the destination:
>
> 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So
> the data accesses from guest VM go directly into memory.
> 2) QEMU user space will use the cacheable version. So it is possible
> that data can come from cache, instead of RAM. The difference between
> (1) and (2) obviously can cause check_guest_ram() to fail sometimes.

I think the correct fix here is that your test code should turn
its MMU on. Trying to treat guest RAM as uncacheable doesn't work
for Arm KVM guests (for the same reason that VGA device video memory
doesn't work). If it's RAM your guest has to arrange to map it as
Normal Cacheable, and then everything should work fine.

thanks
-- PMM
Wei Huang Jan. 26, 2018, 5:08 p.m. UTC | #4
On 01/26/2018 10:39 AM, Peter Maydell wrote:
> On 26 January 2018 at 15:47, Wei Huang <wei@redhat.com> wrote:
>>
>>
>> On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote:
>>> * Wei Huang (wei@redhat.com) wrote:
>>>> innerloop:
>>>>         /* clean cache because el2 might still cache guest data under KVM */
>>>>         dc      civac, x2
>>>
>>> Can you explain a bit more about that please;  if it's guest
>>> visible across migration, doesn't that suggest we need the cache clear
>>> in KVM or QEMU?
>>
>> I think this is ARM specific. This is caused by the inconsistency
>> between guest VM's data accesses and userspace's accesses (in
>> check_guest_ram) at the destination:
>>
>> 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So
>> the data accesses from guest VM go directly into memory.
>> 2) QEMU user space will use the cacheable version. So it is possible
>> that data can come from cache, instead of RAM. The difference between
>> (1) and (2) obviously can cause check_guest_ram() to fail sometimes.
> 
> I think the correct fix here is that your test code should turn
> its MMU on. Trying to treat guest RAM as uncacheable doesn't work

I did try MMU-on while debugging this problem. It was a migration unit
test written on top of kvm-unit-tests and I forced QEMU's
tests/migration-test.c to use this .flat file as -kernel parameter. I
didn't see any memory check failures using this approach. So I can
confirm your suggestion.

But turning MMU on means a much larger binary blob. I know you don't
like the existing migration-test.c approach. Building a more complicated
test case and embedding it in migration-test.c code will make the
situation worse. Because of this, I chose the current version: small and
manageable (even with one extra cache flush instruction).

> for Arm KVM guests (for the same reason that VGA device video memory
> doesn't work). If it's RAM your guest has to arrange to map it as
> Normal Cacheable, and then everything should work fine.
> 
> thanks
> -- PMM
>
Dr. David Alan Gilbert Jan. 26, 2018, 7:46 p.m. UTC | #5
* Peter Maydell (peter.maydell@linaro.org) wrote:
> On 26 January 2018 at 15:47, Wei Huang <wei@redhat.com> wrote:
> >
> >
> > On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote:
> >> * Wei Huang (wei@redhat.com) wrote:
> >>> innerloop:
> >>>         /* clean cache because el2 might still cache guest data under KVM */
> >>>         dc      civac, x2
> >>
> >> Can you explain a bit more about that please;  if it's guest
> >> visible across migration, doesn't that suggest we need the cache clear
> >> in KVM or QEMU?
> >
> > I think this is ARM specific. This is caused by the inconsistency
> > between guest VM's data accesses and userspace's accesses (in
> > check_guest_ram) at the destination:
> >
> > 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So
> > the data accesses from guest VM go directly into memory.
> > 2) QEMU user space will use the cacheable version. So it is possible
> > that data can come from cache, instead of RAM. The difference between
> > (1) and (2) obviously can cause check_guest_ram() to fail sometimes.
> 
> I think the correct fix here is that your test code should turn
> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
> for Arm KVM guests (for the same reason that VGA device video memory
> doesn't work). If it's RAM your guest has to arrange to map it as
> Normal Cacheable, and then everything should work fine.

Does this cause problems with migrating at just the wrong point during
a VM boot?

Dave

> thanks
> -- PMM
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Maydell Jan. 28, 2018, 3:08 p.m. UTC | #6
On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> * Peter Maydell (peter.maydell@linaro.org) wrote:
>> I think the correct fix here is that your test code should turn
>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>> for Arm KVM guests (for the same reason that VGA device video memory
>> doesn't work). If it's RAM your guest has to arrange to map it as
>> Normal Cacheable, and then everything should work fine.
>
> Does this cause problems with migrating at just the wrong point during
> a VM boot?

It wouldn't surprise me if it did, but I don't think I've ever
tried to provoke that problem...

thanks
-- PMM
Dr. David Alan Gilbert Jan. 29, 2018, 9:53 a.m. UTC | #7
* Peter Maydell (peter.maydell@linaro.org) wrote:
> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > * Peter Maydell (peter.maydell@linaro.org) wrote:
> >> I think the correct fix here is that your test code should turn
> >> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
> >> for Arm KVM guests (for the same reason that VGA device video memory
> >> doesn't work). If it's RAM your guest has to arrange to map it as
> >> Normal Cacheable, and then everything should work fine.
> >
> > Does this cause problems with migrating at just the wrong point during
> > a VM boot?
> 
> It wouldn't surprise me if it did, but I don't think I've ever
> tried to provoke that problem...

If you think it'll get the RAM contents wrong, it might be best to fail
the migration if you can detect the cache is disabled in the guest.

Dave

> thanks
> -- PMM
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Maydell Jan. 29, 2018, 10:04 a.m. UTC | #8
On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> * Peter Maydell (peter.maydell@linaro.org) wrote:
>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>> > * Peter Maydell (peter.maydell@linaro.org) wrote:
>> >> I think the correct fix here is that your test code should turn
>> >> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>> >> for Arm KVM guests (for the same reason that VGA device video memory
>> >> doesn't work). If it's RAM your guest has to arrange to map it as
>> >> Normal Cacheable, and then everything should work fine.
>> >
>> > Does this cause problems with migrating at just the wrong point during
>> > a VM boot?
>>
>> It wouldn't surprise me if it did, but I don't think I've ever
>> tried to provoke that problem...
>
> If you think it'll get the RAM contents wrong, it might be best to fail
> the migration if you can detect the cache is disabled in the guest.

I guess QEMU could look at the value of the "MMU disabled/enabled" bit
in the guest's system registers, and refuse migration if it's off...

(cc'd Marc, Christoffer to check that I don't have the wrong end
of the stick about how thin the ice is in the period before the
guest turns on its MMU...)

thanks
-- PMM
Dr. David Alan Gilbert Jan. 29, 2018, 10:19 a.m. UTC | #9
* Peter Maydell (peter.maydell@linaro.org) wrote:
> On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > * Peter Maydell (peter.maydell@linaro.org) wrote:
> >> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> >> > * Peter Maydell (peter.maydell@linaro.org) wrote:
> >> >> I think the correct fix here is that your test code should turn
> >> >> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
> >> >> for Arm KVM guests (for the same reason that VGA device video memory
> >> >> doesn't work). If it's RAM your guest has to arrange to map it as
> >> >> Normal Cacheable, and then everything should work fine.
> >> >
> >> > Does this cause problems with migrating at just the wrong point during
> >> > a VM boot?
> >>
> >> It wouldn't surprise me if it did, but I don't think I've ever
> >> tried to provoke that problem...
> >
> > If you think it'll get the RAM contents wrong, it might be best to fail
> > the migration if you can detect the cache is disabled in the guest.
> 
> I guess QEMU could look at the value of the "MMU disabled/enabled" bit
> in the guest's system registers, and refuse migration if it's off...
> 
> (cc'd Marc, Christoffer to check that I don't have the wrong end
> of the stick about how thin the ice is in the period before the
> guest turns on its MMU...)

OK; I mean it's not nice either way - but a failed migration is better
than one with corrupt RAM.

It's not pretty for cloud users; they do host-evacuations and the like
fully automatically without any knowledge of the state of a VM and hope
for migrations to work in any state.

Dave

> thanks
> -- PMM
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Marc Zyngier Jan. 29, 2018, 10:32 a.m. UTC | #10
On 29/01/18 10:04, Peter Maydell wrote:
> On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>> I think the correct fix here is that your test code should turn
>>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>>> Normal Cacheable, and then everything should work fine.
>>>>
>>>> Does this cause problems with migrating at just the wrong point during
>>>> a VM boot?
>>>
>>> It wouldn't surprise me if it did, but I don't think I've ever
>>> tried to provoke that problem...
>>
>> If you think it'll get the RAM contents wrong, it might be best to fail
>> the migration if you can detect the cache is disabled in the guest.
> 
> I guess QEMU could look at the value of the "MMU disabled/enabled" bit
> in the guest's system registers, and refuse migration if it's off...
> 
> (cc'd Marc, Christoffer to check that I don't have the wrong end
> of the stick about how thin the ice is in the period before the
> guest turns on its MMU...)

Once MMU and caches are on, we should be in a reasonable place for QEMU
to have a consistent view of the memory. The trick is to prevent the
vcpus from changing that. A guest could perfectly turn off its MMU at
any given time if it needs to (and it is actually required on some HW if
you want to mitigate headlining CVEs), and KVM won't know about that.

You may have to pause the vcpus before starting the migration, or
introduce a new KVM feature that would automatically pause a vcpu that
is trying to disable its MMU while the migration is on. This would
involve trapping all the virtual memory related system registers, with
an obvious cost. But that cost would be limited to the time it takes to
migrate the memory, so maybe that's acceptable.

Thoughts?

	M.
Christoffer Dall Jan. 31, 2018, 9:53 a.m. UTC | #11
On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
> On 29/01/18 10:04, Peter Maydell wrote:
> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
> >>>>> I think the correct fix here is that your test code should turn
> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
> >>>>> Normal Cacheable, and then everything should work fine.
> >>>>
> >>>> Does this cause problems with migrating at just the wrong point during
> >>>> a VM boot?
> >>>
> >>> It wouldn't surprise me if it did, but I don't think I've ever
> >>> tried to provoke that problem...
> >>
> >> If you think it'll get the RAM contents wrong, it might be best to fail
> >> the migration if you can detect the cache is disabled in the guest.
> > 
> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
> > in the guest's system registers, and refuse migration if it's off...
> > 
> > (cc'd Marc, Christoffer to check that I don't have the wrong end
> > of the stick about how thin the ice is in the period before the
> > guest turns on its MMU...)
> 
> Once MMU and caches are on, we should be in a reasonable place for QEMU
> to have a consistent view of the memory. The trick is to prevent the
> vcpus from changing that. A guest could perfectly turn off its MMU at
> any given time if it needs to (and it is actually required on some HW if
> you want to mitigate headlining CVEs), and KVM won't know about that.
> 

(Clarification: KVM can detect this if it bothers to check the VCPU's
system registers, but we don't trap to KVM when the VCPU turns off its
caches, right?)

> You may have to pause the vcpus before starting the migration, or
> introduce a new KVM feature that would automatically pause a vcpu that
> is trying to disable its MMU while the migration is on. This would
> involve trapping all the virtual memory related system registers, with
> an obvious cost. But that cost would be limited to the time it takes to
> migrate the memory, so maybe that's acceptable.
> 
Is that even sufficient?

What if the following happened. (1) guest turns off MMU, (2) guest
writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
guest ram.  QEMU's view of guest ram is now incorrect (stale,
incoherent, ...).

I'm also not really sure if pausing one VCPU because it turned off its
MMU will go very well when trying to migrate a large VM (wouldn't this
ask for all the other VCPUs beginning to complain that the stopped VCPU
appears to be dead?).  As a short-term 'fix' it's probably better to
refuse migration if you detect that a VCPU had begun turning off its
MMU.

On the larger scale of things, this appears to me to be another case of
us really needing some way to coherently access memory between QEMU and
the VM, but in the case of the VCPU turning off the MMU prior to
migration, we don't even know where it may have written data, and I'm
therefore not really sure what the 'proper' solution would be.

(cc'ing Ard who has thought about this problem before in the context
of UEFI and VGA.)

Thanks,
-Christoffer
Ard Biesheuvel Jan. 31, 2018, 3:18 p.m. UTC | #12
On 31 January 2018 at 09:53, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>> On 29/01/18 10:04, Peter Maydell wrote:
>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>> >>>>> I think the correct fix here is that your test code should turn
>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>> >>>>> Normal Cacheable, and then everything should work fine.
>> >>>>
>> >>>> Does this cause problems with migrating at just the wrong point during
>> >>>> a VM boot?
>> >>>
>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>> >>> tried to provoke that problem...
>> >>
>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>> >> the migration if you can detect the cache is disabled in the guest.
>> >
>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>> > in the guest's system registers, and refuse migration if it's off...
>> >
>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>> > of the stick about how thin the ice is in the period before the
>> > guest turns on its MMU...)
>>
>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>> to have a consistent view of the memory. The trick is to prevent the
>> vcpus from changing that. A guest could perfectly turn off its MMU at
>> any given time if it needs to (and it is actually required on some HW if
>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>
>
> (Clarification: KVM can detect this is it bother to check the VCPU's
> system registers, but we don't trap to KVM when the VCPU turns off its
> caches, right?)
>
>> You may have to pause the vcpus before starting the migration, or
>> introduce a new KVM feature that would automatically pause a vcpu that
>> is trying to disable its MMU while the migration is on. This would
>> involve trapping all the virtual memory related system registers, with
>> an obvious cost. But that cost would be limited to the time it takes to
>> migrate the memory, so maybe that's acceptable.
>>
> Is that even sufficient?
>
> What if the following happened. (1) guest turns off MMU, (2) guest
> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
> guest ram.  QEMU's view of guest ram is now incorrect (stale,
> incoherent, ...).
>
> I'm also not really sure if pausing one VCPU because it turned off its
> MMU will go very well when trying to migrate a large VM (wouldn't this
> ask for all the other VCPUs beginning to complain that the stopped VCPU
> appears to be dead?).  As a short-term 'fix' it's probably better to
> refuse migration if you detect that a VCPU had begun turning off its
> MMU.
>
> On the larger scale of thins; this appears to me to be another case of
> us really needing some way to coherently access memory between QEMU and
> the VM, but in the case of the VCPU turning off the MMU prior to
> migration, we don't even know where it may have written data, and I'm
> therefore not really sure what the 'proper' solution would be.
>
> (cc'ing Ard who has has thought about this problem before in the context
> of UEFI and VGA.)
>

Actually, the VGA case is much simpler because the host is not
expected to write to the framebuffer, only read from it, and the guest
is not expected to create a cacheable mapping for it, so any
incoherency can be trivially solved by cache invalidation on the host
side. (Note that this has nothing to do with DMA coherency, but only
with PCI MMIO BARs that are backed by DRAM in the host)

In the migration case, it is much more complicated, and I think
capturing the state of the VM in a way that takes incoherency between
caches and main memory into account is simply infeasible (i.e., the
act of recording the state of guest RAM via a cached mapping may evict
clean cachelines that are out of sync, and so it is impossible to
record both the cached *and* the delta with the uncached state)

I wonder how difficult it would be to
a) enable trapping of the MMU system register when a guest CPU is
found to have its MMU off at migration time
b) allow the guest CPU to complete whatever it thinks it needs to be
doing with the MMU off
c) once it re-enables the MMU, proceed with capturing the memory state

Full disclosure: I know very little about KVM migration ...
Dr. David Alan Gilbert Jan. 31, 2018, 3:23 p.m. UTC | #13
* Ard Biesheuvel (ard.biesheuvel@linaro.org) wrote:
> On 31 January 2018 at 09:53, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
> > On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
> >> On 29/01/18 10:04, Peter Maydell wrote:
> >> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> >> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
> >> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> >> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
> >> >>>>> I think the correct fix here is that your test code should turn
> >> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
> >> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
> >> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
> >> >>>>> Normal Cacheable, and then everything should work fine.
> >> >>>>
> >> >>>> Does this cause problems with migrating at just the wrong point during
> >> >>>> a VM boot?
> >> >>>
> >> >>> It wouldn't surprise me if it did, but I don't think I've ever
> >> >>> tried to provoke that problem...
> >> >>
> >> >> If you think it'll get the RAM contents wrong, it might be best to fail
> >> >> the migration if you can detect the cache is disabled in the guest.
> >> >
> >> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
> >> > in the guest's system registers, and refuse migration if it's off...
> >> >
> >> > (cc'd Marc, Christoffer to check that I don't have the wrong end
> >> > of the stick about how thin the ice is in the period before the
> >> > guest turns on its MMU...)
> >>
> >> Once MMU and caches are on, we should be in a reasonable place for QEMU
> >> to have a consistent view of the memory. The trick is to prevent the
> >> vcpus from changing that. A guest could perfectly turn off its MMU at
> >> any given time if it needs to (and it is actually required on some HW if
> >> you want to mitigate headlining CVEs), and KVM won't know about that.
> >>
> >
> > (Clarification: KVM can detect this is it bother to check the VCPU's
> > system registers, but we don't trap to KVM when the VCPU turns off its
> > caches, right?)
> >
> >> You may have to pause the vcpus before starting the migration, or
> >> introduce a new KVM feature that would automatically pause a vcpu that
> >> is trying to disable its MMU while the migration is on. This would
> >> involve trapping all the virtual memory related system registers, with
> >> an obvious cost. But that cost would be limited to the time it takes to
> >> migrate the memory, so maybe that's acceptable.
> >>
> > Is that even sufficient?
> >
> > What if the following happened. (1) guest turns off MMU, (2) guest
> > writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
> > guest ram.  QEMU's view of guest ram is now incorrect (stale,
> > incoherent, ...).
> >
> > I'm also not really sure if pausing one VCPU because it turned off its
> > MMU will go very well when trying to migrate a large VM (wouldn't this
> > ask for all the other VCPUs beginning to complain that the stopped VCPU
> > appears to be dead?).  As a short-term 'fix' it's probably better to
> > refuse migration if you detect that a VCPU had begun turning off its
> > MMU.
> >
> > On the larger scale of thins; this appears to me to be another case of
> > us really needing some way to coherently access memory between QEMU and
> > the VM, but in the case of the VCPU turning off the MMU prior to
> > migration, we don't even know where it may have written data, and I'm
> > therefore not really sure what the 'proper' solution would be.
> >
> > (cc'ing Ard who has has thought about this problem before in the context
> > of UEFI and VGA.)
> >
> 
> Actually, the VGA case is much simpler because the host is not
> expected to write to the framebuffer, only read from it, and the guest
> is not expected to create a cacheable mapping for it, so any
> incoherency can be trivially solved by cache invalidation on the host
> side. (Note that this has nothing to do with DMA coherency, but only
> with PCI MMIO BARs that are backed by DRAM in the host)
> 
> In the migration case, it is much more complicated, and I think
> capturing the state of the VM in a way that takes incoherency between
> caches and main memory into account is simply infeasible (i.e., the
> act of recording the state of guest RAM via a cached mapping may evict
> clean cachelines that are out of sync, and so it is impossible to
> record both the cached *and* the delta with the uncached state)
> 
> I wonder how difficult it would be to
> a) enable trapping of the MMU system register when a guest CPU is
> found to have its MMU off at migration time
> b) allow the guest CPU to complete whatever it thinks it needs to be
> doing with the MMU off
> c) once it re-enables the MMU, proceed with capturing the memory state
> 
> Full disclosure: I know very little about KVM migration ...

The difficulty is that migration is 'live' - i.e. the guest is running
while we're copying the data across; that means that a guest might
do any of these MMU things multiple times - so if we wait for it
to be right, will it go back to being wrong?  How long do you wait?
(It's not a bad hack if that's the best we can do though).

Now of course 'live' itself sounds scary for consistency, but the only thing we really
require is that a page is marked dirty some time after it's been
written to so that we cause it to be sent again and that we
eventually send a correct version; it's ok for us to be sending
inconsistent versions as long as we eventually send the right
version.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Christoffer Dall Jan. 31, 2018, 4:53 p.m. UTC | #14
On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> On 31 January 2018 at 09:53, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>> On 29/01/18 10:04, Peter Maydell wrote:
>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>> >>>>> I think the correct fix here is that your test code should turn
>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>> >>>>> Normal Cacheable, and then everything should work fine.
>>> >>>>
>>> >>>> Does this cause problems with migrating at just the wrong point during
>>> >>>> a VM boot?
>>> >>>
>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>> >>> tried to provoke that problem...
>>> >>
>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>> >> the migration if you can detect the cache is disabled in the guest.
>>> >
>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>> > in the guest's system registers, and refuse migration if it's off...
>>> >
>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>> > of the stick about how thin the ice is in the period before the
>>> > guest turns on its MMU...)
>>>
>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>> to have a consistent view of the memory. The trick is to prevent the
>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>> any given time if it needs to (and it is actually required on some HW if
>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>
>>
>> (Clarification: KVM can detect this is it bother to check the VCPU's
>> system registers, but we don't trap to KVM when the VCPU turns off its
>> caches, right?)
>>
>>> You may have to pause the vcpus before starting the migration, or
>>> introduce a new KVM feature that would automatically pause a vcpu that
>>> is trying to disable its MMU while the migration is on. This would
>>> involve trapping all the virtual memory related system registers, with
>>> an obvious cost. But that cost would be limited to the time it takes to
>>> migrate the memory, so maybe that's acceptable.
>>>
>> Is that even sufficient?
>>
>> What if the following happened. (1) guest turns off MMU, (2) guest
>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>> incoherent, ...).
>>
>> I'm also not really sure if pausing one VCPU because it turned off its
>> MMU will go very well when trying to migrate a large VM (wouldn't this
>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>> appears to be dead?).  As a short-term 'fix' it's probably better to
>> refuse migration if you detect that a VCPU had begun turning off its
>> MMU.
>>
>> On the larger scale of thins; this appears to me to be another case of
>> us really needing some way to coherently access memory between QEMU and
>> the VM, but in the case of the VCPU turning off the MMU prior to
>> migration, we don't even know where it may have written data, and I'm
>> therefore not really sure what the 'proper' solution would be.
>>
>> (cc'ing Ard who has has thought about this problem before in the context
>> of UEFI and VGA.)
>>
>
> Actually, the VGA case is much simpler because the host is not
> expected to write to the framebuffer, only read from it, and the guest
> is not expected to create a cacheable mapping for it, so any
> incoherency can be trivially solved by cache invalidation on the host
> side. (Note that this has nothing to do with DMA coherency, but only
> with PCI MMIO BARs that are backed by DRAM in the host)

In case of the running guest, the host will also only read from the
cached mapping.  Of course, at restore, the host will also write
through a cached mapping, but shouldn't the latter case be solvable by
having KVM clean the cache lines when faulting in any page?

>
> In the migration case, it is much more complicated, and I think
> capturing the state of the VM in a way that takes incoherency between
> caches and main memory into account is simply infeasible (i.e., the
> act of recording the state of guest RAM via a cached mapping may evict
> clean cachelines that are out of sync, and so it is impossible to
> record both the cached *and* the delta with the uncached state)

This may be an incredibly stupid question (and I may have asked it
before), but why can't we clean+invalidate the guest page before
reading it and thereby obtain a coherent view of a page?

>
> I wonder how difficult it would be to
> a) enable trapping of the MMU system register when a guest CPU is
> found to have its MMU off at migration time
> b) allow the guest CPU to complete whatever it thinks it needs to be
> doing with the MMU off
> c) once it re-enables the MMU, proceed with capturing the memory state
>
I don't think this is impossible, but it would prevent migration of
things like bare-metal guests or weirdo unikernel guests running with
the MMU off, wouldn't it?

Thanks,
-Christoffer
Ard Biesheuvel Jan. 31, 2018, 4:59 p.m. UTC | #15
On 31 January 2018 at 16:53, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> On 31 January 2018 at 09:53, Christoffer Dall
>> <christoffer.dall@linaro.org> wrote:
>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>>> On 29/01/18 10:04, Peter Maydell wrote:
>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>> >>>>> I think the correct fix here is that your test code should turn
>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>> >>>>> Normal Cacheable, and then everything should work fine.
>>>> >>>>
>>>> >>>> Does this cause problems with migrating at just the wrong point during
>>>> >>>> a VM boot?
>>>> >>>
>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>>> >>> tried to provoke that problem...
>>>> >>
>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>>> >> the migration if you can detect the cache is disabled in the guest.
>>>> >
>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>>> > in the guest's system registers, and refuse migration if it's off...
>>>> >
>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>>> > of the stick about how thin the ice is in the period before the
>>>> > guest turns on its MMU...)
>>>>
>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>>> to have a consistent view of the memory. The trick is to prevent the
>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>>> any given time if it needs to (and it is actually required on some HW if
>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>>
>>>
>>> (Clarification: KVM can detect this is it bother to check the VCPU's
>>> system registers, but we don't trap to KVM when the VCPU turns off its
>>> caches, right?)
>>>
>>>> You may have to pause the vcpus before starting the migration, or
>>>> introduce a new KVM feature that would automatically pause a vcpu that
>>>> is trying to disable its MMU while the migration is on. This would
>>>> involve trapping all the virtual memory related system registers, with
>>>> an obvious cost. But that cost would be limited to the time it takes to
>>>> migrate the memory, so maybe that's acceptable.
>>>>
>>> Is that even sufficient?
>>>
>>> What if the following happened. (1) guest turns off MMU, (2) guest
>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>>> incoherent, ...).
>>>
>>> I'm also not really sure if pausing one VCPU because it turned off its
>>> MMU will go very well when trying to migrate a large VM (wouldn't this
>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>>> appears to be dead?).  As a short-term 'fix' it's probably better to
>>> refuse migration if you detect that a VCPU had begun turning off its
>>> MMU.
>>>
>>> On the larger scale of thins; this appears to me to be another case of
>>> us really needing some way to coherently access memory between QEMU and
>>> the VM, but in the case of the VCPU turning off the MMU prior to
>>> migration, we don't even know where it may have written data, and I'm
>>> therefore not really sure what the 'proper' solution would be.
>>>
>>> (cc'ing Ard who has has thought about this problem before in the context
>>> of UEFI and VGA.)
>>>
>>
>> Actually, the VGA case is much simpler because the host is not
>> expected to write to the framebuffer, only read from it, and the guest
>> is not expected to create a cacheable mapping for it, so any
>> incoherency can be trivially solved by cache invalidation on the host
>> side. (Note that this has nothing to do with DMA coherency, but only
>> with PCI MMIO BARs that are backed by DRAM in the host)
>
> In case of the running guest, the host will also only read from the
> cached mapping.  Of course, at restore, the host will also write
> through a cached mapping, but shouldn't the latter case be solvable by
> having KVM clean the cache lines when faulting in any page?
>

We are still talking about the contents of the framebuffer, right? In
that case, yes, afaict

>>
>> In the migration case, it is much more complicated, and I think
>> capturing the state of the VM in a way that takes incoherency between
>> caches and main memory into account is simply infeasible (i.e., the
>> act of recording the state of guest RAM via a cached mapping may evict
>> clean cachelines that are out of sync, and so it is impossible to
>> record both the cached *and* the delta with the uncached state)
>
> This may be an incredibly stupid question (and I may have asked it
> before), but why can't we clean+invalidate the guest page before
> reading it and thereby obtain a coherent view of a page?
>

Because cleaning from the host will clobber whatever the guest wrote
directly to memory with the MMU off, if there is a dirty cacheline
shadowing that memory. However, that same cacheline could be dirty
because the guest itself wrote to memory with the MMU on.

So in general, it is hard to tell which version of the data is the
correct one without having intimate knowledge of how the guest is
using which parts of memory with the MMU off (the same applies to
non-coherent DMA btw), and so whether clean or invalidate is the
correct operation to perform from the host is not trivial to decide.

>>
>> I wonder how difficult it would be to
>> a) enable trapping of the MMU system register when a guest CPU is
>> found to have its MMU off at migration time
>> b) allow the guest CPU to complete whatever it thinks it needs to be
>> doing with the MMU off
>> c) once it re-enables the MMU, proceed with capturing the memory state
>>
> I don't think this is impossible, but it would prevent migration of
> things like bare-metal guests or weirdo unikernel guests running with
> the MMU off, wouldn't it?
>

Yep.
Christoffer Dall Jan. 31, 2018, 5:39 p.m. UTC | #16
On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> On 31 January 2018 at 16:53, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
>> <ard.biesheuvel@linaro.org> wrote:
>>> On 31 January 2018 at 09:53, Christoffer Dall
>>> <christoffer.dall@linaro.org> wrote:
>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>>>> On 29/01/18 10:04, Peter Maydell wrote:
>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>> >>>>> I think the correct fix here is that your test code should turn
>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>>> >>>>> Normal Cacheable, and then everything should work fine.
>>>>> >>>>
>>>>> >>>> Does this cause problems with migrating at just the wrong point during
>>>>> >>>> a VM boot?
>>>>> >>>
>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>>>> >>> tried to provoke that problem...
>>>>> >>
>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>>>> >> the migration if you can detect the cache is disabled in the guest.
>>>>> >
>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>>>> > in the guest's system registers, and refuse migration if it's off...
>>>>> >
>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>>>> > of the stick about how thin the ice is in the period before the
>>>>> > guest turns on its MMU...)
>>>>>
>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>>>> to have a consistent view of the memory. The trick is to prevent the
>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>>>> any given time if it needs to (and it is actually required on some HW if
>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>>>
>>>>
>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
>>>> system registers, but we don't trap to KVM when the VCPU turns off its
>>>> caches, right?)
>>>>
>>>>> You may have to pause the vcpus before starting the migration, or
>>>>> introduce a new KVM feature that would automatically pause a vcpu that
>>>>> is trying to disable its MMU while the migration is on. This would
>>>>> involve trapping all the virtual memory related system registers, with
>>>>> an obvious cost. But that cost would be limited to the time it takes to
>>>>> migrate the memory, so maybe that's acceptable.
>>>>>
>>>> Is that even sufficient?
>>>>
>>>> What if the following happened. (1) guest turns off MMU, (2) guest
>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>>>> incoherent, ...).
>>>>
>>>> I'm also not really sure if pausing one VCPU because it turned off its
>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
>>>> refuse migration if you detect that a VCPU had begun turning off its
>>>> MMU.
>>>>
>>>> On the larger scale of things; this appears to me to be another case of
>>>> us really needing some way to coherently access memory between QEMU and
>>>> the VM, but in the case of the VCPU turning off the MMU prior to
>>>> migration, we don't even know where it may have written data, and I'm
>>>> therefore not really sure what the 'proper' solution would be.
>>>>
>>>> (cc'ing Ard who has thought about this problem before in the context
>>>> of UEFI and VGA.)
>>>>
>>>
>>> Actually, the VGA case is much simpler because the host is not
>>> expected to write to the framebuffer, only read from it, and the guest
>>> is not expected to create a cacheable mapping for it, so any
>>> incoherency can be trivially solved by cache invalidation on the host
>>> side. (Note that this has nothing to do with DMA coherency, but only
>>> with PCI MMIO BARs that are backed by DRAM in the host)
>>
>> In case of the running guest, the host will also only read from the
>> cached mapping.  Of course, at restore, the host will also write
>> through a cached mapping, but shouldn't the latter case be solvable by
>> having KVM clean the cache lines when faulting in any page?
>>
>
> We are still talking about the contents of the framebuffer, right? In
> that case, yes, afaict
>

I was talking about normal RAM actually... not sure if that changes anything?

>>>
>>> In the migration case, it is much more complicated, and I think
>>> capturing the state of the VM in a way that takes incoherency between
>>> caches and main memory into account is simply infeasible (i.e., the
>>> act of recording the state of guest RAM via a cached mapping may evict
>>> clean cachelines that are out of sync, and so it is impossible to
>>> record both the cached *and* the delta with the uncached state)
>>
>> This may be an incredibly stupid question (and I may have asked it
>> before), but why can't we clean+invalidate the guest page before
>> reading it and thereby obtain a coherent view of a page?
>>
>
> Because cleaning from the host will clobber whatever the guest wrote
> directly to memory with the MMU off, if there is a dirty cacheline
> shadowing that memory.

If the host never wrote anything to that memory (it shouldn't mess
with the guest's memory) there will only be clean cache lines (even if
they contain content shadowing the memory) and cleaning them would be
equivalent to an invalidate.  Am I misremembering how this works?

> However, that same cacheline could be dirty
> because the guest itself wrote to memory with the MMU on.

Yes, but the guest would have no control over when such a cache line
gets flushed to main memory by the hardware, and can have no
reasonable expectation that the cache lines don't get cleaned behind
its back.  The fact that a migration triggers this, is reasonable.  A
guest that wants hand-off from main memory that it's accessing with the
MMU off, must invalidate the appropriate cache lines or ensure they're
clean.  There's very likely some subtle aspect to all of this that I'm
forgetting.

Thanks,
-Christoffer
Ard Biesheuvel Jan. 31, 2018, 6 p.m. UTC | #17
On 31 January 2018 at 17:39, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> On 31 January 2018 at 16:53, Christoffer Dall
>> <christoffer.dall@linaro.org> wrote:
>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
>>> <ard.biesheuvel@linaro.org> wrote:
>>>> On 31 January 2018 at 09:53, Christoffer Dall
>>>> <christoffer.dall@linaro.org> wrote:
>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>>>>> On 29/01/18 10:04, Peter Maydell wrote:
>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>> >>>>> I think the correct fix here is that your test code should turn
>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>>>> >>>>> Normal Cacheable, and then everything should work fine.
>>>>>> >>>>
>>>>>> >>>> Does this cause problems with migrating at just the wrong point during
>>>>>> >>>> a VM boot?
>>>>>> >>>
>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>>>>> >>> tried to provoke that problem...
>>>>>> >>
>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>>>>> >> the migration if you can detect the cache is disabled in the guest.
>>>>>> >
>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>>>>> > in the guest's system registers, and refuse migration if it's off...
>>>>>> >
>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>>>>> > of the stick about how thin the ice is in the period before the
>>>>>> > guest turns on its MMU...)
>>>>>>
>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>>>>> to have a consistent view of the memory. The trick is to prevent the
>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>>>>> any given time if it needs to (and it is actually required on some HW if
>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>>>>
>>>>>
>>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
>>>>> system registers, but we don't trap to KVM when the VCPU turns off its
>>>>> caches, right?)
>>>>>
>>>>>> You may have to pause the vcpus before starting the migration, or
>>>>>> introduce a new KVM feature that would automatically pause a vcpu that
>>>>>> is trying to disable its MMU while the migration is on. This would
>>>>>> involve trapping all the virtual memory related system registers, with
>>>>>> an obvious cost. But that cost would be limited to the time it takes to
>>>>>> migrate the memory, so maybe that's acceptable.
>>>>>>
>>>>> Is that even sufficient?
>>>>>
>>>>> What if the following happened. (1) guest turns off MMU, (2) guest
>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>>>>> incoherent, ...).
>>>>>
>>>>> I'm also not really sure if pausing one VCPU because it turned off its
>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
>>>>> refuse migration if you detect that a VCPU had begun turning off its
>>>>> MMU.
>>>>>
>>>>> On the larger scale of things; this appears to me to be another case of
>>>>> us really needing some way to coherently access memory between QEMU and
>>>>> the VM, but in the case of the VCPU turning off the MMU prior to
>>>>> migration, we don't even know where it may have written data, and I'm
>>>>> therefore not really sure what the 'proper' solution would be.
>>>>>
>>>>> (cc'ing Ard who has thought about this problem before in the context
>>>>> of UEFI and VGA.)
>>>>>
>>>>
>>>> Actually, the VGA case is much simpler because the host is not
>>>> expected to write to the framebuffer, only read from it, and the guest
>>>> is not expected to create a cacheable mapping for it, so any
>>>> incoherency can be trivially solved by cache invalidation on the host
>>>> side. (Note that this has nothing to do with DMA coherency, but only
>>>> with PCI MMIO BARs that are backed by DRAM in the host)
>>>
>>> In case of the running guest, the host will also only read from the
>>> cached mapping.  Of course, at restore, the host will also write
>>> through a cached mapping, but shouldn't the latter case be solvable by
>>> having KVM clean the cache lines when faulting in any page?
>>>
>>
>> We are still talking about the contents of the framebuffer, right? In
>> that case, yes, afaict
>>
>
> I was talking about normal RAM actually... not sure if that changes anything?
>

The main difference is that with a framebuffer BAR, it is pointless
for the guest to map it cacheable, given that the purpose of a
framebuffer is its side effects, which are not guaranteed to occur
timely if the mapping is cacheable.

If we are talking about normal RAM, then why are we discussing it here
and not down there?

vvv


>>>>
>>>> In the migration case, it is much more complicated, and I think
>>>> capturing the state of the VM in a way that takes incoherency between
>>>> caches and main memory into account is simply infeasible (i.e., the
>>>> act of recording the state of guest RAM via a cached mapping may evict
>>>> clean cachelines that are out of sync, and so it is impossible to
>>>> record both the cached *and* the delta with the uncached state)
>>>
>>> This may be an incredibly stupid question (and I may have asked it
>>> before), but why can't we clean+invalidate the guest page before
>>> reading it and thereby obtain a coherent view of a page?
>>>
>>
>> Because cleaning from the host will clobber whatever the guest wrote
>> directly to memory with the MMU off, if there is a dirty cacheline
>> shadowing that memory.
>
> If the host never wrote anything to that memory (it shouldn't mess
> with the guest's memory) there will only be clean cache lines (even if
> they contain content shadowing the memory) and cleaning them would be
> equivalent to an invalidate.  Am I misremembering how this works?
>

Cleaning doesn't actually invalidate, but it should be a no-op for
clean cachelines.

>> However, that same cacheline could be dirty
>> because the guest itself wrote to memory with the MMU on.
>
> Yes, but the guest would have no control over when such a cache line
> gets flushed to main memory by the hardware, and can have no
> reasonable expectation that the cache lines don't get cleaned behind
> its back.  The fact that a migration triggers this, is reasonable.  A
>> guest that wants hand-off from main memory that it's accessing with the
> MMU off, must invalidate the appropriate cache lines or ensure they're
> clean.  There's very likely some subtle aspect to all of this that I'm
> forgetting.
>

OK, so if the only way cachelines covering guest memory could be dirty
is after the guest wrote to that memory itself via a cacheable
mapping, I guess it would be reasonable to do clean+invalidate before
reading the memory. Then, the only way for the guest to lose anything
is in cases where it could not reasonably expect it to be retained
anyway.

However, that does leave a window, between the invalidate and the
read, where the guest could modify memory without it being visible to
the host.
Christoffer Dall Jan. 31, 2018, 7:12 p.m. UTC | #18
On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> On 31 January 2018 at 17:39, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
>> <ard.biesheuvel@linaro.org> wrote:
>>> On 31 January 2018 at 16:53, Christoffer Dall
>>> <christoffer.dall@linaro.org> wrote:
>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>> On 31 January 2018 at 09:53, Christoffer Dall
>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>>>>>> On 29/01/18 10:04, Peter Maydell wrote:
>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>> >>>>> I think the correct fix here is that your test code should turn
>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>>>>> >>>>> Normal Cacheable, and then everything should work fine.
>>>>>>> >>>>
>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during
>>>>>>> >>>> a VM boot?
>>>>>>> >>>
>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>>>>>> >>> tried to provoke that problem...
>>>>>>> >>
>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>>>>>> >> the migration if you can detect the cache is disabled in the guest.
>>>>>>> >
>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>>>>>> > in the guest's system registers, and refuse migration if it's off...
>>>>>>> >
>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>>>>>> > of the stick about how thin the ice is in the period before the
>>>>>>> > guest turns on its MMU...)
>>>>>>>
>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>>>>>> to have a consistent view of the memory. The trick is to prevent the
>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>>>>>> any given time if it needs to (and it is actually required on some HW if
>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>>>>>
>>>>>>
>>>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its
>>>>>> caches, right?)
>>>>>>
>>>>>>> You may have to pause the vcpus before starting the migration, or
>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that
>>>>>>> is trying to disable its MMU while the migration is on. This would
>>>>>>> involve trapping all the virtual memory related system registers, with
>>>>>>> an obvious cost. But that cost would be limited to the time it takes to
>>>>>>> migrate the memory, so maybe that's acceptable.
>>>>>>>
>>>>>> Is that even sufficient?
>>>>>>
>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest
>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>>>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>>>>>> incoherent, ...).
>>>>>>
>>>>>> I'm also not really sure if pausing one VCPU because it turned off its
>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>>>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
>>>>>> refuse migration if you detect that a VCPU had begun turning off its
>>>>>> MMU.
>>>>>>
>>>>>> On the larger scale of things; this appears to me to be another case of
>>>>>> us really needing some way to coherently access memory between QEMU and
>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to
>>>>>> migration, we don't even know where it may have written data, and I'm
>>>>>> therefore not really sure what the 'proper' solution would be.
>>>>>>
>>>>>> (cc'ing Ard who has thought about this problem before in the context
>>>>>> of UEFI and VGA.)
>>>>>>
>>>>>
>>>>> Actually, the VGA case is much simpler because the host is not
>>>>> expected to write to the framebuffer, only read from it, and the guest
>>>>> is not expected to create a cacheable mapping for it, so any
>>>>> incoherency can be trivially solved by cache invalidation on the host
>>>>> side. (Note that this has nothing to do with DMA coherency, but only
>>>>> with PCI MMIO BARs that are backed by DRAM in the host)
>>>>
>>>> In case of the running guest, the host will also only read from the
>>>> cached mapping.  Of course, at restore, the host will also write
>>>> through a cached mapping, but shouldn't the latter case be solvable by
>>>> having KVM clean the cache lines when faulting in any page?
>>>>
>>>
>>> We are still talking about the contents of the framebuffer, right? In
>>> that case, yes, afaict
>>>
>>
>> I was talking about normal RAM actually... not sure if that changes anything?
>>
>
> The main difference is that with a framebuffer BAR, it is pointless
> for the guest to map it cacheable, given that the purpose of a
> framebuffer is its side effects, which are not guaranteed to occur
> timely if the mapping is cacheable.
>
> If we are talking about normal RAM, then why are we discussing it here
> and not down there?
>

Because I was trying to figure out how the challenge of accessing the
VGA framebuffer differs from the challenge of accessing guest RAM
which may have been written by the guest with the MMU off.

First approximation, they are extremely similar because the guest is
writing uncached to memory, which the host now has to access via a
cached mapping.

But I'm guessing that a "clean+invalidate before read on the host"
solution will result in terrible performance for a framebuffer and
therefore isn't a good solution for that problem...

>
>
>>>>>
>>>>> In the migration case, it is much more complicated, and I think
>>>>> capturing the state of the VM in a way that takes incoherency between
>>>>> caches and main memory into account is simply infeasible (i.e., the
>>>>> act of recording the state of guest RAM via a cached mapping may evict
>>>>> clean cachelines that are out of sync, and so it is impossible to
>>>>> record both the cached *and* the delta with the uncached state)
>>>>
>>>> This may be an incredibly stupid question (and I may have asked it
>>>> before), but why can't we clean+invalidate the guest page before
>>>> reading it and thereby obtain a coherent view of a page?
>>>>
>>>
>>> Because cleaning from the host will clobber whatever the guest wrote
>>> directly to memory with the MMU off, if there is a dirty cacheline
>>> shadowing that memory.
>>
>> If the host never wrote anything to that memory (it shouldn't mess
>> with the guest's memory) there will only be clean cache lines (even if
>> they contain content shadowing the memory) and cleaning them would be
>> equivalent to an invalidate.  Am I misremembering how this works?
>>
>
> Cleaning doesn't actually invalidate, but it should be a no-op for
> clean cachelines.
>
>>> However, that same cacheline could be dirty
>>> because the guest itself wrote to memory with the MMU on.
>>
>> Yes, but the guest would have no control over when such a cache line
>> gets flushed to main memory by the hardware, and can have no
>> reasonable expectation that the cache lines don't get cleaned behind
>> its back.  The fact that a migration triggers this, is reasonable.  A
>>> guest that wants hand-off from main memory that it's accessing with the
>> MMU off, must invalidate the appropriate cache lines or ensure they're
>> clean.  There's very likely some subtle aspect to all of this that I'm
>> forgetting.
>>
>
> OK, so if the only way cachelines covering guest memory could be dirty
> is after the guest wrote to that memory itself via a cacheable
> mapping, I guess it would be reasonable to do clean+invalidate before
> reading the memory. Then, the only way for the guest to lose anything
> is in cases where it could not reasonably expect it to be retained
> anyway.

Right, that's what I'm thinking.

>
> However, that does leave a window, between the invalidate and the
> read, where the guest could modify memory without it being visible to
> the host.

Is that a problem specific to the coherency challenge?  I thought this
problem was already addressed by dirty page tracking, but there's likely
some interaction with the cache maintenance that we'd have to figure
out.

Thanks,
-Christoffer
Ard Biesheuvel Jan. 31, 2018, 8:15 p.m. UTC | #19
On 31 January 2018 at 19:12, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> On 31 January 2018 at 17:39, Christoffer Dall
>> <christoffer.dall@linaro.org> wrote:
>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
>>> <ard.biesheuvel@linaro.org> wrote:
>>>> On 31 January 2018 at 16:53, Christoffer Dall
>>>> <christoffer.dall@linaro.org> wrote:
>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
>>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>>> On 31 January 2018 at 09:53, Christoffer Dall
>>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote:
>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>>> >>>>> I think the correct fix here is that your test code should turn
>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine.
>>>>>>>> >>>>
>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during
>>>>>>>> >>>> a VM boot?
>>>>>>>> >>>
>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>>>>>>> >>> tried to provoke that problem...
>>>>>>>> >>
>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>>>>>>> >> the migration if you can detect the cache is disabled in the guest.
>>>>>>>> >
>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>>>>>>> > in the guest's system registers, and refuse migration if it's off...
>>>>>>>> >
>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>>>>>>> > of the stick about how thin the ice is in the period before the
>>>>>>>> > guest turns on its MMU...)
>>>>>>>>
>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>>>>>>> to have a consistent view of the memory. The trick is to prevent the
>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>>>>>>> any given time if it needs to (and it is actually required on some HW if
>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>>>>>>
>>>>>>>
>>>>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its
>>>>>>> caches, right?)
>>>>>>>
>>>>>>>> You may have to pause the vcpus before starting the migration, or
>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that
>>>>>>>> is trying to disable its MMU while the migration is on. This would
>>>>>>>> involve trapping all the virtual memory related system registers, with
>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to
>>>>>>>> migrate the memory, so maybe that's acceptable.
>>>>>>>>
>>>>>>> Is that even sufficient?
>>>>>>>
>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest
>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>>>>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>>>>>>> incoherent, ...).
>>>>>>>
>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its
>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>>>>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
>>>>>>> refuse migration if you detect that a VCPU had begun turning off its
>>>>>>> MMU.
>>>>>>>
>>>>>>> On the larger scale of things; this appears to me to be another case of
>>>>>>> us really needing some way to coherently access memory between QEMU and
>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to
>>>>>>> migration, we don't even know where it may have written data, and I'm
>>>>>>> therefore not really sure what the 'proper' solution would be.
>>>>>>>
>>>>>>> (cc'ing Ard who has thought about this problem before in the context
>>>>>>> of UEFI and VGA.)
>>>>>>>
>>>>>>
>>>>>> Actually, the VGA case is much simpler because the host is not
>>>>>> expected to write to the framebuffer, only read from it, and the guest
>>>>>> is not expected to create a cacheable mapping for it, so any
>>>>>> incoherency can be trivially solved by cache invalidation on the host
>>>>>> side. (Note that this has nothing to do with DMA coherency, but only
>>>>>> with PCI MMIO BARs that are backed by DRAM in the host)
>>>>>
>>>>> In case of the running guest, the host will also only read from the
>>>>> cached mapping.  Of course, at restore, the host will also write
>>>>> through a cached mapping, but shouldn't the latter case be solvable by
>>>>> having KVM clean the cache lines when faulting in any page?
>>>>>
>>>>
>>>> We are still talking about the contents of the framebuffer, right? In
>>>> that case, yes, afaict
>>>>
>>>
>>> I was talking about normal RAM actually... not sure if that changes anything?
>>>
>>
>> The main difference is that with a framebuffer BAR, it is pointless
>> for the guest to map it cacheable, given that the purpose of a
>> framebuffer is its side effects, which are not guaranteed to occur
>> timely if the mapping is cacheable.
>>
>> If we are talking about normal RAM, then why are we discussing it here
>> and not down there?
>>
>
> Because I was trying to figure out how the challenge of accessing the
> VGA framebuffer differs from the challenge of accessing guest RAM
> which may have been written by the guest with the MMU off.
>
> First approximation, they are extremely similar because the guest is
> writing uncached to memory, which the host now has to access via a
> cached mapping.
>
> But I'm guessing that a "clean+invalidate before read on the host"
> solution will result in terrible performance for a framebuffer and
> therefore isn't a good solution for that problem...
>

That highly depends on where 'not working' resides on the performance
scale. Currently, VGA on KVM simply does not work at all, and so
working but slow would be a huge improvement over the current
situation.

Also, the performance hit is caused by the fact that the data needs to
make a round trip to memory, and the invalidation (without cleaning)
performed by the host shouldn't make that much worse than it
fundamentally is to begin with.

A paravirtualized framebuffer (as was proposed recently by Gerd I
think?) would solve this, since the guest can just map it as
cacheable.

>>
>>
>>>>>>
>>>>>> In the migration case, it is much more complicated, and I think
>>>>>> capturing the state of the VM in a way that takes incoherency between
>>>>>> caches and main memory into account is simply infeasible (i.e., the
>>>>>> act of recording the state of guest RAM via a cached mapping may evict
>>>>>> clean cachelines that are out of sync, and so it is impossible to
>>>>>> record both the cached *and* the delta with the uncached state)
>>>>>
>>>>> This may be an incredibly stupid question (and I may have asked it
>>>>> before), but why can't we clean+invalidate the guest page before
>>>>> reading it and thereby obtain a coherent view of a page?
>>>>>
>>>>
>>>> Because cleaning from the host will clobber whatever the guest wrote
>>>> directly to memory with the MMU off, if there is a dirty cacheline
>>>> shadowing that memory.
>>>
>>> If the host never wrote anything to that memory (it shouldn't mess
>>> with the guest's memory) there will only be clean cache lines (even if
>>> they contain content shadowing the memory) and cleaning them would be
>>> equivalent to an invalidate.  Am I misremembering how this works?
>>>
>>
>> Cleaning doesn't actually invalidate, but it should be a no-op for
>> clean cachelines.
>>
>>>> However, that same cacheline could be dirty
>>>> because the guest itself wrote to memory with the MMU on.
>>>
>>> Yes, but the guest would have no control over when such a cache line
>>> gets flushed to main memory by the hardware, and can have no
>>> reasonable expectation that the cache lines don't get cleaned behind
>>> its back.  The fact that a migration triggers this, is reasonable.  A
>>> guest that wants hand-off from main memory that it's accessing with the
>>> MMU off, must invalidate the appropriate cache lines or ensure they're
>>> clean.  There's very likely some subtle aspect to all of this that I'm
>>> forgetting.
>>>
>>
>> OK, so if the only way cachelines covering guest memory could be dirty
>> is after the guest wrote to that memory itself via a cacheable
>> mapping, I guess it would be reasonable to do clean+invalidate before
>> reading the memory. Then, the only way for the guest to lose anything
>> is in cases where it could not reasonably expect it to be retained
>> anyway.
>
> Right, that's what I'm thinking.
>
>>
>> However, that does leave a window, between the invalidate and the
>> read, where the guest could modify memory without it being visible to
>> the host.
>
> Is that a problem specific to the coherency challenge?  I thought this
> problem was already addressed by dirty page tracking, but there's likely
> some interaction with the cache maintenance that we'd have to figure
> out.
>

I don't know how dirty page tracking works exactly, but if that can
track direct writes to memory as easily as cached writes, it would
probably cover this as well.
Christoffer Dall Feb. 1, 2018, 9:17 a.m. UTC | #20
On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> On 31 January 2018 at 19:12, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel
>> <ard.biesheuvel@linaro.org> wrote:
>>> On 31 January 2018 at 17:39, Christoffer Dall
>>> <christoffer.dall@linaro.org> wrote:
>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>> On 31 January 2018 at 16:53, Christoffer Dall
>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
>>>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall
>>>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote:
>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn
>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine.
>>>>>>>>> >>>>
>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during
>>>>>>>>> >>>> a VM boot?
>>>>>>>>> >>>
>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>>>>>>>> >>> tried to provoke that problem...
>>>>>>>>> >>
>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest.
>>>>>>>>> >
>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>>>>>>>> > in the guest's system registers, and refuse migration if it's off...
>>>>>>>>> >
>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>>>>>>>> > of the stick about how thin the ice is in the period before the
>>>>>>>>> > guest turns on its MMU...)
>>>>>>>>>
>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the
>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>>>>>>>> any given time if it needs to (and it is actually required on some HW if
>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>>>>>>>
>>>>>>>>
>>>>>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its
>>>>>>>> caches, right?)
>>>>>>>>
>>>>>>>>> You may have to pause the vcpus before starting the migration, or
>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that
>>>>>>>>> is trying to disable its MMU while the migration is on. This would
>>>>>>>>> involve trapping all the virtual memory related system registers, with
>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to
>>>>>>>>> migrate the memory, so maybe that's acceptable.
>>>>>>>>>
>>>>>>>> Is that even sufficient?
>>>>>>>>
>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest
>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>>>>>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>>>>>>>> incoherent, ...).
>>>>>>>>
>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its
>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>>>>>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its
>>>>>>>> MMU.
>>>>>>>>
>>>>>>>> On the larger scale of things; this appears to me to be another case of
>>>>>>>> us really needing some way to coherently access memory between QEMU and
>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to
>>>>>>>> migration, we don't even know where it may have written data, and I'm
>>>>>>>> therefore not really sure what the 'proper' solution would be.
>>>>>>>>
>>>>>>>> (cc'ing Ard who has thought about this problem before in the context
>>>>>>>> of UEFI and VGA.)
>>>>>>>>
>>>>>>>
>>>>>>> Actually, the VGA case is much simpler because the host is not
>>>>>>> expected to write to the framebuffer, only read from it, and the guest
>>>>>>> is not expected to create a cacheable mapping for it, so any
>>>>>>> incoherency can be trivially solved by cache invalidation on the host
>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only
>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host)
>>>>>>
>>>>>> In case of the running guest, the host will also only read from the
>>>>>> cached mapping.  Of course, at restore, the host will also write
>>>>>> through a cached mapping, but shouldn't the latter case be solvable by
>>>>>> having KVM clean the cache lines when faulting in any page?
>>>>>>
>>>>>
>>>>> We are still talking about the contents of the framebuffer, right? In
>>>>> that case, yes, afaict
>>>>>
>>>>
>>>> I was talking about normal RAM actually... not sure if that changes anything?
>>>>
>>>
>>> The main difference is that with a framebuffer BAR, it is pointless
>>> for the guest to map it cacheable, given that the purpose of a
>>> framebuffer is its side effects, which are not guaranteed to occur
>>> timely if the mapping is cacheable.
>>>
>>> If we are talking about normal RAM, then why are we discussing it here
>>> and not down there?
>>>
>>
>> Because I was trying to figure out how the challenge of accessing the
>> VGA framebuffer differs from the challenge of accessing guest RAM
>> which may have been written by the guest with the MMU off.
>>
>> First approximation, they are extremely similar because the guest is
>> writing uncached to memory, which the host now has to access via a
>> cached mapping.
>>
>> But I'm guessing that a "clean+invalidate before read on the host"
>> solution will result in terrible performance for a framebuffer and
>> therefore isn't a good solution for that problem...
>>
>
> That highly depends on where 'not working' resides on the performance
> scale. Currently, VGA on KVM simply does not work at all, and so
> working but slow would be a huge improvement over the current
> situation.
>
> Also, the performance hit is caused by the fact that the data needs to
> make a round trip to memory, and the invalidation (without cleaning)
> performed by the host shouldn't make that much worse than it
> fundamentally is to begin with.
>

True, but Marc pointed out (on IRC) that the cache maintenance
instructions are broadcast across all CPUs and may therefore adversely
affect the performance of your entire system by running an emulated
VGA framebuffer on a subset of the host CPUs.  However, he also
pointed out that the necessary cache maintenance can be done in EL0
and we wouldn't even need a new ioctl or anything else from KVM or the
kernel to do this.  I think we should measure the effect of this
before dismissing it though.

> A paravirtualized framebuffer (as was proposed recently by Gerd I
> think?) would solve this, since the guest can just map it as
> cacheable.
>

Yes, but it seems to me that the problem of incoherent views of memory
between the guest and QEMU keeps coming up, and we therefore need to
have a fix for this in software.

>>>
>>>
>>>>>>>
>>>>>>> In the migration case, it is much more complicated, and I think
>>>>>>> capturing the state of the VM in a way that takes incoherency between
>>>>>>> caches and main memory into account is simply infeasible (i.e., the
>>>>>>> act of recording the state of guest RAM via a cached mapping may evict
>>>>>>> clean cachelines that are out of sync, and so it is impossible to
>>>>>>> record both the cached *and* the delta with the uncached state)
>>>>>>
>>>>>> This may be an incredibly stupid question (and I may have asked it
>>>>>> before), but why can't we clean+invalidate the guest page before
>>>>>> reading it and thereby obtain a coherent view of a page?
>>>>>>
>>>>>
>>>>> Because cleaning from the host will clobber whatever the guest wrote
>>>>> directly to memory with the MMU off, if there is a dirty cacheline
>>>>> shadowing that memory.
>>>>
>>>> If the host never wrote anything to that memory (it shouldn't mess
>>>> with the guest's memory) there will only be clean cache lines (even if
>>>> they contain content shadowing the memory) and cleaning them would be
>>>> equivalent to an invalidate.  Am I misremembering how this works?
>>>>
>>>
>>> Cleaning doesn't actually invalidate, but it should be a no-op for
>>> clean cachelines.
>>>
>>>>> However, that same cacheline could be dirty
>>>>> because the guest itself wrote to memory with the MMU on.
>>>>
>>>> Yes, but the guest would have no control over when such a cache line
>>>> gets flushed to main memory by the hardware, and can have no
>>>> reasonable expectation that the cache lines don't get cleaned behind
>>>> its back.  The fact that a migration triggers this, is reasonable.  A
>>>> guest that wants hand-off from main memory that it's accessing with the
>>>> MMU off, must invalidate the appropriate cache lines or ensure they're
>>>> clean.  There's very likely some subtle aspect to all of this that I'm
>>>> forgetting.
>>>>
>>>
>>> OK, so if the only way cachelines covering guest memory could be dirty
>>> is after the guest wrote to that memory itself via a cacheable
>>> mapping, I guess it would be reasonable to do clean+invalidate before
>>> reading the memory. Then, the only way for the guest to lose anything
>>> is in cases where it could not reasonably expect it to be retained
>>> anyway.
>>
>> Right, that's what I'm thinking.
>>
>>>
>>> However, that does leave a window, between the invalidate and the
>>> read, where the guest could modify memory without it being visible to
>>> the host.
>>
>> Is that a problem specific to the coherency challenge?  I thought this
>> problem was already addressed by dirty page tracking, but there's likely
>> some interaction with the cache maintenance that we'd have to figure
>> out.
>>
>
> I don't know how dirty page tracking works exactly, but if that can
> track direct writes to memory as easily as cached writes, it would
> probably cover this as well.

I don't see why it shouldn't.  Guest pages get mapped as read-only in
stage 2, and their memory attributes shouldn't be able to affect
permission checks (one can only hope).

Unless I'm missing something fundamental, I think we should add
functionality in QEMU to clean+invalidate pages on aarch64 hosts and
see if that solves both VGA and migration, and if so, what the
potential performance impacts are.

Thanks,
-Christoffer
Ard Biesheuvel Feb. 1, 2018, 9:33 a.m. UTC | #21
On 1 February 2018 at 09:17, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> On 31 January 2018 at 19:12, Christoffer Dall
>> <christoffer.dall@linaro.org> wrote:
>>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel
>>> <ard.biesheuvel@linaro.org> wrote:
>>>> On 31 January 2018 at 17:39, Christoffer Dall
>>>> <christoffer.dall@linaro.org> wrote:
>>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
>>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>>> On 31 January 2018 at 16:53, Christoffer Dall
>>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
>>>>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall
>>>>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote:
>>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn
>>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine.
>>>>>>>>>> >>>>
>>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during
>>>>>>>>>> >>>> a VM boot?
>>>>>>>>>> >>>
>>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>>>>>>>>> >>> tried to provoke that problem...
>>>>>>>>>> >>
>>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest.
>>>>>>>>>> >
>>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>>>>>>>>> > in the guest's system registers, and refuse migration if it's off...
>>>>>>>>>> >
>>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>>>>>>>>> > of the stick about how thin the ice is in the period before the
>>>>>>>>>> > guest turns on its MMU...)
>>>>>>>>>>
>>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the
>>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>>>>>>>>> any given time if it needs to (and it is actually required on some HW if
>>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
>>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its
>>>>>>>>> caches, right?)
>>>>>>>>>
>>>>>>>>>> You may have to pause the vcpus before starting the migration, or
>>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that
>>>>>>>>>> is trying to disable its MMU while the migration is on. This would
>>>>>>>>>> involve trapping all the virtual memory related system registers, with
>>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to
>>>>>>>>>> migrate the memory, so maybe that's acceptable.
>>>>>>>>>>
>>>>>>>>> Is that even sufficient?
>>>>>>>>>
>>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest
>>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>>>>>>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>>>>>>>>> incoherent, ...).
>>>>>>>>>
>>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its
>>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
>>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>>>>>>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
>>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its
>>>>>>>>> MMU.
>>>>>>>>>
>>>>>>>>> On the larger scale of things; this appears to me to be another case of
>>>>>>>>> us really needing some way to coherently access memory between QEMU and
>>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to
>>>>>>>>> migration, we don't even know where it may have written data, and I'm
>>>>>>>>> therefore not really sure what the 'proper' solution would be.
>>>>>>>>>
>>>>>>>>> (cc'ing Ard who has thought about this problem before in the context
>>>>>>>>> of UEFI and VGA.)
>>>>>>>>>
>>>>>>>>
>>>>>>>> Actually, the VGA case is much simpler because the host is not
>>>>>>>> expected to write to the framebuffer, only read from it, and the guest
>>>>>>>> is not expected to create a cacheable mapping for it, so any
>>>>>>>> incoherency can be trivially solved by cache invalidation on the host
>>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only
>>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host)
>>>>>>>
>>>>>>> In case of the running guest, the host will also only read from the
>>>>>>> cached mapping.  Of course, at restore, the host will also write
>>>>>>> through a cached mapping, but shouldn't the latter case be solvable by
>>>>>>> having KVM clean the cache lines when faulting in any page?
>>>>>>>
>>>>>>
>>>>>> We are still talking about the contents of the framebuffer, right? In
>>>>>> that case, yes, afaict
>>>>>>
>>>>>
>>>>> I was talking about normal RAM actually... not sure if that changes anything?
>>>>>
>>>>
>>>> The main difference is that with a framebuffer BAR, it is pointless
>>>> for the guest to map it cacheable, given that the purpose of a
>>>> framebuffer is its side effects, which are not guaranteed to occur
>>>> timely if the mapping is cacheable.
>>>>
>>>> If we are talking about normal RAM, then why are we discussing it here
>>>> and not down there?
>>>>
>>>
>>> Because I was trying to figure out how the challenge of accessing the
>>> VGA framebuffer differs from the challenge of accessing guest RAM
>>> which may have been written by the guest with the MMU off.
>>>
>>> First approximation, they are extremely similar because the guest is
>>> writing uncached to memory, which the host now has to access via a
>>> cached mapping.
>>>
>>> But I'm guessing that a "clean+invalidate before read on the host"
>>> solution will result in terrible performance for a framebuffer and
>>> therefore isn't a good solution for that problem...
>>>
>>
>> That highly depends on where 'not working' resides on the performance
>> scale. Currently, VGA on KVM simply does not work at all, and so
>> working but slow would be a huge improvement over the current
>> situation.
>>
>> Also, the performance hit is caused by the fact that the data needs to
>> make a round trip to memory, and the invalidation (without cleaning)
>> performed by the host shouldn't make that much worse than it
>> fundamentally is to begin with.
>>
>
> True, but Marc pointed out (on IRC) that the cache maintenance
> instructions are broadcast across all CPUs and may therefore adversely
> affect the performance of your entire system by running an emulated
> VGA framebuffer on a subset of the host CPUs.  However, he also
> pointed out that the necessary cache maintenance can be done in EL0
> and we wouldn't even need a new ioctl or anything else from KVM or the
> kernel to do this.  I think we should measure the effect of this
> before dismissing it though.
>

If you could use dirty page tracking to only perform the cache
invalidation when the framebuffer memory has been updated, you can at
least limit the impact to cases where the framebuffer is actually
used, rather than sitting idle with a nice wallpaper image.


>> A paravirtualized framebuffer (as was proposed recently by Gerd I
>> think?) would solve this, since the guest can just map it as
>> cacheable.
>>
>
> Yes, but it seems to me that the problem of incoherent views of memory
> between the guest and QEMU keeps coming up, and we therefore need to
> have a fix for this in software.
>

Agreed.

>>>>
>>>>
>>>>>>>>
>>>>>>>> In the migration case, it is much more complicated, and I think
>>>>>>>> capturing the state of the VM in a way that takes incoherency between
>>>>>>>> caches and main memory into account is simply infeasible (i.e., the
>>>>>>>> act of recording the state of guest RAM via a cached mapping may evict
>>>>>>>> clean cachelines that are out of sync, and so it is impossible to
>>>>>>>> record both the cached *and* the delta with the uncached state)
>>>>>>>
>>>>>>> This may be an incredibly stupid question (and I may have asked it
>>>>>>> before), but why can't we clean+invalidate the guest page before
>>>>>>> reading it and thereby obtain a coherent view of a page?
>>>>>>>
>>>>>>
>>>>>> Because cleaning from the host will clobber whatever the guest wrote
>>>>>> directly to memory with the MMU off, if there is a dirty cacheline
>>>>>> shadowing that memory.
>>>>>
>>>>> If the host never wrote anything to that memory (it shouldn't mess
>>>>> with the guest's memory) there will only be clean cache lines (even if
>>>>> they contain content shadowing the memory) and cleaning them would be
>>>>> equivalent to an invalidate.  Am I misremembering how this works?
>>>>>
>>>>
>>>> Cleaning doesn't actually invalidate, but it should be a no-op for
>>>> clean cachelines.
>>>>
>>>>>> However, that same cacheline could be dirty
>>>>>> because the guest itself wrote to memory with the MMU on.
>>>>>
>>>>> Yes, but the guest would have no control over when such a cache line
>>>>> gets flushed to main memory by the hardware, and can have no
>>>>> reasonable expectation that the cache lines don't get cleaned behind
>>>>> its back.  The fact that a migration triggers this, is reasonable.  A
>>>>> guest that wants hand-off from main memory that it's accessing with the
>>>>> MMU off, must invalidate the appropriate cache lines or ensure they're
>>>>> clean.  There's very likely some subtle aspect to all of this that I'm
>>>>> forgetting.
>>>>>
>>>>
>>>> OK, so if the only way cachelines covering guest memory could be dirty
>>>> is after the guest wrote to that memory itself via a cacheable
>>>> mapping, I guess it would be reasonable to do clean+invalidate before
>>>> reading the memory. Then, the only way for the guest to lose anything
>>>> is in cases where it could not reasonably expect it to be retained
>>>> anyway.
>>>
>>> Right, that's what I'm thinking.
>>>
>>>>
>>>> However, that does leave a window, between the invalidate and the
>>>> read, where the guest could modify memory without it being visible to
>>>> the host.
>>>
>>> Is that a problem specific to the coherency challenge?  I thought this
>>> problem was already addressed by dirty page tracking, but there's likely
>>> some interaction with the cache maintenance that we'd have to figure
>>> out.
>>>
>>
>> I don't know how dirty page tracking works exactly, but if that can
>> track direct writes to memory as easily as cached writes, it would
>> probably cover this as well.
>
> I don't see why it shouldn't.  Guest pages get mapped as read-only in
> stage 2, and their memory attributes shouldn't be able to affect
> permission checks (one can only hope).
>
> Unless I'm missing something fundamental, I think we should add
> functionality in QEMU to clean+invalidate pages on aarch64 hosts and
> see if that solves both VGA and migration, and if so, what the
> potential performance impacts are.
>

Indeed.

Identifying the framebuffer region is easy for QEMU, so there we can
invalidate (without clean) selectively. However, although we'd
probably need to capture the state of the MMU bit anyway when handling
the stage 2 fault for the dirty page tracking, I don't think we can
generally infer whether a faulting access was made via an uncached
mapping while the MMU was on, and so this would require
clean+invalidate on all dirty pages when doing a migration.
Christoffer Dall Feb. 1, 2018, 9:59 a.m. UTC | #22
On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> On 1 February 2018 at 09:17, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
>> On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel
>> <ard.biesheuvel@linaro.org> wrote:
>>> On 31 January 2018 at 19:12, Christoffer Dall
>>> <christoffer.dall@linaro.org> wrote:
>>>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel
>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>> On 31 January 2018 at 17:39, Christoffer Dall
>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
>>>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>>>> On 31 January 2018 at 16:53, Christoffer Dall
>>>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
>>>>>>>> <ard.biesheuvel@linaro.org> wrote:
>>>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall
>>>>>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
>>>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote:
>>>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
>>>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn
>>>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
>>>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
>>>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
>>>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine.
>>>>>>>>>>> >>>>
>>>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during
>>>>>>>>>>> >>>> a VM boot?
>>>>>>>>>>> >>>
>>>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
>>>>>>>>>>> >>> tried to provoke that problem...
>>>>>>>>>>> >>
>>>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
>>>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest.
>>>>>>>>>>> >
>>>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
>>>>>>>>>>> > in the guest's system registers, and refuse migration if it's off...
>>>>>>>>>>> >
>>>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
>>>>>>>>>>> > of the stick about how thin the ice is in the period before the
>>>>>>>>>>> > guest turns on its MMU...)
>>>>>>>>>>>
>>>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
>>>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the
>>>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
>>>>>>>>>>> any given time if it needs to (and it is actually required on some HW if
>>>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
>>>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its
>>>>>>>>>> caches, right?)
>>>>>>>>>>
>>>>>>>>>>> You may have to pause the vcpus before starting the migration, or
>>>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that
>>>>>>>>>>> is trying to disable its MMU while the migration is on. This would
>>>>>>>>>>> involve trapping all the virtual memory related system registers, with
>>>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to
>>>>>>>>>>> migrate the memory, so maybe that's acceptable.
>>>>>>>>>>>
>>>>>>>>>> Is that even sufficient?
>>>>>>>>>>
>>>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest
>>>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
>>>>>>>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
>>>>>>>>>> incoherent, ...).
>>>>>>>>>>
>>>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its
>>>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
>>>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
>>>>>>>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
>>>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its
>>>>>>>>>> MMU.
>>>>>>>>>>
>>>>>>>>>> On the larger scale of things; this appears to me to be another case of
>>>>>>>>>> us really needing some way to coherently access memory between QEMU and
>>>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to
>>>>>>>>>> migration, we don't even know where it may have written data, and I'm
>>>>>>>>>> therefore not really sure what the 'proper' solution would be.
>>>>>>>>>>
>>>>>>>>>> (cc'ing Ard who has thought about this problem before in the context
>>>>>>>>>> of UEFI and VGA.)
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Actually, the VGA case is much simpler because the host is not
>>>>>>>>> expected to write to the framebuffer, only read from it, and the guest
>>>>>>>>> is not expected to create a cacheable mapping for it, so any
>>>>>>>>> incoherency can be trivially solved by cache invalidation on the host
>>>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only
>>>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host)
>>>>>>>>
>>>>>>>> In case of the running guest, the host will also only read from the
>>>>>>>> cached mapping.  Of course, at restore, the host will also write
>>>>>>>> through a cached mapping, but shouldn't the latter case be solvable by
>>>>>>>> having KVM clean the cache lines when faulting in any page?
>>>>>>>>
>>>>>>>
>>>>>>> We are still talking about the contents of the framebuffer, right? In
>>>>>>> that case, yes, afaict
>>>>>>>
>>>>>>
>>>>>> I was talking about normal RAM actually... not sure if that changes anything?
>>>>>>
>>>>>
>>>>> The main difference is that with a framebuffer BAR, it is pointless
>>>>> for the guest to map it cacheable, given that the purpose of a
>>>>> framebuffer is its side effects, which are not guaranteed to occur
>>>>> timely if the mapping is cacheable.
>>>>>
>>>>> If we are talking about normal RAM, then why are we discussing it here
>>>>> and not down there?
>>>>>
>>>>
>>>> Because I was trying to figure out how the challenge of accessing the
>>>> VGA framebuffer differs from the challenge of accessing guest RAM
>>>> which may have been written by the guest with the MMU off.
>>>>
>>>> First approximation, they are extremely similar because the guest is
>>>> writing uncached to memory, which the host now has to access via a
>>>> cached mapping.
>>>>
>>>> But I'm guessing that a "clean+invalidate before read on the host"
>>>> solution will result in terrible performance for a framebuffer and
>>>> therefore isn't a good solution for that problem...
>>>>
>>>
>>> That highly depends on where 'not working' resides on the performance
>>> scale. Currently, VGA on KVM simply does not work at all, and so
>>> working but slow would be a huge improvement over the current
>>> situation.
>>>
>>> Also, the performance hit is caused by the fact that the data needs to
>>> make a round trip to memory, and the invalidation (without cleaning)
>>> performed by the host shouldn't make that much worse than it
>>> fundamentally is to begin with.
>>>
>>
>> True, but Marc pointed out (on IRC) that the cache maintenance
>> instructions are broadcast across all CPUs and may therefore adversely
>> affect the performance of your entire system by running an emulated
>> VGA framebuffer on a subset of the host CPUs.  However, he also
>> pointed out that the necessary cache maintenance can be done in EL0
>> and we wouldn't even need a new ioctl or anything else from KVM or the
>> kernel to do this.  I think we should measure the effect of this
>> before dismissing it though.
>>
>
> If you could use dirty page tracking to only perform the cache
> invalidation when the framebuffer memory has been updated, you can at
> least limit the impact to cases where the framebuffer is actually
> used, rather than sitting idle with a nice wallpaper image.
>
>
>>> A paravirtualized framebuffer (as was proposed recently by Gerd I
>>> think?) would solve this, since the guest can just map it as
>>> cacheable.
>>>
>>
>> Yes, but it seems to me that the problem of incoherent views of memory
>> between the guest and QEMU keeps coming up, and we therefore need to
>> have a fix for this in software.
>>
>
> Agreed.
>
>>>>>
>>>>>
>>>>>>>>>
>>>>>>>>> In the migration case, it is much more complicated, and I think
>>>>>>>>> capturing the state of the VM in a way that takes incoherency between
>>>>>>>>> caches and main memory into account is simply infeasible (i.e., the
>>>>>>>>> act of recording the state of guest RAM via a cached mapping may evict
>>>>>>>>> clean cachelines that are out of sync, and so it is impossible to
>>>>>>>>> record both the cached *and* the delta with the uncached state)
>>>>>>>>
>>>>>>>> This may be an incredibly stupid question (and I may have asked it
>>>>>>>> before), but why can't we clean+invalidate the guest page before
>>>>>>>> reading it and thereby obtain a coherent view of a page?
>>>>>>>>
>>>>>>>
>>>>>>> Because cleaning from the host will clobber whatever the guest wrote
>>>>>>> directly to memory with the MMU off, if there is a dirty cacheline
>>>>>>> shadowing that memory.
>>>>>>
>>>>>> If the host never wrote anything to that memory (it shouldn't mess
>>>>>> with the guest's memory) there will only be clean cache lines (even if
>>>>>> they contain content shadowing the memory) and cleaning them would be
>>>>>> equivalent to an invalidate.  Am I misremembering how this works?
>>>>>>
>>>>>
>>>>> Cleaning doesn't actually invalidate, but it should be a no-op for
>>>>> clean cachelines.
>>>>>
>>>>>>> However, that same cacheline could be dirty
>>>>>>> because the guest itself wrote to memory with the MMU on.
>>>>>>
>>>>>> Yes, but the guest would have no control over when such a cache line
>>>>>> gets flushed to main memory by the hardware, and can have no
>>>>>> reasonable expectation that the cache lines don't get cleaned behind
>>>>>> its back.  The fact that a migration triggers this, is reasonable.  A
>>>>>> guest that wants hand-off from main memory that it's accessing with the
>>>>>> MMU off, must invalidate the appropriate cache lines or ensure they're
>>>>>> clean.  There's very likely some subtle aspect to all of this that I'm
>>>>>> forgetting.
>>>>>>
>>>>>
>>>>> OK, so if the only way cachelines covering guest memory could be dirty
>>>>> is after the guest wrote to that memory itself via a cacheable
>>>>> mapping, I guess it would be reasonable to do clean+invalidate before
>>>>> reading the memory. Then, the only way for the guest to lose anything
>>>>> is in cases where it could not reasonably expect it to be retained
>>>>> anyway.
>>>>
>>>> Right, that's what I'm thinking.
>>>>
>>>>>
>>>>> However, that does leave a window, between the invalidate and the
>>>>> read, where the guest could modify memory without it being visible to
>>>>> the host.
>>>>
>>>> Is that a problem specific to the coherency challenge?  I thought this
>>>> problem was already addressed by dirty page tracking, but there's likely
>>>> some interaction with the cache maintenance that we'd have to figure
>>>> out.
>>>>
>>>
>>> I don't know how dirty page tracking works exactly, but if that can
>>> track direct writes to memory as easily as cached writes, it would
>>> probably cover this as well.
>>
>> I don't see why it shouldn't.  Guest pages get mapped as read-only in
>> stage 2, and their memory attributes shouldn't be able to affect
>> permission checks (one can only hope).
>>
>> Unless I'm missing something fundamental, I think we should add
>> functionality in QEMU to clean+invalidate pages on aarch64 hosts and
>> see if that solves both VGA and migration, and if so, what the
>> potential performance impacts are.
>>
>
> Indeed.
>
> Identifying the framebuffer region is easy for QEMU, so there we can
> invalidate (without clean) selectively.

Do you think there's a material win of doing invalidate rather than
clean+invalidate or could we just aim for "access coherent" ==
"clean+invalidate" and we don't have to make any assumptions about how
the guest is accessing a particular piece of memory?

> However, although we'd
> probably need to capture the state of the MMU bit anyway when handling
> the stage 2 fault for the dirty page tracking, I don't think we can
> generally infer whether a faulting access was made via an uncached
> mapping while the MMU was on, and so this would require
> clean+invalidate on all dirty pages when doing a migration.

My gut feeling here is that there might be some very clever scheme to
optimize cache maintenance of the 'hot pages' when dirty page tracking
is enabled, but since the set of hot pages by virtue of migration
algorithms has to be small, the win is not likely to impact anything
real-life, and it's therefore not worth looking into.

Thanks,
-Christoffer
Ard Biesheuvel Feb. 1, 2018, 10:09 a.m. UTC | #23
On 1 February 2018 at 09:59, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> On 1 February 2018 at 09:17, Christoffer Dall
>> <christoffer.dall@linaro.org> wrote:
...
>>>
>>> Unless I'm missing something fundamental, I think we should add
>>> functionality in QEMU to clean+invalidate pages on aarch64 hosts and
>>> see if that solves both VGA and migration, and if so, what the
>>> potential performance impacts are.
>>>
>>
>> Indeed.
>>
>> Identifying the framebuffer region is easy for QEMU, so there we can
>> invalidate (without clean) selectively.
>
> Do you think there's a material win of doing invalidate rather than
> clean+invalidate or could we just aim for "access coherent" ==
> "clean+invalidate" and we don't have to make any assumptions about how
> the guest is accessing a particular piece of memory?
>

In general, invalidate should be cheaper than clean, given that just
discarding the data involves less work than writing it back to memory.
Whether that actually holds in current SMP designs, and whether it
still applies when the memory can be assumed to be clean in the first
place are things I can't really answer.

>> However, although we'd
>> probably need to capture the state of the MMU bit anyway when handling
>> the stage 2 fault for the dirty page tracking, I don't think we can
>> generally infer whether a faulting access was made via an uncached
>> mapping while the MMU was on, and so this would require
>> clean+invalidate on all dirty pages when doing a migration.
>
> My gut feeling here is that there might be some very clever scheme to
> optimize cache maintenance of the 'hot pages' when dirty page tracking
> is enabled, but since the set of hot pages by virtue of migration
> algorithms has to be small, the win is not likely to impact anything
> real-life, and it's therefore not worth looking into.
>

Sounds reasonable.
Andrew Jones Feb. 1, 2018, 10:42 a.m. UTC | #24
On Thu, Feb 01, 2018 at 10:59:54AM +0100, Christoffer Dall wrote:
> On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
> > On 1 February 2018 at 09:17, Christoffer Dall
> > <christoffer.dall@linaro.org> wrote:
> >> On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel
> >> <ard.biesheuvel@linaro.org> wrote:
> >>> On 31 January 2018 at 19:12, Christoffer Dall
> >>> <christoffer.dall@linaro.org> wrote:
> >>>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel
> >>>> <ard.biesheuvel@linaro.org> wrote:
> >>>>> On 31 January 2018 at 17:39, Christoffer Dall
> >>>>> <christoffer.dall@linaro.org> wrote:
> >>>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
> >>>>>> <ard.biesheuvel@linaro.org> wrote:
> >>>>>>> On 31 January 2018 at 16:53, Christoffer Dall
> >>>>>>> <christoffer.dall@linaro.org> wrote:
> >>>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
> >>>>>>>> <ard.biesheuvel@linaro.org> wrote:
> >>>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall
> >>>>>>>>> <christoffer.dall@linaro.org> wrote:
> >>>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
> >>>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote:
> >>>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> >>>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
> >>>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> >>>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
> >>>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn
> >>>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
> >>>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
> >>>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
> >>>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine.
> >>>>>>>>>>> >>>>
> >>>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during
> >>>>>>>>>>> >>>> a VM boot?
> >>>>>>>>>>> >>>
> >>>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
> >>>>>>>>>>> >>> tried to provoke that problem...
> >>>>>>>>>>> >>
> >>>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
> >>>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest.
> >>>>>>>>>>> >
> >>>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
> >>>>>>>>>>> > in the guest's system registers, and refuse migration if it's off...
> >>>>>>>>>>> >
> >>>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
> >>>>>>>>>>> > of the stick about how thin the ice is in the period before the
> >>>>>>>>>>> > guest turns on its MMU...)
> >>>>>>>>>>>
> >>>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
> >>>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the
> >>>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
> >>>>>>>>>>> any given time if it needs to (and it is actually required on some HW if
> >>>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
> >>>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
> >>>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its
> >>>>>>>>>> caches, right?)
> >>>>>>>>>>
> >>>>>>>>>>> You may have to pause the vcpus before starting the migration, or
> >>>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that
> >>>>>>>>>>> is trying to disable its MMU while the migration is on. This would
> >>>>>>>>>>> involve trapping all the virtual memory related system registers, with
> >>>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to
> >>>>>>>>>>> migrate the memory, so maybe that's acceptable.
> >>>>>>>>>>>
> >>>>>>>>>> Is that even sufficient?
> >>>>>>>>>>
> >>>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest
> >>>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
> >>>>>>>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
> >>>>>>>>>> incoherent, ...).
> >>>>>>>>>>
> >>>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its
> >>>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
> >>>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
> >>>>>>>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
> >>>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its
> >>>>>>>>>> MMU.
> >>>>>>>>>>
> >>>>>>>>>> On the larger scale of things; this appears to me to be another case of
> >>>>>>>>>> us really needing some way to coherently access memory between QEMU and
> >>>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to
> >>>>>>>>>> migration, we don't even know where it may have written data, and I'm
> >>>>>>>>>> therefore not really sure what the 'proper' solution would be.
> >>>>>>>>>>
> >>>>>>>>>> (cc'ing Ard who has thought about this problem before in the context
> >>>>>>>>>> of UEFI and VGA.)
> >>>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> Actually, the VGA case is much simpler because the host is not
> >>>>>>>>> expected to write to the framebuffer, only read from it, and the guest
> >>>>>>>>> is not expected to create a cacheable mapping for it, so any
> >>>>>>>>> incoherency can be trivially solved by cache invalidation on the host
> >>>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only
> >>>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host)
> >>>>>>>>
> >>>>>>>> In case of the running guest, the host will also only read from the
> >>>>>>>> cached mapping.  Of course, at restore, the host will also write
> >>>>>>>> through a cached mapping, but shouldn't the latter case be solvable by
> >>>>>>>> having KVM clean the cache lines when faulting in any page?
> >>>>>>>>
> >>>>>>>
> >>>>>>> We are still talking about the contents of the framebuffer, right? In
> >>>>>>> that case, yes, afaict
> >>>>>>>
> >>>>>>
> >>>>>> I was talking about normal RAM actually... not sure if that changes anything?
> >>>>>>
> >>>>>
> >>>>> The main difference is that with a framebuffer BAR, it is pointless
> >>>>> for the guest to map it cacheable, given that the purpose of a
> >>>>> framebuffer is its side effects, which are not guaranteed to occur
> >>>>> timely if the mapping is cacheable.
> >>>>>
> >>>>> If we are talking about normal RAM, then why are we discussing it here
> >>>>> and not down there?
> >>>>>
> >>>>
> >>>> Because I was trying to figure out how the challenge of accessing the
> >>>> VGA framebuffer differs from the challenge of accessing guest RAM
> >>>> which may have been written by the guest with the MMU off.
> >>>>
> >>>> First approximation, they are extremely similar because the guest is
> >>>> writing uncached to memory, which the host now has to access via a
> >>>> cached mapping.
> >>>>
> >>>> But I'm guessing that a "clean+invalidate before read on the host"
> >>>> solution will result in terrible performance for a framebuffer and
> >>>> therefore isn't a good solution for that problem...
> >>>>
> >>>
> >>> That highly depends on where 'not working' resides on the performance
> >>> scale. Currently, VGA on KVM simply does not work at all, and so
> >>> working but slow would be a huge improvement over the current
> >>> situation.
> >>>
> >>> Also, the performance hit is caused by the fact that the data needs to
> >>> make a round trip to memory, and the invalidation (without cleaning)
> >>> performed by the host shouldn't make that much worse than it
> >>> fundamentally is to begin with.
> >>>
> >>
> >> True, but Marc pointed out (on IRC) that the cache maintenance
> >> instructions are broadcast across all CPUs and may therefore adversely
> >> affect the performance of your entire system by running an emulated
> >> VGA framebuffer on a subset of the host CPUs.  However, he also
> >> pointed out that the necessary cache maintenance can be done in EL0

I had some memory of this not being the case, but I just checked and
indeed 'dc civac' will work from el0 with sctlr_el1.uci=1. Note, I
see that the gcc builtin __builtin___clear_cache() does not use that
instruction (it uses 'dc cvau'), so we'd need to implement a wrapper
ourselves (maybe that's why I thought it wasn't available...)

> >> and we wouldn't even need a new ioctl or anything else from KVM or the
> >> kernel to do this.  I think we should measure the effect of this
> >> before dismissing it though.
> >>
> >
> > If you could use dirty page tracking to only perform the cache
> > invalidation when the framebuffer memory has been updated, you can at
> > least limit the impact to cases where the framebuffer is actually
> > used, rather than sitting idle with a nice wallpaper image.

Yes, this is the exact approach I took back when I experimented with
this. I must have screwed up my PoC in some way (like using the gcc
builtin), because it wasn't working for me...

> >
> >
> >>> A paravirtualized framebuffer (as was proposed recently by Gerd I
> >>> think?) would solve this, since the guest can just map it as
> >>> cacheable.
> >>>
> >>
> >> Yes, but it seems to me that the problem of incoherent views of memory
> >> between the guest and QEMU keeps coming up, and we therefore need to
> >> have a fix for this in software.
> >>
> >
> > Agreed.
> >
> >>>>>
> >>>>>
> >>>>>>>>>
> >>>>>>>>> In the migration case, it is much more complicated, and I think
> >>>>>>>>> capturing the state of the VM in a way that takes incoherency between
> >>>>>>>>> caches and main memory into account is simply infeasible (i.e., the
> >>>>>>>>> act of recording the state of guest RAM via a cached mapping may evict
> >>>>>>>>> clean cachelines that are out of sync, and so it is impossible to
> >>>>>>>>> record both the cached *and* the delta with the uncached state)
> >>>>>>>>
> >>>>>>>> This may be an incredibly stupid question (and I may have asked it
> >>>>>>>> before), but why can't we clean+invalidate the guest page before
> >>>>>>>> reading it and thereby obtain a coherent view of a page?
> >>>>>>>>
> >>>>>>>
> >>>>>>> Because cleaning from the host will clobber whatever the guest wrote
> >>>>>>> directly to memory with the MMU off, if there is a dirty cacheline
> >>>>>>> shadowing that memory.
> >>>>>>
> >>>>>> If the host never wrote anything to that memory (it shouldn't mess
> >>>>>> with the guest's memory) there will only be clean cache lines (even if
> >>>>>> they contain content shadowing the memory) and cleaning them would be
> >>>>>> equivalent to an invalidate.  Am I misremembering how this works?
> >>>>>>
> >>>>>
> >>>>> Cleaning doesn't actually invalidate, but it should be a no-op for
> >>>>> clean cachelines.
> >>>>>
> >>>>>>> However, that same cacheline could be dirty
> >>>>>>> because the guest itself wrote to memory with the MMU on.
> >>>>>>
> >>>>>> Yes, but the guest would have no control over when such a cache line
> >>>>>> gets flushed to main memory by the hardware, and can have no
> >>>>>> reasonable expectation that the cache lines don't get cleaned behind
> >>>>>> its back.  The fact that a migration triggers this, is reasonable.  A
> >>>>>> guest that wants hand-off from main memory that it's accessing with the
> >>>>>> MMU off, must invalidate the appropriate cache lines or ensure they're
> >>>>>> clean.  There's very likely some subtle aspect to all of this that I'm
> >>>>>> forgetting.
> >>>>>>
> >>>>>
> >>>>> OK, so if the only way cachelines covering guest memory could be dirty
> >>>>> is after the guest wrote to that memory itself via a cacheable
> >>>>> mapping, I guess it would be reasonable to do clean+invalidate before
> >>>>> reading the memory. Then, the only way for the guest to lose anything
> >>>>> is in cases where it could not reasonably expect it to be retained
> >>>>> anyway.
> >>>>
> >>>> Right, that's what I'm thinking.
> >>>>
> >>>>>
> >>>>> However, that does leave a window, between the invalidate and the
> >>>>> read, where the guest could modify memory without it being visible to
> >>>>> the host.
> >>>>
> >>>> Is that a problem specific to the coherency challenge?  I thought this
> >>>> problem was already addressed by dirty page tracking, but there's likely
> >>>> some interaction with the cache maintenance that we'd have to figure
> >>>> out.
> >>>>
> >>>
> >>> I don't know how dirty page tracking works exactly, but if that can
> >>> track direct writes to memory as easily as cached writes, it would
> >>> probably cover this as well.
> >>
> >> I don't see why it shouldn't.  Guest pages get mapped as read-only in
> >> stage 2, and their memory attributes shouldn't be able to affect
> >> permission checks (one can only hope).
> >>
> >> Unless I'm missing something fundamental, I think we should add
> >> functionality in QEMU to clean+invalidate pages on aarch64 hosts and
> >> see if that solves both VGA and migration, and if so, what the
> >> potential performance impacts are.
> >>
> >
> > Indeed.
> >
> > Identifying the framebuffer region is easy for QEMU, so there we can
> > invalidate (without clean) selectively.
> 
> Do you think there's a material win of doing invalidate rather than
> clean+invalidate or could we just aim for "access coherent" ==
> "clean+invalidate" and we don't have to make any assumptions about how
> the guest is accessing a particular piece of memory?
> 
> > However, although we'd
> > probably need to capture the state of the MMU bit anyway when handling
> > the stage 2 fault for the dirty page tracking, I don't think we can
> > generally infer whether a faulting access was made via an uncached
> > mapping while the MMU was on, and so this would require
> > clean+invalidate on all dirty pages when doing a migration.
> 
> My gut feeling here is that there might be some very clever scheme to
> optimize cache maintenance of the 'hot pages' when dirty page tracking
> is enabled, but since the set of hot pages by virtue of migration
> algorithms has to be small, the win is not likely to impact anything
> real-life, and it's therefore not worth looking into.
>

Thanks,
drew
Christoffer Dall Feb. 1, 2018, 10:48 a.m. UTC | #25
On Thu, Feb 01, 2018 at 11:42:22AM +0100, Andrew Jones wrote:
> On Thu, Feb 01, 2018 at 10:59:54AM +0100, Christoffer Dall wrote:
> > On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel
> > <ard.biesheuvel@linaro.org> wrote:
> > > On 1 February 2018 at 09:17, Christoffer Dall
> > > <christoffer.dall@linaro.org> wrote:
> > >> On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel
> > >> <ard.biesheuvel@linaro.org> wrote:
> > >>> On 31 January 2018 at 19:12, Christoffer Dall
> > >>> <christoffer.dall@linaro.org> wrote:
> > >>>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel
> > >>>> <ard.biesheuvel@linaro.org> wrote:
> > >>>>> On 31 January 2018 at 17:39, Christoffer Dall
> > >>>>> <christoffer.dall@linaro.org> wrote:
> > >>>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel
> > >>>>>> <ard.biesheuvel@linaro.org> wrote:
> > >>>>>>> On 31 January 2018 at 16:53, Christoffer Dall
> > >>>>>>> <christoffer.dall@linaro.org> wrote:
> > >>>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel
> > >>>>>>>> <ard.biesheuvel@linaro.org> wrote:
> > >>>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall
> > >>>>>>>>> <christoffer.dall@linaro.org> wrote:
> > >>>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote:
> > >>>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote:
> > >>>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > >>>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote:
> > >>>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > >>>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote:
> > >>>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn
> > >>>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work
> > >>>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory
> > >>>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as
> > >>>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine.
> > >>>>>>>>>>> >>>>
> > >>>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during
> > >>>>>>>>>>> >>>> a VM boot?
> > >>>>>>>>>>> >>>
> > >>>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever
> > >>>>>>>>>>> >>> tried to provoke that problem...
> > >>>>>>>>>>> >>
> > >>>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail
> > >>>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest.
> > >>>>>>>>>>> >
> > >>>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit
> > >>>>>>>>>>> > in the guest's system registers, and refuse migration if it's off...
> > >>>>>>>>>>> >
> > >>>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end
> > >>>>>>>>>>> > of the stick about how thin the ice is in the period before the
> > >>>>>>>>>>> > guest turns on its MMU...)
> > >>>>>>>>>>>
> > >>>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU
> > >>>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the
> > >>>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at
> > >>>>>>>>>>> any given time if it needs to (and it is actually required on some HW if
> > >>>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that.
> > >>>>>>>>>>>
> > >>>>>>>>>>
> > >>>>>>>>>> (Clarification: KVM can detect this if it bothers to check the VCPU's
> > >>>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its
> > >>>>>>>>>> caches, right?)
> > >>>>>>>>>>
> > >>>>>>>>>>> You may have to pause the vcpus before starting the migration, or
> > >>>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that
> > >>>>>>>>>>> is trying to disable its MMU while the migration is on. This would
> > >>>>>>>>>>> involve trapping all the virtual memory related system registers, with
> > >>>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to
> > >>>>>>>>>>> migrate the memory, so maybe that's acceptable.
> > >>>>>>>>>>>
> > >>>>>>>>>> Is that even sufficient?
> > >>>>>>>>>>
> > >>>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest
> > >>>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads
> > >>>>>>>>>> guest ram.  QEMU's view of guest ram is now incorrect (stale,
> > >>>>>>>>>> incoherent, ...).
> > >>>>>>>>>>
> > >>>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its
> > >>>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this
> > >>>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU
> > >>>>>>>>>> appears to be dead?).  As a short-term 'fix' it's probably better to
> > >>>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its
> > >>>>>>>>>> MMU.
> > >>>>>>>>>>
> > >>>>>>>>>> On the larger scale of things; this appears to me to be another case of
> > >>>>>>>>>> us really needing some way to coherently access memory between QEMU and
> > >>>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to
> > >>>>>>>>>> migration, we don't even know where it may have written data, and I'm
> > >>>>>>>>>> therefore not really sure what the 'proper' solution would be.
> > >>>>>>>>>>
> > >>>>>>>>>> (cc'ing Ard who has thought about this problem before in the context
> > >>>>>>>>>> of UEFI and VGA.)
> > >>>>>>>>>>
> > >>>>>>>>>
> > >>>>>>>>> Actually, the VGA case is much simpler because the host is not
> > >>>>>>>>> expected to write to the framebuffer, only read from it, and the guest
> > >>>>>>>>> is not expected to create a cacheable mapping for it, so any
> > >>>>>>>>> incoherency can be trivially solved by cache invalidation on the host
> > >>>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only
> > >>>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host)
> > >>>>>>>>
> > >>>>>>>> In case of the running guest, the host will also only read from the
> > >>>>>>>> cached mapping.  Of course, at restore, the host will also write
> > >>>>>>>> through a cached mapping, but shouldn't the latter case be solvable by
> > >>>>>>>> having KVM clean the cache lines when faulting in any page?
> > >>>>>>>>
> > >>>>>>>
> > >>>>>>> We are still talking about the contents of the framebuffer, right? In
> > >>>>>>> that case, yes, afaict
> > >>>>>>>
> > >>>>>>
> > >>>>>> I was talking about normal RAM actually... not sure if that changes anything?
> > >>>>>>
> > >>>>>
> > >>>>> The main difference is that with a framebuffer BAR, it is pointless
> > >>>>> for the guest to map it cacheable, given that the purpose of a
> > >>>>> framebuffer is its side effects, which are not guaranteed to occur
> > >>>>> timely if the mapping is cacheable.
> > >>>>>
> > >>>>> If we are talking about normal RAM, then why are we discussing it here
> > >>>>> and not down there?
> > >>>>>
> > >>>>
> > >>>> Because I was trying to figure out how the challenge of accessing the
> > >>>> VGA framebuffer differs from the challenge of accessing guest RAM
> > >>>> which may have been written by the guest with the MMU off.
> > >>>>
> > >>>> First approximation, they are extremely similar because the guest is
> > >>>> writing uncached to memory, which the host now has to access via a
> > >>>> cached mapping.
> > >>>>
> > >>>> But I'm guessing that a "clean+invalidate before read on the host"
> > >>>> solution will result in terrible performance for a framebuffer and
> > >>>> therefore isn't a good solution for that problem...
> > >>>>
> > >>>
> > >>> That highly depends on where 'not working' resides on the performance
> > >>> scale. Currently, VGA on KVM simply does not work at all, and so
> > >>> working but slow would be a huge improvement over the current
> > >>> situation.
> > >>>
> > >>> Also, the performance hit is caused by the fact that the data needs to
> > >>> make a round trip to memory, and the invalidation (without cleaning)
> > >>> performed by the host shouldn't make that much worse than it
> > >>> fundamentally is to begin with.
> > >>>
> > >>
> > >> True, but Marc pointed out (on IRC) that the cache maintenance
> > >> instructions are broadcast across all CPUs and may therefore adversely
> > >> affect the performance of your entire system by running an emulated
> > >> VGA framebuffer on a subset of the host CPUs.  However, he also
> > >> pointed out that the necessary cache maintenance can be done in EL0
> 
> I had some memory of this not being the case, but I just checked and
> indeed 'dc civac' will work from el0 with sctlr_el1.uci=1. Note, I
> see that the gcc builtin __builtin___clear_cache() does not use that
> instruction (it uses 'dc cvau'), so we'd need to implement a wrapper
> ourselves (maybe that's why I thought it wasn't available...)
> 
> > >> and we wouldn't even need a new ioctl or anything else from KVM or the
> > >> kernel to do this.  I think we should measure the effect of this
> > >> before dismissing it though.
> > >>
> > >
> > > If you could use dirty page tracking to only perform the cache
> > > invalidation when the framebuffer memory has been updated, you can at
> > > least limit the impact to cases where the framebuffer is actually
> > > used, rather than sitting idle with a nice wallpaper image.
> 
> Yes, this is the exact approach I took back when I experimented with
> this. I must have screwed up my PoC in some way (like using the gcc
> builtin), because it wasn't working for me...
> 
Does that mean you have some code you feel like reviving and use to send
out an RFC based on? ;)

-Christoffer
Andrew Jones Feb. 1, 2018, 12:25 p.m. UTC | #26
On Thu, Feb 01, 2018 at 11:48:31AM +0100, Christoffer Dall wrote:
> On Thu, Feb 01, 2018 at 11:42:22AM +0100, Andrew Jones wrote:
> > On Thu, Feb 01, 2018 at 10:59:54AM +0100, Christoffer Dall wrote:
> > > On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel
> > > > If you could use dirty page tracking to only perform the cache
> > > > invalidation when the framebuffer memory has been updated, you can at
> > > > least limit the impact to cases where the framebuffer is actually
> > > > used, rather than sitting idle with a nice wallpaper image.
> > 
> > Yes, this is the exact approach I took back when I experimented with
> > this. I must have screwed up my PoC in some way (like using the gcc
> > builtin), because it wasn't working for me...
> > 
> Does that mean you have some code you feel like reviving and use to send
> out an RFC based on? ;)
>

I don't usually delete things, but I do do some sort of copy on use
from old homedirs to new ones from time to time, letting stuff that
gets really old completely drop at some point. I might be able to
find the old work, or just hack a quick and ugly version from my
memory to share - hopefully whatever I did to cause it to fail last
time has been forgotten :-)

drew
Christoffer Dall Feb. 1, 2018, 2:04 p.m. UTC | #27
On Thu, Feb 01, 2018 at 01:25:20PM +0100, Andrew Jones wrote:
> On Thu, Feb 01, 2018 at 11:48:31AM +0100, Christoffer Dall wrote:
> > On Thu, Feb 01, 2018 at 11:42:22AM +0100, Andrew Jones wrote:
> > > On Thu, Feb 01, 2018 at 10:59:54AM +0100, Christoffer Dall wrote:
> > > > On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel
> > > > > If you could use dirty page tracking to only perform the cache
> > > > > invalidation when the framebuffer memory has been updated, you can at
> > > > > least limit the impact to cases where the framebuffer is actually
> > > > > used, rather than sitting idle with a nice wallpaper image.
> > > 
> > > Yes, this is the exact approach I took back when I experimented with
> > > this. I must have screwed up my PoC in some way (like using the gcc
> > > builtin), because it wasn't working for me...
> > > 
> > Does that mean you have some code you feel like reviving and use to send
> > out an RFC based on? ;)
> >
> 
> I don't usually delete things, but I do do some sort of copy on use
> from old homedirs to new ones from time to time, letting stuff that
> gets really old completely drop at some point. I might be able to
> find the old work, or just hack a quick and ugly version from my
> memory to share - hopefully whatever I did to cause it to fail last
> time has been forgotten :-)
> 

If you have bandwidth to look at this that's great.  Otherwise let us
know, and I'll see if I can devote some time to it.

Thanks,
-Christoffer
Dr. David Alan Gilbert Feb. 1, 2018, 8:01 p.m. UTC | #28
* Wei Huang (wei@redhat.com) wrote:
> 
> 
> On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote:
> > * Wei Huang (wei@redhat.com) wrote:
> >> This patch adds the migration test support for aarch64. The test code,
> >> which implements the same functionality as x86, is compiled into a binary
> >> and booted as a kernel in qemu. Here are the design ideas:
> >>
> >>  * We choose this -kernel design because aarch64 QEMU doesn't provide a
> >>    built-in fw like x86 does. So instead of relying on a boot loader, we
> >>    use -kernel approach for aarch64.
> >>  * The serial output is sent to PL011 directly.
> >>  * The physical memory base for mach-virt machine is 0x40000000. We change
> >>    the start_address and end_address for aarch64.
> >>
> >> RFC->V1:
> >>  * aarch64 kernel blob is defined as an uint32_t array
> >>  * The test code is re-written to address a data caching issue under KVM.
> >>    Tests passed under both x86 and aarch64.
> >>  * Re-use init_bootfile_x86() for both x86 and aarch64
> >>  * Other minor fixes
> >>
> >> Note that the test code is as the following:
> >>
> >> .section .text
> >>
> >>         .globl  start
> >>         
> >> start:
> >>         /* disable MMU to use phys mem address */
> >>         mrs     x0, sctlr_el1
> >>         bic     x0, x0, #(1<<0)
> >>         msr     sctlr_el1, x0
> >>         isb
> >>
> >>         /* output char 'A' to PL011 */
> >>         mov     w4, #65
> >>         mov     x5, #0x9000000
> >>         strb    w4, [x5]
> >>
> >>         /* w6 keeps a counter so we can limit the output speed */
> >>         mov     w6, #0
> >>
> >>         /* phys mem base addr = 0x40000000 */
> >>         mov     x3, #(0x40000000 + 100 *1024*1024) /* traverse 1M-100M */
> >>         mov     x2, #(0x40000000 + 1 * 1024*1024)
> >>
> >>         /* clean up memory first */
> >>         mov     w1, #0  
> >> clean:
> >>         strb    w1, [x2]
> >>         add     x2, x2, #(4096)
> >>         cmp     x2, x3
> >>         ble     clean
> >>
> >>         /* main body */
> >> mainloop:
> >>         mov     x2, #(0x40000000 + 1 * 1024*1024)
> >>         
> >> innerloop:
> >>         /* clean cache because el2 might still cache guest data under KVM */
> >>         dc      civac, x2
> > 
> > Can you explain a bit more about that please;  if it's guest
> > visible across migration, doesn't that suggest we need the cache clear
> > in KVM or QEMU?
> 
> I think this is ARM specific. This is caused by the inconsistency
> between guest VM's data accesses and userspace's accesses (in
> check_guest_ram) at the destination:
> 
> 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So
> the data accesses from guest VM go directly into memory.
> 2) QEMU user space will use the cacheable version. So it is possible
> that data can come from cache, instead of RAM. The difference between
> (1) and (2) obviously can cause check_guest_ram() to fail sometimes.
> 
> x86's EPT has the capability to ignore guest-provided memory type. So it
> is possible to have consistent data views between guest and QEMU user-space.

I'll admit to not quite understanding where this thread ended up;
it seems to have fallen into other ARM consistency questions.
Other than that it looks fine to me, but see the separate patch I posted
yesterday that includes the x86 source.

Peter: What's your view on the right fix for the memory consistency?

Dave

> 
> > 
> > Dave
> > 
> >>         ldrb    w1, [x2]
> >>         add     w1, w1, #1
> >>         and     w1, w1, #(0xff)
> >>         strb    w1, [x2]
> >>
> >>         add     x2, x2, #(4096)
> >>         cmp     x2, x3
> >>         blt     innerloop
> >>
> >>         add     w6, w6, #1
> >>         and     w6, w6, #(0xff)
> >>         cmp     w6, #0
> >>         bne     mainloop
> >>         
> >>         /* output char 'B' to PL011 */
> >>         mov     w4, #66
> >>         mov     x5, #0x9000000
> >>         strb    w4, [x5]
> >>
> >>         bl      mainloop
> >>
> >> The code is compiled with the following commands:
> >>  > gcc -c -o fill.o fill.s
> >>  > gcc -O2 -o fill.elf -Wl,-T,/tmp/flat.lds,--build-id=none,-Ttext=40080000 \
> >>    -nostdlib fill.o
> >>  > objcopy -O binary fill.elf fill.flat
> >>  > truncate -c -s 144 ./fill.flat
> >>  > xxd -g 4 -c 24 -e fill.flat | awk '{print "0x"$2 ", " "0x"$3 ", " "0x"C$4 ", " "0x"C$5 ", " "0x"$6 ", " "0x"C$7 "," }'
> >>
> >> The linker file (borrowed from KVM unit test) is defined as:
> >>
> >> SECTIONS
> >> {
> >>     .text : { *(.init) *(.text) *(.text.*) }
> >>     . = ALIGN(64K);
> >>     etext = .;
> >>     .data : {
> >>         *(.data)
> >>     }
> >>     . = ALIGN(16);
> >>     .rodata : { *(.rodata) }
> >>     . = ALIGN(16);
> >>     .bss : { *(.bss) }
> >>     . = ALIGN(64K);
> >>     edata = .;
> >>     . += 64K;
> >>     . = ALIGN(64K);
> >>     /*
> >>      * stack depth is 16K for arm and PAGE_SIZE for arm64, see THREAD_SIZE
> >>      * sp must be 16 byte aligned for arm64, and 8 byte aligned for arm
> >>      * sp must always be strictly less than the true stacktop
> >>      */
> >>     stackptr = . - 16;
> >>     stacktop = .;
> >> }
> >>
> >> ENTRY(start)
> >>
> >> Signed-off-by: Wei Huang <wei@redhat.com>
> >> ---
> >>  tests/Makefile.include |  1 +
> >>  tests/migration-test.c | 37 +++++++++++++++++++++++++++++++------
> >>  2 files changed, 32 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/tests/Makefile.include b/tests/Makefile.include
> >> index b4bcc872f2..2a520e53ab 100644
> >> --- a/tests/Makefile.include
> >> +++ b/tests/Makefile.include
> >> @@ -357,6 +357,7 @@ check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF)
> >>  gcov-files-arm-y += hw/timer/arm_mptimer.c
> >>  
> >>  check-qtest-aarch64-y = tests/numa-test$(EXESUF)
> >> +check-qtest-aarch64-y += tests/migration-test$(EXESUF)
> >>  
> >>  check-qtest-microblazeel-y = $(check-qtest-microblaze-y)
> >>  
> >> diff --git a/tests/migration-test.c b/tests/migration-test.c
> >> index be598d3257..3237fe93b2 100644
> >> --- a/tests/migration-test.c
> >> +++ b/tests/migration-test.c
> >> @@ -22,8 +22,8 @@
> >>  
> >>  #define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */
> >>  
> >> -const unsigned start_address = 1024 * 1024;
> >> -const unsigned end_address = 100 * 1024 * 1024;
> >> +unsigned start_address = 1024 * 1024;
> >> +unsigned end_address = 100 * 1024 * 1024;
> >>  bool got_stop;
> >>  
> >>  #if defined(__linux__)
> >> @@ -79,7 +79,7 @@ static const char *tmpfs;
> >>  /* A simple PC boot sector that modifies memory (1-100MB) quickly
> >>   * outputing a 'B' every so often if it's still running.
> >>   */
> >> -unsigned char bootsect[] = {
> >> +unsigned char x86_bootsect[] = {
> >>    0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00,
> >>    0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00,
> >>    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02,
> >> @@ -125,11 +125,20 @@ unsigned char bootsect[] = {
> >>    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa
> >>  };
> >>  
> >> -static void init_bootfile_x86(const char *bootpath)
> >> +uint32_t aarch64_kernel[] = {
> >> +    0xd5381000, 0x927ff800, 0xd5181000, 0xd5033fdf, 0x52800824, 0xd2a12005,
> >> +    0x390000a4, 0x52800006, 0xd2a8c803, 0xd2a80202, 0x52800001, 0x39000041,
> >> +    0x91400442, 0xeb03005f, 0x54ffffad, 0xd2a80202, 0xd50b7e22, 0x39400041,
> >> +    0x11000421, 0x12001c21, 0x39000041, 0x91400442, 0xeb03005f, 0x54ffff2b,
> >> +    0x110004c6, 0x12001cc6, 0x710000df, 0x54fffe81, 0x52800844, 0xd2a12005,
> >> +    0x390000a4, 0x97fffff0, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
> >> +};
> >> +
> >> +static void init_bootfile(const char *bootpath, void *content)
> >>  {
> >>      FILE *bootfile = fopen(bootpath, "wb");
> >>  
> >> -    g_assert_cmpint(fwrite(bootsect, 512, 1, bootfile), ==, 1);
> >> +    g_assert_cmpint(fwrite(content, 512, 1, bootfile), ==, 1);
> >>      fclose(bootfile);
> >>  }
> >>  
> >> @@ -442,7 +451,7 @@ static void test_migrate_start(QTestState **from, QTestState **to,
> >>      got_stop = false;
> >>  
> >>      if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
> >> -        init_bootfile_x86(bootpath);
> >> +        init_bootfile(bootpath, x86_bootsect);
> >>          cmd_src = g_strdup_printf("-machine accel=kvm:tcg -m 150M"
> >>                                    " -name pcsource,debug-threads=on"
> >>                                    " -serial file:%s/src_serial"
> >> @@ -470,6 +479,22 @@ static void test_migrate_start(QTestState **from, QTestState **to,
> >>                                    " -serial file:%s/dest_serial"
> >>                                    " -incoming %s",
> >>                                    accel, tmpfs, uri);
> >> +    } else if (strcmp(arch, "aarch64") == 0) {
> >> +        init_bootfile(bootpath, aarch64_kernel);
> >> +        cmd_src = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M "
> >> +                                  "-name vmsource,debug-threads=on -cpu host "
> >> +                                  "-serial file:%s/src_serial "
> >> +                                  "-kernel %s ",
> >> +                                  tmpfs, bootpath);
> >> +        cmd_dst = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M "
> >> +                                  "-name vmdest,debug-threads=on -cpu host "
> >> +                                  "-serial file:%s/dest_serial "
> >> +                                  "-kernel %s "
> >> +                                  "-incoming %s ",
> >> +                                  tmpfs, bootpath, uri);
> >> +        /* aarch64 virt machine physical mem started from 0x40000000 */
> >> +        start_address += 0x40000000;
> >> +        end_address += 0x40000000;
> >>      } else {
> >>          g_assert_not_reached();
> >>      }
> >> -- 
> >> 2.14.3
> >>
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox

Patch

diff --git a/tests/Makefile.include b/tests/Makefile.include
index b4bcc872f2..2a520e53ab 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -357,6 +357,7 @@  check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF)
 gcov-files-arm-y += hw/timer/arm_mptimer.c
 
 check-qtest-aarch64-y = tests/numa-test$(EXESUF)
+check-qtest-aarch64-y += tests/migration-test$(EXESUF)
 
 check-qtest-microblazeel-y = $(check-qtest-microblaze-y)
 
diff --git a/tests/migration-test.c b/tests/migration-test.c
index be598d3257..3237fe93b2 100644
--- a/tests/migration-test.c
+++ b/tests/migration-test.c
@@ -22,8 +22,8 @@ 
 
 #define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */
 
-const unsigned start_address = 1024 * 1024;
-const unsigned end_address = 100 * 1024 * 1024;
+unsigned start_address = 1024 * 1024;
+unsigned end_address = 100 * 1024 * 1024;
 bool got_stop;
 
 #if defined(__linux__)
@@ -79,7 +79,7 @@  static const char *tmpfs;
 /* A simple PC boot sector that modifies memory (1-100MB) quickly
  * outputing a 'B' every so often if it's still running.
  */
-unsigned char bootsect[] = {
+unsigned char x86_bootsect[] = {
   0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00,
   0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02,
@@ -125,11 +125,20 @@  unsigned char bootsect[] = {
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa
 };
 
-static void init_bootfile_x86(const char *bootpath)
+uint32_t aarch64_kernel[] = {
+    0xd5381000, 0x927ff800, 0xd5181000, 0xd5033fdf, 0x52800824, 0xd2a12005,
+    0x390000a4, 0x52800006, 0xd2a8c803, 0xd2a80202, 0x52800001, 0x39000041,
+    0x91400442, 0xeb03005f, 0x54ffffad, 0xd2a80202, 0xd50b7e22, 0x39400041,
+    0x11000421, 0x12001c21, 0x39000041, 0x91400442, 0xeb03005f, 0x54ffff2b,
+    0x110004c6, 0x12001cc6, 0x710000df, 0x54fffe81, 0x52800844, 0xd2a12005,
+    0x390000a4, 0x97fffff0, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+};
+
+static void init_bootfile(const char *bootpath, void *content)
 {
     FILE *bootfile = fopen(bootpath, "wb");
 
-    g_assert_cmpint(fwrite(bootsect, 512, 1, bootfile), ==, 1);
+    g_assert_cmpint(fwrite(content, 512, 1, bootfile), ==, 1);
     fclose(bootfile);
 }
 
@@ -442,7 +451,7 @@  static void test_migrate_start(QTestState **from, QTestState **to,
     got_stop = false;
 
     if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
-        init_bootfile_x86(bootpath);
+        init_bootfile(bootpath, x86_bootsect);
         cmd_src = g_strdup_printf("-machine accel=kvm:tcg -m 150M"
                                   " -name pcsource,debug-threads=on"
                                   " -serial file:%s/src_serial"
@@ -470,6 +479,22 @@  static void test_migrate_start(QTestState **from, QTestState **to,
                                   " -serial file:%s/dest_serial"
                                   " -incoming %s",
                                   accel, tmpfs, uri);
+    } else if (strcmp(arch, "aarch64") == 0) {
+        init_bootfile(bootpath, aarch64_kernel);
+        cmd_src = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M "
+                                  "-name vmsource,debug-threads=on -cpu host "
+                                  "-serial file:%s/src_serial "
+                                  "-kernel %s ",
+                                  tmpfs, bootpath);
+        cmd_dst = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M "
+                                  "-name vmdest,debug-threads=on -cpu host "
+                                  "-serial file:%s/dest_serial "
+                                  "-kernel %s "
+                                  "-incoming %s ",
+                                  tmpfs, bootpath, uri);
+        /* aarch64 virt machine physical mem started from 0x40000000 */
+        start_address += 0x40000000;
+        end_address += 0x40000000;
     } else {
         g_assert_not_reached();
     }