Message ID | 20180124212246.2352-1-wei@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
* Wei Huang (wei@redhat.com) wrote: > This patch adds the migration test support for aarch64. The test code, > which implements the same functionality as x86, is compiled into a binary > and booted as a kernel in qemu. Here are the design ideas: > > * We choose this -kernel design because aarch64 QEMU doesn't provide a > built-in fw like x86 does. So instead of relying on a boot loader, we > use -kernel approach for aarch64. > * The serial output is sent to PL011 directly. > * The physical memory base for mach-virt machine is 0x40000000. We change > the start_address and end_address for aarch64. > > RFC->V1: > * aarch64 kernel blob is defined as an uint32_t array > * The test code is re-written to address a data caching issue under KVM. > Tests passed under both x86 and aarch64. > * Re-use init_bootfile_x86() for both x86 and aarch64 > * Other minor fixes > > Note that the test code is as the following: > > .section .text > > .globl start > > start: > /* disable MMU to use phys mem address */ > mrs x0, sctlr_el1 > bic x0, x0, #(1<<0) > msr sctlr_el1, x0 > isb > > /* output char 'A' to PL011 */ > mov w4, #65 > mov x5, #0x9000000 > strb w4, [x5] > > /* w6 keeps a counter so we can limit the output speed */ > mov w6, #0 > > /* phys mem base addr = 0x40000000 */ > mov x3, #(0x40000000 + 100 *1024*1024) /* traverse 1M-100M */ > mov x2, #(0x40000000 + 1 * 1024*1024) > > /* clean up memory first */ > mov w1, #0 > clean: > strb w1, [x2] > add x2, x2, #(4096) > cmp x2, x3 > ble clean > > /* main body */ > mainloop: > mov x2, #(0x40000000 + 1 * 1024*1024) > > innerloop: > /* clean cache because el2 might still cache guest data under KVM */ > dc civac, x2 Can you explain a bit more about that please; if it's guest visible acorss migration, doesn't that suggest we need the cache clear in KVM or QEMU? Dave > ldrb w1, [x2] > add w1, w1, #1 > and w1, w1, #(0xff) > strb w1, [x2] > > add x2, x2, #(4096) > cmp x2, x3 > blt innerloop > > add w6, w6, #1 > and w6, w6, #(0xff) > cmp w6, #0 > bne mainloop > > /* output char 'B' to PL011 */ > mov w4, #66 > mov x5, #0x9000000 > strb w4, [x5] > > bl mainloop > > The code is compiled with the following commands: > > gcc -c -o fill.o fill.s > > gcc -O2 -o fill.elf -Wl,-T,/tmp/flat.lds,--build-id=none,-Ttext=40080000 \ > -nostdlib fill.o > > objcopy -O binary fill.elf fill.flat > > truncate -c -s 144 ./fill.flat > > xxd -g 4 -c 24 -e fill.flat | awk '{print "0x"$2 ", " "0x"$3 ", " "0x"C$4 ", " "0x"C$5 ", " "0x"$6 ", " "0x"C$7 "," }' > > The linker file (borrowed from KVM unit test) is defined as: > > SECTIONS > { > .text : { *(.init) *(.text) *(.text.*) } > . = ALIGN(64K); > etext = .; > .data : { > *(.data) > } > . = ALIGN(16); > .rodata : { *(.rodata) } > . = ALIGN(16); > .bss : { *(.bss) } > . = ALIGN(64K); > edata = .; > . += 64K; > . = ALIGN(64K); > /* > * stack depth is 16K for arm and PAGE_SIZE for arm64, see THREAD_SIZE > * sp must be 16 byte aligned for arm64, and 8 byte aligned for arm > * sp must always be strictly less than the true stacktop > */ > stackptr = . - 16; > stacktop = .; > } > > ENTRY(start) > > Signed-off-by: Wei Huang <wei@redhat.com> > --- > tests/Makefile.include | 1 + > tests/migration-test.c | 37 +++++++++++++++++++++++++++++++------ > 2 files changed, 32 insertions(+), 6 deletions(-) > > diff --git a/tests/Makefile.include b/tests/Makefile.include > index b4bcc872f2..2a520e53ab 100644 > --- a/tests/Makefile.include > +++ b/tests/Makefile.include > @@ -357,6 +357,7 @@ check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF) > gcov-files-arm-y += hw/timer/arm_mptimer.c > > check-qtest-aarch64-y = tests/numa-test$(EXESUF) > +check-qtest-aarch64-y += tests/migration-test$(EXESUF) > > check-qtest-microblazeel-y = $(check-qtest-microblaze-y) > > diff --git a/tests/migration-test.c b/tests/migration-test.c > index be598d3257..3237fe93b2 100644 > --- a/tests/migration-test.c > +++ b/tests/migration-test.c > @@ -22,8 +22,8 @@ > > #define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */ > > -const unsigned start_address = 1024 * 1024; > -const unsigned end_address = 100 * 1024 * 1024; > +unsigned start_address = 1024 * 1024; > +unsigned end_address = 100 * 1024 * 1024; > bool got_stop; > > #if defined(__linux__) > @@ -79,7 +79,7 @@ static const char *tmpfs; > /* A simple PC boot sector that modifies memory (1-100MB) quickly > * outputing a 'B' every so often if it's still running. > */ > -unsigned char bootsect[] = { > +unsigned char x86_bootsect[] = { > 0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00, > 0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00, > 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02, > @@ -125,11 +125,20 @@ unsigned char bootsect[] = { > 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa > }; > > -static void init_bootfile_x86(const char *bootpath) > +uint32_t aarch64_kernel[] = { > + 0xd5381000, 0x927ff800, 0xd5181000, 0xd5033fdf, 0x52800824, 0xd2a12005, > + 0x390000a4, 0x52800006, 0xd2a8c803, 0xd2a80202, 0x52800001, 0x39000041, > + 0x91400442, 0xeb03005f, 0x54ffffad, 0xd2a80202, 0xd50b7e22, 0x39400041, > + 0x11000421, 0x12001c21, 0x39000041, 0x91400442, 0xeb03005f, 0x54ffff2b, > + 0x110004c6, 0x12001cc6, 0x710000df, 0x54fffe81, 0x52800844, 0xd2a12005, > + 0x390000a4, 0x97fffff0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, > +}; > + > +static void init_bootfile(const char *bootpath, void *content) > { > FILE *bootfile = fopen(bootpath, "wb"); > > - g_assert_cmpint(fwrite(bootsect, 512, 1, bootfile), ==, 1); > + g_assert_cmpint(fwrite(content, 512, 1, bootfile), ==, 1); > fclose(bootfile); > } > > @@ -442,7 +451,7 @@ static void test_migrate_start(QTestState **from, QTestState **to, > got_stop = false; > > if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) { > - init_bootfile_x86(bootpath); > + init_bootfile(bootpath, x86_bootsect); > cmd_src = g_strdup_printf("-machine accel=kvm:tcg -m 150M" > " -name pcsource,debug-threads=on" > " -serial file:%s/src_serial" > @@ -470,6 +479,22 @@ static void test_migrate_start(QTestState **from, QTestState **to, > " -serial file:%s/dest_serial" > " -incoming %s", > accel, tmpfs, uri); > + } else if (strcmp(arch, "aarch64") == 0) { > + init_bootfile(bootpath, aarch64_kernel); > + cmd_src = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M " > + "-name vmsource,debug-threads=on -cpu host " > + "-serial file:%s/src_serial " > + "-kernel %s ", > + tmpfs, bootpath); > + cmd_dst = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M " > + "-name vmdest,debug-threads=on -cpu host " > + "-serial file:%s/dest_serial " > + "-kernel %s " > + "-incoming %s ", > + tmpfs, bootpath, uri); > + /* aarch64 virt machine physical mem started from 0x40000000 */ > + start_address += 0x40000000; > + end_address += 0x40000000; > } else { > g_assert_not_reached(); > } > -- > 2.14.3 > -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote: > * Wei Huang (wei@redhat.com) wrote: >> This patch adds the migration test support for aarch64. The test code, >> which implements the same functionality as x86, is compiled into a binary >> and booted as a kernel in qemu. Here are the design ideas: >> >> * We choose this -kernel design because aarch64 QEMU doesn't provide a >> built-in fw like x86 does. So instead of relying on a boot loader, we >> use -kernel approach for aarch64. >> * The serial output is sent to PL011 directly. >> * The physical memory base for mach-virt machine is 0x40000000. We change >> the start_address and end_address for aarch64. >> >> RFC->V1: >> * aarch64 kernel blob is defined as an uint32_t array >> * The test code is re-written to address a data caching issue under KVM. >> Tests passed under both x86 and aarch64. >> * Re-use init_bootfile_x86() for both x86 and aarch64 >> * Other minor fixes >> >> Note that the test code is as the following: >> >> .section .text >> >> .globl start >> >> start: >> /* disable MMU to use phys mem address */ >> mrs x0, sctlr_el1 >> bic x0, x0, #(1<<0) >> msr sctlr_el1, x0 >> isb >> >> /* output char 'A' to PL011 */ >> mov w4, #65 >> mov x5, #0x9000000 >> strb w4, [x5] >> >> /* w6 keeps a counter so we can limit the output speed */ >> mov w6, #0 >> >> /* phys mem base addr = 0x40000000 */ >> mov x3, #(0x40000000 + 100 *1024*1024) /* traverse 1M-100M */ >> mov x2, #(0x40000000 + 1 * 1024*1024) >> >> /* clean up memory first */ >> mov w1, #0 >> clean: >> strb w1, [x2] >> add x2, x2, #(4096) >> cmp x2, x3 >> ble clean >> >> /* main body */ >> mainloop: >> mov x2, #(0x40000000 + 1 * 1024*1024) >> >> innerloop: >> /* clean cache because el2 might still cache guest data under KVM */ >> dc civac, x2 > > Can you explain a bit more about that please; if it's guest > visible acorss migration, doesn't that suggest we need the cache clear > in KVM or QEMU? I think this is ARM specific. This is caused by the inconsistency between guest VM's data accesses and userspace's accesses (in check_guest_ram) at the destination: 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So the data accesses from guest VM go directly into memory. 2) QEMU user space will use the cacheable version. So it is possible that data can come from cache, instead of RAM. The difference between (1) and (2) obviously can cause check_guest_ram() to fail sometimes. x86's EPT has the capability to ignore guest-provided memory type. So it is possible to have consistent data views between guest and QEMU user-space. > > Dave > >> ldrb w1, [x2] >> add w1, w1, #1 >> and w1, w1, #(0xff) >> strb w1, [x2] >> >> add x2, x2, #(4096) >> cmp x2, x3 >> blt innerloop >> >> add w6, w6, #1 >> and w6, w6, #(0xff) >> cmp w6, #0 >> bne mainloop >> >> /* output char 'B' to PL011 */ >> mov w4, #66 >> mov x5, #0x9000000 >> strb w4, [x5] >> >> bl mainloop >> >> The code is compiled with the following commands: >> > gcc -c -o fill.o fill.s >> > gcc -O2 -o fill.elf -Wl,-T,/tmp/flat.lds,--build-id=none,-Ttext=40080000 \ >> -nostdlib fill.o >> > objcopy -O binary fill.elf fill.flat >> > truncate -c -s 144 ./fill.flat >> > xxd -g 4 -c 24 -e fill.flat | awk '{print "0x"$2 ", " "0x"$3 ", " "0x"C$4 ", " "0x"C$5 ", " "0x"$6 ", " "0x"C$7 "," }' >> >> The linker file (borrowed from KVM unit test) is defined as: >> >> SECTIONS >> { >> .text : { *(.init) *(.text) *(.text.*) } >> . = ALIGN(64K); >> etext = .; >> .data : { >> *(.data) >> } >> . = ALIGN(16); >> .rodata : { *(.rodata) } >> . = ALIGN(16); >> .bss : { *(.bss) } >> . = ALIGN(64K); >> edata = .; >> . += 64K; >> . = ALIGN(64K); >> /* >> * stack depth is 16K for arm and PAGE_SIZE for arm64, see THREAD_SIZE >> * sp must be 16 byte aligned for arm64, and 8 byte aligned for arm >> * sp must always be strictly less than the true stacktop >> */ >> stackptr = . - 16; >> stacktop = .; >> } >> >> ENTRY(start) >> >> Signed-off-by: Wei Huang <wei@redhat.com> >> --- >> tests/Makefile.include | 1 + >> tests/migration-test.c | 37 +++++++++++++++++++++++++++++++------ >> 2 files changed, 32 insertions(+), 6 deletions(-) >> >> diff --git a/tests/Makefile.include b/tests/Makefile.include >> index b4bcc872f2..2a520e53ab 100644 >> --- a/tests/Makefile.include >> +++ b/tests/Makefile.include >> @@ -357,6 +357,7 @@ check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF) >> gcov-files-arm-y += hw/timer/arm_mptimer.c >> >> check-qtest-aarch64-y = tests/numa-test$(EXESUF) >> +check-qtest-aarch64-y += tests/migration-test$(EXESUF) >> >> check-qtest-microblazeel-y = $(check-qtest-microblaze-y) >> >> diff --git a/tests/migration-test.c b/tests/migration-test.c >> index be598d3257..3237fe93b2 100644 >> --- a/tests/migration-test.c >> +++ b/tests/migration-test.c >> @@ -22,8 +22,8 @@ >> >> #define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */ >> >> -const unsigned start_address = 1024 * 1024; >> -const unsigned end_address = 100 * 1024 * 1024; >> +unsigned start_address = 1024 * 1024; >> +unsigned end_address = 100 * 1024 * 1024; >> bool got_stop; >> >> #if defined(__linux__) >> @@ -79,7 +79,7 @@ static const char *tmpfs; >> /* A simple PC boot sector that modifies memory (1-100MB) quickly >> * outputing a 'B' every so often if it's still running. >> */ >> -unsigned char bootsect[] = { >> +unsigned char x86_bootsect[] = { >> 0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00, >> 0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00, >> 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02, >> @@ -125,11 +125,20 @@ unsigned char bootsect[] = { >> 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa >> }; >> >> -static void init_bootfile_x86(const char *bootpath) >> +uint32_t aarch64_kernel[] = { >> + 0xd5381000, 0x927ff800, 0xd5181000, 0xd5033fdf, 0x52800824, 0xd2a12005, >> + 0x390000a4, 0x52800006, 0xd2a8c803, 0xd2a80202, 0x52800001, 0x39000041, >> + 0x91400442, 0xeb03005f, 0x54ffffad, 0xd2a80202, 0xd50b7e22, 0x39400041, >> + 0x11000421, 0x12001c21, 0x39000041, 0x91400442, 0xeb03005f, 0x54ffff2b, >> + 0x110004c6, 0x12001cc6, 0x710000df, 0x54fffe81, 0x52800844, 0xd2a12005, >> + 0x390000a4, 0x97fffff0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, >> +}; >> + >> +static void init_bootfile(const char *bootpath, void *content) >> { >> FILE *bootfile = fopen(bootpath, "wb"); >> >> - g_assert_cmpint(fwrite(bootsect, 512, 1, bootfile), ==, 1); >> + g_assert_cmpint(fwrite(content, 512, 1, bootfile), ==, 1); >> fclose(bootfile); >> } >> >> @@ -442,7 +451,7 @@ static void test_migrate_start(QTestState **from, QTestState **to, >> got_stop = false; >> >> if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) { >> - init_bootfile_x86(bootpath); >> + init_bootfile(bootpath, x86_bootsect); >> cmd_src = g_strdup_printf("-machine accel=kvm:tcg -m 150M" >> " -name pcsource,debug-threads=on" >> " -serial file:%s/src_serial" >> @@ -470,6 +479,22 @@ static void test_migrate_start(QTestState **from, QTestState **to, >> " -serial file:%s/dest_serial" >> " -incoming %s", >> accel, tmpfs, uri); >> + } else if (strcmp(arch, "aarch64") == 0) { >> + init_bootfile(bootpath, aarch64_kernel); >> + cmd_src = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M " >> + "-name vmsource,debug-threads=on -cpu host " >> + "-serial file:%s/src_serial " >> + "-kernel %s ", >> + tmpfs, bootpath); >> + cmd_dst = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M " >> + "-name vmdest,debug-threads=on -cpu host " >> + "-serial file:%s/dest_serial " >> + "-kernel %s " >> + "-incoming %s ", >> + tmpfs, bootpath, uri); >> + /* aarch64 virt machine physical mem started from 0x40000000 */ >> + start_address += 0x40000000; >> + end_address += 0x40000000; >> } else { >> g_assert_not_reached(); >> } >> -- >> 2.14.3 >> > -- > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK >
On 26 January 2018 at 15:47, Wei Huang <wei@redhat.com> wrote: > > > On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote: >> * Wei Huang (wei@redhat.com) wrote: >>> innerloop: >>> /* clean cache because el2 might still cache guest data under KVM */ >>> dc civac, x2 >> >> Can you explain a bit more about that please; if it's guest >> visible acorss migration, doesn't that suggest we need the cache clear >> in KVM or QEMU? > > I think this is ARM specific. This is caused by the inconsistency > between guest VM's data accesses and userspace's accesses (in > check_guest_ram) at the destination: > > 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So > the data accesses from guest VM go directly into memory. > 2) QEMU user space will use the cacheable version. So it is possible > that data can come from cache, instead of RAM. The difference between > (1) and (2) obviously can cause check_guest_ram() to fail sometimes. I think the correct fix here is that your test code should turn its MMU on. Trying to treat guest RAM as uncacheable doesn't work for Arm KVM guests (for the same reason that VGA device video memory doesn't work). If it's RAM your guest has to arrange to map it as Normal Cacheable, and then everything should work fine. thanks -- PMM
On 01/26/2018 10:39 AM, Peter Maydell wrote: > On 26 January 2018 at 15:47, Wei Huang <wei@redhat.com> wrote: >> >> >> On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote: >>> * Wei Huang (wei@redhat.com) wrote: >>>> innerloop: >>>> /* clean cache because el2 might still cache guest data under KVM */ >>>> dc civac, x2 >>> >>> Can you explain a bit more about that please; if it's guest >>> visible acorss migration, doesn't that suggest we need the cache clear >>> in KVM or QEMU? >> >> I think this is ARM specific. This is caused by the inconsistency >> between guest VM's data accesses and userspace's accesses (in >> check_guest_ram) at the destination: >> >> 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So >> the data accesses from guest VM go directly into memory. >> 2) QEMU user space will use the cacheable version. So it is possible >> that data can come from cache, instead of RAM. The difference between >> (1) and (2) obviously can cause check_guest_ram() to fail sometimes. > > I think the correct fix here is that your test code should turn > its MMU on. Trying to treat guest RAM as uncacheable doesn't work I did try MMU-on while debugging this problem. It was a migration unit test written on top of kvm-unit-tests and I forced QEMU's tests/migration-test.c to use this .flat file as -kernel parameter. I didn't see any memory check failures using this approach. So I can confirm your suggestion. But turning MMU on means a much larger binary blob. I know you don't like the existing migration-test.c approach. Building a more complicated test case and embedding it in migration-test.c code will make the situation worse. Because of this, I chose the current version: small and manageable (even with one extra cache flush instruction). > for Arm KVM guests (for the same reason that VGA device video memory > doesn't work). If it's RAM your guest has to arrange to map it as > Normal Cacheable, and then everything should work fine. > > thanks > -- PMM >
* Peter Maydell (peter.maydell@linaro.org) wrote: > On 26 January 2018 at 15:47, Wei Huang <wei@redhat.com> wrote: > > > > > > On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote: > >> * Wei Huang (wei@redhat.com) wrote: > >>> innerloop: > >>> /* clean cache because el2 might still cache guest data under KVM */ > >>> dc civac, x2 > >> > >> Can you explain a bit more about that please; if it's guest > >> visible acorss migration, doesn't that suggest we need the cache clear > >> in KVM or QEMU? > > > > I think this is ARM specific. This is caused by the inconsistency > > between guest VM's data accesses and userspace's accesses (in > > check_guest_ram) at the destination: > > > > 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So > > the data accesses from guest VM go directly into memory. > > 2) QEMU user space will use the cacheable version. So it is possible > > that data can come from cache, instead of RAM. The difference between > > (1) and (2) obviously can cause check_guest_ram() to fail sometimes. > > I think the correct fix here is that your test code should turn > its MMU on. Trying to treat guest RAM as uncacheable doesn't work > for Arm KVM guests (for the same reason that VGA device video memory > doesn't work). If it's RAM your guest has to arrange to map it as > Normal Cacheable, and then everything should work fine. Does this cause problems with migrating at just the wrong point during a VM boot? Dave > thanks > -- PMM -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > * Peter Maydell (peter.maydell@linaro.org) wrote: >> I think the correct fix here is that your test code should turn >> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >> for Arm KVM guests (for the same reason that VGA device video memory >> doesn't work). If it's RAM your guest has to arrange to map it as >> Normal Cacheable, and then everything should work fine. > > Does this cause problems with migrating at just the wrong point during > a VM boot? It wouldn't surprise me if it did, but I don't think I've ever tried to provoke that problem... thanks -- PMM
* Peter Maydell (peter.maydell@linaro.org) wrote: > On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > > * Peter Maydell (peter.maydell@linaro.org) wrote: > >> I think the correct fix here is that your test code should turn > >> its MMU on. Trying to treat guest RAM as uncacheable doesn't work > >> for Arm KVM guests (for the same reason that VGA device video memory > >> doesn't work). If it's RAM your guest has to arrange to map it as > >> Normal Cacheable, and then everything should work fine. > > > > Does this cause problems with migrating at just the wrong point during > > a VM boot? > > It wouldn't surprise me if it did, but I don't think I've ever > tried to provoke that problem... If you think it'll get the RAM contents wrong, it might be best to fail the migration if you can detect the cache is disabled in the guest. Dave > thanks > -- PMM -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > * Peter Maydell (peter.maydell@linaro.org) wrote: >> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >> > * Peter Maydell (peter.maydell@linaro.org) wrote: >> >> I think the correct fix here is that your test code should turn >> >> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >> >> for Arm KVM guests (for the same reason that VGA device video memory >> >> doesn't work). If it's RAM your guest has to arrange to map it as >> >> Normal Cacheable, and then everything should work fine. >> > >> > Does this cause problems with migrating at just the wrong point during >> > a VM boot? >> >> It wouldn't surprise me if it did, but I don't think I've ever >> tried to provoke that problem... > > If you think it'll get the RAM contents wrong, it might be best to fail > the migration if you can detect the cache is disabled in the guest. I guess QEMU could look at the value of the "MMU disabled/enabled" bit in the guest's system registers, and refuse migration if it's off... (cc'd Marc, Christoffer to check that I don't have the wrong end of the stick about how thin the ice is in the period before the guest turns on its MMU...) thanks -- PMM
* Peter Maydell (peter.maydell@linaro.org) wrote: > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > > * Peter Maydell (peter.maydell@linaro.org) wrote: > >> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > >> > * Peter Maydell (peter.maydell@linaro.org) wrote: > >> >> I think the correct fix here is that your test code should turn > >> >> its MMU on. Trying to treat guest RAM as uncacheable doesn't work > >> >> for Arm KVM guests (for the same reason that VGA device video memory > >> >> doesn't work). If it's RAM your guest has to arrange to map it as > >> >> Normal Cacheable, and then everything should work fine. > >> > > >> > Does this cause problems with migrating at just the wrong point during > >> > a VM boot? > >> > >> It wouldn't surprise me if it did, but I don't think I've ever > >> tried to provoke that problem... > > > > If you think it'll get the RAM contents wrong, it might be best to fail > > the migration if you can detect the cache is disabled in the guest. > > I guess QEMU could look at the value of the "MMU disabled/enabled" bit > in the guest's system registers, and refuse migration if it's off... > > (cc'd Marc, Christoffer to check that I don't have the wrong end > of the stick about how thin the ice is in the period before the > guest turns on its MMU...) OK; I mean it's not nice either way - but a failed migration is better than one with corrupt RAM. It's not pretty for cloud users; they do host-evacuations and the like fully automatically without any knowledge of the state of a VM and hope for migrations to work in any state. Dave > thanks > -- PMM -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On 29/01/18 10:04, Peter Maydell wrote: > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>> I think the correct fix here is that your test code should turn >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>>> Normal Cacheable, and then everything should work fine. >>>> >>>> Does this cause problems with migrating at just the wrong point during >>>> a VM boot? >>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>> tried to provoke that problem... >> >> If you think it'll get the RAM contents wrong, it might be best to fail >> the migration if you can detect the cache is disabled in the guest. > > I guess QEMU could look at the value of the "MMU disabled/enabled" bit > in the guest's system registers, and refuse migration if it's off... > > (cc'd Marc, Christoffer to check that I don't have the wrong end > of the stick about how thin the ice is in the period before the > guest turns on its MMU...) Once MMU and caches are on, we should be in a reasonable place for QEMU to have a consistent view of the memory. The trick is to prevent the vcpus from changing that. A guest could perfectly turn off its MMU at any given time if it needs to (and it is actually required on some HW if you want to mitigate headlining CVEs), and KVM won't know about that. You may have to pause the vcpus before starting the migration, or introduce a new KVM feature that would automatically pause a vcpu that is trying to disable its MMU while the migration is on. This would involve trapping all the virtual memory related system registers, with an obvious cost. But that cost would be limited to the time it takes to migrate the memory, so maybe that's acceptable. Thoughts? M.
On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: > On 29/01/18 10:04, Peter Maydell wrote: > > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > >> * Peter Maydell (peter.maydell@linaro.org) wrote: > >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: > >>>>> I think the correct fix here is that your test code should turn > >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work > >>>>> for Arm KVM guests (for the same reason that VGA device video memory > >>>>> doesn't work). If it's RAM your guest has to arrange to map it as > >>>>> Normal Cacheable, and then everything should work fine. > >>>> > >>>> Does this cause problems with migrating at just the wrong point during > >>>> a VM boot? > >>> > >>> It wouldn't surprise me if it did, but I don't think I've ever > >>> tried to provoke that problem... > >> > >> If you think it'll get the RAM contents wrong, it might be best to fail > >> the migration if you can detect the cache is disabled in the guest. > > > > I guess QEMU could look at the value of the "MMU disabled/enabled" bit > > in the guest's system registers, and refuse migration if it's off... > > > > (cc'd Marc, Christoffer to check that I don't have the wrong end > > of the stick about how thin the ice is in the period before the > > guest turns on its MMU...) > > Once MMU and caches are on, we should be in a reasonable place for QEMU > to have a consistent view of the memory. The trick is to prevent the > vcpus from changing that. A guest could perfectly turn off its MMU at > any given time if it needs to (and it is actually required on some HW if > you want to mitigate headlining CVEs), and KVM won't know about that. > (Clarification: KVM can detect this is it bother to check the VCPU's system registers, but we don't trap to KVM when the VCPU turns off its caches, right?) > You may have to pause the vcpus before starting the migration, or > introduce a new KVM feature that would automatically pause a vcpu that > is trying to disable its MMU while the migration is on. This would > involve trapping all the virtual memory related system registers, with > an obvious cost. But that cost would be limited to the time it takes to > migrate the memory, so maybe that's acceptable. > Is that even sufficient? What if the following happened. (1) guest turns off MMU, (2) guest writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads guest ram. QEMU's view of guest ram is now incorrect (stale, incoherent, ...). I'm also not really sure if pausing one VCPU because it turned off its MMU will go very well when trying to migrate a large VM (wouldn't this ask for all the other VCPUs beginning to complain that the stopped VCPU appears to be dead?). As a short-term 'fix' it's probably better to refuse migration if you detect that a VCPU had begun turning off its MMU. On the larger scale of thins; this appears to me to be another case of us really needing some way to coherently access memory between QEMU and the VM, but in the case of the VCPU turning off the MMU prior to migration, we don't even know where it may have written data, and I'm therefore not really sure what the 'proper' solution would be. (cc'ing Ard who has has thought about this problem before in the context of UEFI and VGA.) Thanks, -Christoffer
On 31 January 2018 at 09:53, Christoffer Dall <christoffer.dall@linaro.org> wrote: > On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >> On 29/01/18 10:04, Peter Maydell wrote: >> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >> >>>>> I think the correct fix here is that your test code should turn >> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >> >>>>> Normal Cacheable, and then everything should work fine. >> >>>> >> >>>> Does this cause problems with migrating at just the wrong point during >> >>>> a VM boot? >> >>> >> >>> It wouldn't surprise me if it did, but I don't think I've ever >> >>> tried to provoke that problem... >> >> >> >> If you think it'll get the RAM contents wrong, it might be best to fail >> >> the migration if you can detect the cache is disabled in the guest. >> > >> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >> > in the guest's system registers, and refuse migration if it's off... >> > >> > (cc'd Marc, Christoffer to check that I don't have the wrong end >> > of the stick about how thin the ice is in the period before the >> > guest turns on its MMU...) >> >> Once MMU and caches are on, we should be in a reasonable place for QEMU >> to have a consistent view of the memory. The trick is to prevent the >> vcpus from changing that. A guest could perfectly turn off its MMU at >> any given time if it needs to (and it is actually required on some HW if >> you want to mitigate headlining CVEs), and KVM won't know about that. >> > > (Clarification: KVM can detect this is it bother to check the VCPU's > system registers, but we don't trap to KVM when the VCPU turns off its > caches, right?) > >> You may have to pause the vcpus before starting the migration, or >> introduce a new KVM feature that would automatically pause a vcpu that >> is trying to disable its MMU while the migration is on. This would >> involve trapping all the virtual memory related system registers, with >> an obvious cost. But that cost would be limited to the time it takes to >> migrate the memory, so maybe that's acceptable. >> > Is that even sufficient? > > What if the following happened. (1) guest turns off MMU, (2) guest > writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads > guest ram. QEMU's view of guest ram is now incorrect (stale, > incoherent, ...). > > I'm also not really sure if pausing one VCPU because it turned off its > MMU will go very well when trying to migrate a large VM (wouldn't this > ask for all the other VCPUs beginning to complain that the stopped VCPU > appears to be dead?). As a short-term 'fix' it's probably better to > refuse migration if you detect that a VCPU had begun turning off its > MMU. > > On the larger scale of thins; this appears to me to be another case of > us really needing some way to coherently access memory between QEMU and > the VM, but in the case of the VCPU turning off the MMU prior to > migration, we don't even know where it may have written data, and I'm > therefore not really sure what the 'proper' solution would be. > > (cc'ing Ard who has has thought about this problem before in the context > of UEFI and VGA.) > Actually, the VGA case is much simpler because the host is not expected to write to the framebuffer, only read from it, and the guest is not expected to create a cacheable mapping for it, so any incoherency can be trivially solved by cache invalidation on the host side. (Note that this has nothing to do with DMA coherency, but only with PCI MMIO BARs that are backed by DRAM in the host) In the migration case, it is much more complicated, and I think capturing the state of the VM in a way that takes incoherency between caches and main memory into account is simply infeasible (i.e., the act of recording the state of guest RAM via a cached mapping may evict clean cachelines that are out of sync, and so it is impossible to record both the cached *and* the delta with the uncached state) I wonder how difficult it would be to a) enable trapping of the MMU system register when a guest CPU is found to have its MMU off at migration time b) allow the guest CPU to complete whatever it thinks it needs to be doing with the MMU off c) once it re-enables the MMU, proceed with capturing the memory state Full disclosure: I know very little about KVM migration ...
* Ard Biesheuvel (ard.biesheuvel@linaro.org) wrote: > On 31 January 2018 at 09:53, Christoffer Dall > <christoffer.dall@linaro.org> wrote: > > On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: > >> On 29/01/18 10:04, Peter Maydell wrote: > >> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > >> >> * Peter Maydell (peter.maydell@linaro.org) wrote: > >> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > >> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: > >> >>>>> I think the correct fix here is that your test code should turn > >> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work > >> >>>>> for Arm KVM guests (for the same reason that VGA device video memory > >> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as > >> >>>>> Normal Cacheable, and then everything should work fine. > >> >>>> > >> >>>> Does this cause problems with migrating at just the wrong point during > >> >>>> a VM boot? > >> >>> > >> >>> It wouldn't surprise me if it did, but I don't think I've ever > >> >>> tried to provoke that problem... > >> >> > >> >> If you think it'll get the RAM contents wrong, it might be best to fail > >> >> the migration if you can detect the cache is disabled in the guest. > >> > > >> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit > >> > in the guest's system registers, and refuse migration if it's off... > >> > > >> > (cc'd Marc, Christoffer to check that I don't have the wrong end > >> > of the stick about how thin the ice is in the period before the > >> > guest turns on its MMU...) > >> > >> Once MMU and caches are on, we should be in a reasonable place for QEMU > >> to have a consistent view of the memory. The trick is to prevent the > >> vcpus from changing that. A guest could perfectly turn off its MMU at > >> any given time if it needs to (and it is actually required on some HW if > >> you want to mitigate headlining CVEs), and KVM won't know about that. > >> > > > > (Clarification: KVM can detect this is it bother to check the VCPU's > > system registers, but we don't trap to KVM when the VCPU turns off its > > caches, right?) > > > >> You may have to pause the vcpus before starting the migration, or > >> introduce a new KVM feature that would automatically pause a vcpu that > >> is trying to disable its MMU while the migration is on. This would > >> involve trapping all the virtual memory related system registers, with > >> an obvious cost. But that cost would be limited to the time it takes to > >> migrate the memory, so maybe that's acceptable. > >> > > Is that even sufficient? > > > > What if the following happened. (1) guest turns off MMU, (2) guest > > writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads > > guest ram. QEMU's view of guest ram is now incorrect (stale, > > incoherent, ...). > > > > I'm also not really sure if pausing one VCPU because it turned off its > > MMU will go very well when trying to migrate a large VM (wouldn't this > > ask for all the other VCPUs beginning to complain that the stopped VCPU > > appears to be dead?). As a short-term 'fix' it's probably better to > > refuse migration if you detect that a VCPU had begun turning off its > > MMU. > > > > On the larger scale of thins; this appears to me to be another case of > > us really needing some way to coherently access memory between QEMU and > > the VM, but in the case of the VCPU turning off the MMU prior to > > migration, we don't even know where it may have written data, and I'm > > therefore not really sure what the 'proper' solution would be. > > > > (cc'ing Ard who has has thought about this problem before in the context > > of UEFI and VGA.) > > > > Actually, the VGA case is much simpler because the host is not > expected to write to the framebuffer, only read from it, and the guest > is not expected to create a cacheable mapping for it, so any > incoherency can be trivially solved by cache invalidation on the host > side. (Note that this has nothing to do with DMA coherency, but only > with PCI MMIO BARs that are backed by DRAM in the host) > > In the migration case, it is much more complicated, and I think > capturing the state of the VM in a way that takes incoherency between > caches and main memory into account is simply infeasible (i.e., the > act of recording the state of guest RAM via a cached mapping may evict > clean cachelines that are out of sync, and so it is impossible to > record both the cached *and* the delta with the uncached state) > > I wonder how difficult it would be to > a) enable trapping of the MMU system register when a guest CPU is > found to have its MMU off at migration time > b) allow the guest CPU to complete whatever it thinks it needs to be > doing with the MMU off > c) once it re-enables the MMU, proceed with capturing the memory state > > Full disclosure: I know very little about KVM migration ... The difficulty is that migration is 'live' - i.e. the guest is running while we're copying the data across; that means that a guest might do any of these MMU things multiple times - so if we wait for it to be right, will it go back to being wrong? How long do you wait? (It's not a bad hack if that's the best we can do though). Now of course 'live' itself sounds scary for consistency, but the only thing we really require is that a page is marked dirty some time after it's been written to so that we cause it to be sent again and that we eventually send a correct version; it's ok for us to be sending inconsistent versions as long as we eventually send the right version. Dave -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: > On 31 January 2018 at 09:53, Christoffer Dall > <christoffer.dall@linaro.org> wrote: >> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>> On 29/01/18 10:04, Peter Maydell wrote: >>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>> >>>>> I think the correct fix here is that your test code should turn >>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>> >>>>> Normal Cacheable, and then everything should work fine. >>> >>>> >>> >>>> Does this cause problems with migrating at just the wrong point during >>> >>>> a VM boot? >>> >>> >>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>> >>> tried to provoke that problem... >>> >> >>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>> >> the migration if you can detect the cache is disabled in the guest. >>> > >>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>> > in the guest's system registers, and refuse migration if it's off... >>> > >>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>> > of the stick about how thin the ice is in the period before the >>> > guest turns on its MMU...) >>> >>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>> to have a consistent view of the memory. The trick is to prevent the >>> vcpus from changing that. A guest could perfectly turn off its MMU at >>> any given time if it needs to (and it is actually required on some HW if >>> you want to mitigate headlining CVEs), and KVM won't know about that. >>> >> >> (Clarification: KVM can detect this is it bother to check the VCPU's >> system registers, but we don't trap to KVM when the VCPU turns off its >> caches, right?) >> >>> You may have to pause the vcpus before starting the migration, or >>> introduce a new KVM feature that would automatically pause a vcpu that >>> is trying to disable its MMU while the migration is on. This would >>> involve trapping all the virtual memory related system registers, with >>> an obvious cost. But that cost would be limited to the time it takes to >>> migrate the memory, so maybe that's acceptable. >>> >> Is that even sufficient? >> >> What if the following happened. (1) guest turns off MMU, (2) guest >> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >> guest ram. QEMU's view of guest ram is now incorrect (stale, >> incoherent, ...). >> >> I'm also not really sure if pausing one VCPU because it turned off its >> MMU will go very well when trying to migrate a large VM (wouldn't this >> ask for all the other VCPUs beginning to complain that the stopped VCPU >> appears to be dead?). As a short-term 'fix' it's probably better to >> refuse migration if you detect that a VCPU had begun turning off its >> MMU. >> >> On the larger scale of thins; this appears to me to be another case of >> us really needing some way to coherently access memory between QEMU and >> the VM, but in the case of the VCPU turning off the MMU prior to >> migration, we don't even know where it may have written data, and I'm >> therefore not really sure what the 'proper' solution would be. >> >> (cc'ing Ard who has has thought about this problem before in the context >> of UEFI and VGA.) >> > > Actually, the VGA case is much simpler because the host is not > expected to write to the framebuffer, only read from it, and the guest > is not expected to create a cacheable mapping for it, so any > incoherency can be trivially solved by cache invalidation on the host > side. (Note that this has nothing to do with DMA coherency, but only > with PCI MMIO BARs that are backed by DRAM in the host) In case of the running guest, the host will also only read from the cached mapping. Of course, at restore, the host will also write through a cached mapping, but shouldn't the latter case be solvable by having KVM clean the cache lines when faulting in any page? > > In the migration case, it is much more complicated, and I think > capturing the state of the VM in a way that takes incoherency between > caches and main memory into account is simply infeasible (i.e., the > act of recording the state of guest RAM via a cached mapping may evict > clean cachelines that are out of sync, and so it is impossible to > record both the cached *and* the delta with the uncached state) This may be an incredibly stupid question (and I may have asked it before), but why can't we clean+invalidate the guest page before reading it and thereby obtain a coherent view of a page? > > I wonder how difficult it would be to > a) enable trapping of the MMU system register when a guest CPU is > found to have its MMU off at migration time > b) allow the guest CPU to complete whatever it thinks it needs to be > doing with the MMU off > c) once it re-enables the MMU, proceed with capturing the memory state > I don't think this is impossible, but it would prevent migration of things like bare-metal guests or weirdo unikernel guests running with the MMU off, wouldn't it? Thanks, -Christoffer
On 31 January 2018 at 16:53, Christoffer Dall <christoffer.dall@linaro.org> wrote: > On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel > <ard.biesheuvel@linaro.org> wrote: >> On 31 January 2018 at 09:53, Christoffer Dall >> <christoffer.dall@linaro.org> wrote: >>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>>> On 29/01/18 10:04, Peter Maydell wrote: >>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>> >>>>> I think the correct fix here is that your test code should turn >>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>> >>>>> Normal Cacheable, and then everything should work fine. >>>> >>>> >>>> >>>> Does this cause problems with migrating at just the wrong point during >>>> >>>> a VM boot? >>>> >>> >>>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>>> >>> tried to provoke that problem... >>>> >> >>>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>>> >> the migration if you can detect the cache is disabled in the guest. >>>> > >>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>>> > in the guest's system registers, and refuse migration if it's off... >>>> > >>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>>> > of the stick about how thin the ice is in the period before the >>>> > guest turns on its MMU...) >>>> >>>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>>> to have a consistent view of the memory. The trick is to prevent the >>>> vcpus from changing that. A guest could perfectly turn off its MMU at >>>> any given time if it needs to (and it is actually required on some HW if >>>> you want to mitigate headlining CVEs), and KVM won't know about that. >>>> >>> >>> (Clarification: KVM can detect this is it bother to check the VCPU's >>> system registers, but we don't trap to KVM when the VCPU turns off its >>> caches, right?) >>> >>>> You may have to pause the vcpus before starting the migration, or >>>> introduce a new KVM feature that would automatically pause a vcpu that >>>> is trying to disable its MMU while the migration is on. This would >>>> involve trapping all the virtual memory related system registers, with >>>> an obvious cost. But that cost would be limited to the time it takes to >>>> migrate the memory, so maybe that's acceptable. >>>> >>> Is that even sufficient? >>> >>> What if the following happened. (1) guest turns off MMU, (2) guest >>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >>> guest ram. QEMU's view of guest ram is now incorrect (stale, >>> incoherent, ...). >>> >>> I'm also not really sure if pausing one VCPU because it turned off its >>> MMU will go very well when trying to migrate a large VM (wouldn't this >>> ask for all the other VCPUs beginning to complain that the stopped VCPU >>> appears to be dead?). As a short-term 'fix' it's probably better to >>> refuse migration if you detect that a VCPU had begun turning off its >>> MMU. >>> >>> On the larger scale of thins; this appears to me to be another case of >>> us really needing some way to coherently access memory between QEMU and >>> the VM, but in the case of the VCPU turning off the MMU prior to >>> migration, we don't even know where it may have written data, and I'm >>> therefore not really sure what the 'proper' solution would be. >>> >>> (cc'ing Ard who has has thought about this problem before in the context >>> of UEFI and VGA.) >>> >> >> Actually, the VGA case is much simpler because the host is not >> expected to write to the framebuffer, only read from it, and the guest >> is not expected to create a cacheable mapping for it, so any >> incoherency can be trivially solved by cache invalidation on the host >> side. (Note that this has nothing to do with DMA coherency, but only >> with PCI MMIO BARs that are backed by DRAM in the host) > > In case of the running guest, the host will also only read from the > cached mapping. Of course, at restore, the host will also write > through a cached mapping, but shouldn't the latter case be solvable by > having KVM clean the cache lines when faulting in any page? > We are still talking about the contents of the framebuffer, right? In that case, yes, afaict >> >> In the migration case, it is much more complicated, and I think >> capturing the state of the VM in a way that takes incoherency between >> caches and main memory into account is simply infeasible (i.e., the >> act of recording the state of guest RAM via a cached mapping may evict >> clean cachelines that are out of sync, and so it is impossible to >> record both the cached *and* the delta with the uncached state) > > This may be an incredibly stupid question (and I may have asked it > before), but why can't we clean+invalidate the guest page before > reading it and thereby obtain a coherent view of a page? > Because cleaning from the host will clobber whatever the guest wrote directly to memory with the MMU off, if there is a dirty cacheline shadowing that memory. However, that same cacheline could be dirty because the guest itself wrote to memory with the MMU on. So in general, it is hard to tell which version of the data is the correct one without having intimate knowledge of how the guest is using which parts of memory with the MMU off (the same applies to non-coherent DMA btw), and so whether clean or invalidate is the correct operation to perform from the host is not trivial to decide. >> >> I wonder how difficult it would be to >> a) enable trapping of the MMU system register when a guest CPU is >> found to have its MMU off at migration time >> b) allow the guest CPU to complete whatever it thinks it needs to be >> doing with the MMU off >> c) once it re-enables the MMU, proceed with capturing the memory state >> > I don't think this is impossible, but it would prevent migration of > things like bare-metal guests or weirdo unikernel guests running with > the MMU off, wouldn't it? > Yep.
On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: > On 31 January 2018 at 16:53, Christoffer Dall > <christoffer.dall@linaro.org> wrote: >> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel >> <ard.biesheuvel@linaro.org> wrote: >>> On 31 January 2018 at 09:53, Christoffer Dall >>> <christoffer.dall@linaro.org> wrote: >>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>>>> On 29/01/18 10:04, Peter Maydell wrote: >>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>> >>>>> I think the correct fix here is that your test code should turn >>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>>> >>>>> Normal Cacheable, and then everything should work fine. >>>>> >>>> >>>>> >>>> Does this cause problems with migrating at just the wrong point during >>>>> >>>> a VM boot? >>>>> >>> >>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>>>> >>> tried to provoke that problem... >>>>> >> >>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>>>> >> the migration if you can detect the cache is disabled in the guest. >>>>> > >>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>>>> > in the guest's system registers, and refuse migration if it's off... >>>>> > >>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>>>> > of the stick about how thin the ice is in the period before the >>>>> > guest turns on its MMU...) >>>>> >>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>>>> to have a consistent view of the memory. The trick is to prevent the >>>>> vcpus from changing that. A guest could perfectly turn off its MMU at >>>>> any given time if it needs to (and it is actually required on some HW if >>>>> you want to mitigate headlining CVEs), and KVM won't know about that. >>>>> >>>> >>>> (Clarification: KVM can detect this is it bother to check the VCPU's >>>> system registers, but we don't trap to KVM when the VCPU turns off its >>>> caches, right?) >>>> >>>>> You may have to pause the vcpus before starting the migration, or >>>>> introduce a new KVM feature that would automatically pause a vcpu that >>>>> is trying to disable its MMU while the migration is on. This would >>>>> involve trapping all the virtual memory related system registers, with >>>>> an obvious cost. But that cost would be limited to the time it takes to >>>>> migrate the memory, so maybe that's acceptable. >>>>> >>>> Is that even sufficient? >>>> >>>> What if the following happened. (1) guest turns off MMU, (2) guest >>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >>>> guest ram. QEMU's view of guest ram is now incorrect (stale, >>>> incoherent, ...). >>>> >>>> I'm also not really sure if pausing one VCPU because it turned off its >>>> MMU will go very well when trying to migrate a large VM (wouldn't this >>>> ask for all the other VCPUs beginning to complain that the stopped VCPU >>>> appears to be dead?). As a short-term 'fix' it's probably better to >>>> refuse migration if you detect that a VCPU had begun turning off its >>>> MMU. >>>> >>>> On the larger scale of thins; this appears to me to be another case of >>>> us really needing some way to coherently access memory between QEMU and >>>> the VM, but in the case of the VCPU turning off the MMU prior to >>>> migration, we don't even know where it may have written data, and I'm >>>> therefore not really sure what the 'proper' solution would be. >>>> >>>> (cc'ing Ard who has has thought about this problem before in the context >>>> of UEFI and VGA.) >>>> >>> >>> Actually, the VGA case is much simpler because the host is not >>> expected to write to the framebuffer, only read from it, and the guest >>> is not expected to create a cacheable mapping for it, so any >>> incoherency can be trivially solved by cache invalidation on the host >>> side. (Note that this has nothing to do with DMA coherency, but only >>> with PCI MMIO BARs that are backed by DRAM in the host) >> >> In case of the running guest, the host will also only read from the >> cached mapping. Of course, at restore, the host will also write >> through a cached mapping, but shouldn't the latter case be solvable by >> having KVM clean the cache lines when faulting in any page? >> > > We are still talking about the contents of the framebuffer, right? In > that case, yes, afaict > I was talking about normal RAM actually... not sure if that changes anything? >>> >>> In the migration case, it is much more complicated, and I think >>> capturing the state of the VM in a way that takes incoherency between >>> caches and main memory into account is simply infeasible (i.e., the >>> act of recording the state of guest RAM via a cached mapping may evict >>> clean cachelines that are out of sync, and so it is impossible to >>> record both the cached *and* the delta with the uncached state) >> >> This may be an incredibly stupid question (and I may have asked it >> before), but why can't we clean+invalidate the guest page before >> reading it and thereby obtain a coherent view of a page? >> > > Because cleaning from the host will clobber whatever the guest wrote > directly to memory with the MMU off, if there is a dirty cacheline > shadowing that memory. If the host never wrote anything to that memory (it shouldn't mess with the guest's memory) there will only be clean cache lines (even if they contain content shadowing the memory) and cleaning them would be equivalent to an invalidate. Am I misremembering how this works? > However, that same cacheline could be dirty > because the guest itself wrote to memory with the MMU on. Yes, but the guest would have no control over when such a cache line gets flushed to main memory by the hardware, and can have no reasonable expectation that the cache lines don't get cleaned behind its back. The fact that a migration triggers this, is reasonable. A guest that wants hand-off from main memory that its accessing with the MMU off, must invalidate the appropriate cache lines or ensure they're clean. There's very likely some subtle aspect to all of this that I'm forgetting. Thanks, -Christoffer
On 31 January 2018 at 17:39, Christoffer Dall <christoffer.dall@linaro.org> wrote: > On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel > <ard.biesheuvel@linaro.org> wrote: >> On 31 January 2018 at 16:53, Christoffer Dall >> <christoffer.dall@linaro.org> wrote: >>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel >>> <ard.biesheuvel@linaro.org> wrote: >>>> On 31 January 2018 at 09:53, Christoffer Dall >>>> <christoffer.dall@linaro.org> wrote: >>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>>>>> On 29/01/18 10:04, Peter Maydell wrote: >>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>> >>>>> I think the correct fix here is that your test code should turn >>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>>>> >>>>> Normal Cacheable, and then everything should work fine. >>>>>> >>>> >>>>>> >>>> Does this cause problems with migrating at just the wrong point during >>>>>> >>>> a VM boot? >>>>>> >>> >>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>>>>> >>> tried to provoke that problem... >>>>>> >> >>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>>>>> >> the migration if you can detect the cache is disabled in the guest. >>>>>> > >>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>>>>> > in the guest's system registers, and refuse migration if it's off... >>>>>> > >>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>>>>> > of the stick about how thin the ice is in the period before the >>>>>> > guest turns on its MMU...) >>>>>> >>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>>>>> to have a consistent view of the memory. The trick is to prevent the >>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at >>>>>> any given time if it needs to (and it is actually required on some HW if >>>>>> you want to mitigate headlining CVEs), and KVM won't know about that. >>>>>> >>>>> >>>>> (Clarification: KVM can detect this is it bother to check the VCPU's >>>>> system registers, but we don't trap to KVM when the VCPU turns off its >>>>> caches, right?) >>>>> >>>>>> You may have to pause the vcpus before starting the migration, or >>>>>> introduce a new KVM feature that would automatically pause a vcpu that >>>>>> is trying to disable its MMU while the migration is on. This would >>>>>> involve trapping all the virtual memory related system registers, with >>>>>> an obvious cost. But that cost would be limited to the time it takes to >>>>>> migrate the memory, so maybe that's acceptable. >>>>>> >>>>> Is that even sufficient? >>>>> >>>>> What if the following happened. (1) guest turns off MMU, (2) guest >>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >>>>> guest ram. QEMU's view of guest ram is now incorrect (stale, >>>>> incoherent, ...). >>>>> >>>>> I'm also not really sure if pausing one VCPU because it turned off its >>>>> MMU will go very well when trying to migrate a large VM (wouldn't this >>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU >>>>> appears to be dead?). As a short-term 'fix' it's probably better to >>>>> refuse migration if you detect that a VCPU had begun turning off its >>>>> MMU. >>>>> >>>>> On the larger scale of thins; this appears to me to be another case of >>>>> us really needing some way to coherently access memory between QEMU and >>>>> the VM, but in the case of the VCPU turning off the MMU prior to >>>>> migration, we don't even know where it may have written data, and I'm >>>>> therefore not really sure what the 'proper' solution would be. >>>>> >>>>> (cc'ing Ard who has has thought about this problem before in the context >>>>> of UEFI and VGA.) >>>>> >>>> >>>> Actually, the VGA case is much simpler because the host is not >>>> expected to write to the framebuffer, only read from it, and the guest >>>> is not expected to create a cacheable mapping for it, so any >>>> incoherency can be trivially solved by cache invalidation on the host >>>> side. (Note that this has nothing to do with DMA coherency, but only >>>> with PCI MMIO BARs that are backed by DRAM in the host) >>> >>> In case of the running guest, the host will also only read from the >>> cached mapping. Of course, at restore, the host will also write >>> through a cached mapping, but shouldn't the latter case be solvable by >>> having KVM clean the cache lines when faulting in any page? >>> >> >> We are still talking about the contents of the framebuffer, right? In >> that case, yes, afaict >> > > I was talking about normal RAM actually... not sure if that changes anything? > The main difference is that with a framebuffer BAR, it is pointless for the guest to map it cacheable, given that the purpose of a framebuffer is its side effects, which are not guaranteed to occur timely if the mapping is cacheable. If we are talking about normal RAM, then why are we discussing it here and not down there? vvv >>>> >>>> In the migration case, it is much more complicated, and I think >>>> capturing the state of the VM in a way that takes incoherency between >>>> caches and main memory into account is simply infeasible (i.e., the >>>> act of recording the state of guest RAM via a cached mapping may evict >>>> clean cachelines that are out of sync, and so it is impossible to >>>> record both the cached *and* the delta with the uncached state) >>> >>> This may be an incredibly stupid question (and I may have asked it >>> before), but why can't we clean+invalidate the guest page before >>> reading it and thereby obtain a coherent view of a page? >>> >> >> Because cleaning from the host will clobber whatever the guest wrote >> directly to memory with the MMU off, if there is a dirty cacheline >> shadowing that memory. > > If the host never wrote anything to that memory (it shouldn't mess > with the guest's memory) there will only be clean cache lines (even if > they contain content shadowing the memory) and cleaning them would be > equivalent to an invalidate. Am I misremembering how this works? > Cleaning doesn't actually invalidate, but it should be a no-op for clean cachelines. >> However, that same cacheline could be dirty >> because the guest itself wrote to memory with the MMU on. > > Yes, but the guest would have no control over when such a cache line > gets flushed to main memory by the hardware, and can have no > reasonable expectation that the cache lines don't get cleaned behind > its back. The fact that a migration triggers this, is reasonable. A > guest that wants hand-off from main memory that its accessing with the > MMU off, must invalidate the appropriate cache lines or ensure they're > clean. There's very likely some subtle aspect to all of this that I'm > forgetting. > OK, so if the only way cachelines covering guest memory could be dirty is after the guest wrote to that memory itself via a cacheable mapping, I guess it would be reasonable to do clean+invalidate before reading the memory. Then, the only way for the guest to lose anything is in cases where it could not reasonably expect it to be retained anyway. However, that does leave a window, between the invalidate and the read, where the guest could modify memory without it being visible to the host.
On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: > On 31 January 2018 at 17:39, Christoffer Dall > <christoffer.dall@linaro.org> wrote: >> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel >> <ard.biesheuvel@linaro.org> wrote: >>> On 31 January 2018 at 16:53, Christoffer Dall >>> <christoffer.dall@linaro.org> wrote: >>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel >>>> <ard.biesheuvel@linaro.org> wrote: >>>>> On 31 January 2018 at 09:53, Christoffer Dall >>>>> <christoffer.dall@linaro.org> wrote: >>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>>>>>> On 29/01/18 10:04, Peter Maydell wrote: >>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>> >>>>> I think the correct fix here is that your test code should turn >>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>>>>> >>>>> Normal Cacheable, and then everything should work fine. >>>>>>> >>>> >>>>>>> >>>> Does this cause problems with migrating at just the wrong point during >>>>>>> >>>> a VM boot? >>>>>>> >>> >>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>>>>>> >>> tried to provoke that problem... >>>>>>> >> >>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>>>>>> >> the migration if you can detect the cache is disabled in the guest. >>>>>>> > >>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>>>>>> > in the guest's system registers, and refuse migration if it's off... >>>>>>> > >>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>>>>>> > of the stick about how thin the ice is in the period before the >>>>>>> > guest turns on its MMU...) >>>>>>> >>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>>>>>> to have a consistent view of the memory. The trick is to prevent the >>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at >>>>>>> any given time if it needs to (and it is actually required on some HW if >>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that. >>>>>>> >>>>>> >>>>>> (Clarification: KVM can detect this is it bother to check the VCPU's >>>>>> system registers, but we don't trap to KVM when the VCPU turns off its >>>>>> caches, right?) >>>>>> >>>>>>> You may have to pause the vcpus before starting the migration, or >>>>>>> introduce a new KVM feature that would automatically pause a vcpu that >>>>>>> is trying to disable its MMU while the migration is on. This would >>>>>>> involve trapping all the virtual memory related system registers, with >>>>>>> an obvious cost. But that cost would be limited to the time it takes to >>>>>>> migrate the memory, so maybe that's acceptable. >>>>>>> >>>>>> Is that even sufficient? >>>>>> >>>>>> What if the following happened. (1) guest turns off MMU, (2) guest >>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >>>>>> guest ram. QEMU's view of guest ram is now incorrect (stale, >>>>>> incoherent, ...). >>>>>> >>>>>> I'm also not really sure if pausing one VCPU because it turned off its >>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this >>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU >>>>>> appears to be dead?). As a short-term 'fix' it's probably better to >>>>>> refuse migration if you detect that a VCPU had begun turning off its >>>>>> MMU. >>>>>> >>>>>> On the larger scale of thins; this appears to me to be another case of >>>>>> us really needing some way to coherently access memory between QEMU and >>>>>> the VM, but in the case of the VCPU turning off the MMU prior to >>>>>> migration, we don't even know where it may have written data, and I'm >>>>>> therefore not really sure what the 'proper' solution would be. >>>>>> >>>>>> (cc'ing Ard who has has thought about this problem before in the context >>>>>> of UEFI and VGA.) >>>>>> >>>>> >>>>> Actually, the VGA case is much simpler because the host is not >>>>> expected to write to the framebuffer, only read from it, and the guest >>>>> is not expected to create a cacheable mapping for it, so any >>>>> incoherency can be trivially solved by cache invalidation on the host >>>>> side. (Note that this has nothing to do with DMA coherency, but only >>>>> with PCI MMIO BARs that are backed by DRAM in the host) >>>> >>>> In case of the running guest, the host will also only read from the >>>> cached mapping. Of course, at restore, the host will also write >>>> through a cached mapping, but shouldn't the latter case be solvable by >>>> having KVM clean the cache lines when faulting in any page? >>>> >>> >>> We are still talking about the contents of the framebuffer, right? In >>> that case, yes, afaict >>> >> >> I was talking about normal RAM actually... not sure if that changes anything? >> > > The main difference is that with a framebuffer BAR, it is pointless > for the guest to map it cacheable, given that the purpose of a > framebuffer is its side effects, which are not guaranteed to occur > timely if the mapping is cacheable. > > If we are talking about normal RAM, then why are we discussing it here > and not down there? > Because I was trying to figure out how the challenge of accessing the VGA framebuffer differs from the challenge of accessing guest RAM which may have been written by the guest with the MMU off. First approximation, they are extremely similar because the guest is writing uncached to memory, which the host now has to access via a cached mapping. But I'm guessing that a "clean+invalidate before read on the host" solution will result in terrible performance for a framebuffer and therefore isn't a good solution for that problem... > > >>>>> >>>>> In the migration case, it is much more complicated, and I think >>>>> capturing the state of the VM in a way that takes incoherency between >>>>> caches and main memory into account is simply infeasible (i.e., the >>>>> act of recording the state of guest RAM via a cached mapping may evict >>>>> clean cachelines that are out of sync, and so it is impossible to >>>>> record both the cached *and* the delta with the uncached state) >>>> >>>> This may be an incredibly stupid question (and I may have asked it >>>> before), but why can't we clean+invalidate the guest page before >>>> reading it and thereby obtain a coherent view of a page? >>>> >>> >>> Because cleaning from the host will clobber whatever the guest wrote >>> directly to memory with the MMU off, if there is a dirty cacheline >>> shadowing that memory. >> >> If the host never wrote anything to that memory (it shouldn't mess >> with the guest's memory) there will only be clean cache lines (even if >> they contain content shadowing the memory) and cleaning them would be >> equivalent to an invalidate. Am I misremembering how this works? >> > > Cleaning doesn't actually invalidate, but it should be a no-op for > clean cachelines. > >>> However, that same cacheline could be dirty >>> because the guest itself wrote to memory with the MMU on. >> >> Yes, but the guest would have no control over when such a cache line >> gets flushed to main memory by the hardware, and can have no >> reasonable expectation that the cache lines don't get cleaned behind >> its back. The fact that a migration triggers this, is reasonable. A >> guest that wants hand-off from main memory that its accessing with the >> MMU off, must invalidate the appropriate cache lines or ensure they're >> clean. There's very likely some subtle aspect to all of this that I'm >> forgetting. >> > > OK, so if the only way cachelines covering guest memory could be dirty > is after the guest wrote to that memory itself via a cacheable > mapping, I guess it would be reasonable to do clean+invalidate before > reading the memory. Then, the only way for the guest to lose anything > is in cases where it could not reasonably expect it to be retained > anyway. Right, that's what I'm thinking. > > However, that does leave a window, between the invalidate and the > read, where the guest could modify memory without it being visible to > the host. Is that a problem specific to the coherency challenge? I thought this problem was already addressed by dirty page tracking, but there's like some interaction with the cache maintenance that we'd have to figure out. Thanks, -Christoffer
On 31 January 2018 at 19:12, Christoffer Dall <christoffer.dall@linaro.org> wrote: > On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel > <ard.biesheuvel@linaro.org> wrote: >> On 31 January 2018 at 17:39, Christoffer Dall >> <christoffer.dall@linaro.org> wrote: >>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel >>> <ard.biesheuvel@linaro.org> wrote: >>>> On 31 January 2018 at 16:53, Christoffer Dall >>>> <christoffer.dall@linaro.org> wrote: >>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel >>>>> <ard.biesheuvel@linaro.org> wrote: >>>>>> On 31 January 2018 at 09:53, Christoffer Dall >>>>>> <christoffer.dall@linaro.org> wrote: >>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>>>>>>> On 29/01/18 10:04, Peter Maydell wrote: >>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>>> >>>>> I think the correct fix here is that your test code should turn >>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>>>>>> >>>>> Normal Cacheable, and then everything should work fine. >>>>>>>> >>>> >>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during >>>>>>>> >>>> a VM boot? >>>>>>>> >>> >>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>>>>>>> >>> tried to provoke that problem... >>>>>>>> >> >>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>>>>>>> >> the migration if you can detect the cache is disabled in the guest. >>>>>>>> > >>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>>>>>>> > in the guest's system registers, and refuse migration if it's off... >>>>>>>> > >>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>>>>>>> > of the stick about how thin the ice is in the period before the >>>>>>>> > guest turns on its MMU...) >>>>>>>> >>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>>>>>>> to have a consistent view of the memory. The trick is to prevent the >>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at >>>>>>>> any given time if it needs to (and it is actually required on some HW if >>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that. >>>>>>>> >>>>>>> >>>>>>> (Clarification: KVM can detect this is it bother to check the VCPU's >>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its >>>>>>> caches, right?) >>>>>>> >>>>>>>> You may have to pause the vcpus before starting the migration, or >>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that >>>>>>>> is trying to disable its MMU while the migration is on. This would >>>>>>>> involve trapping all the virtual memory related system registers, with >>>>>>>> an obvious cost. But that cost would be limited to the time it takes to >>>>>>>> migrate the memory, so maybe that's acceptable. >>>>>>>> >>>>>>> Is that even sufficient? >>>>>>> >>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest >>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >>>>>>> guest ram. QEMU's view of guest ram is now incorrect (stale, >>>>>>> incoherent, ...). >>>>>>> >>>>>>> I'm also not really sure if pausing one VCPU because it turned off its >>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this >>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU >>>>>>> appears to be dead?). As a short-term 'fix' it's probably better to >>>>>>> refuse migration if you detect that a VCPU had begun turning off its >>>>>>> MMU. >>>>>>> >>>>>>> On the larger scale of thins; this appears to me to be another case of >>>>>>> us really needing some way to coherently access memory between QEMU and >>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to >>>>>>> migration, we don't even know where it may have written data, and I'm >>>>>>> therefore not really sure what the 'proper' solution would be. >>>>>>> >>>>>>> (cc'ing Ard who has has thought about this problem before in the context >>>>>>> of UEFI and VGA.) >>>>>>> >>>>>> >>>>>> Actually, the VGA case is much simpler because the host is not >>>>>> expected to write to the framebuffer, only read from it, and the guest >>>>>> is not expected to create a cacheable mapping for it, so any >>>>>> incoherency can be trivially solved by cache invalidation on the host >>>>>> side. (Note that this has nothing to do with DMA coherency, but only >>>>>> with PCI MMIO BARs that are backed by DRAM in the host) >>>>> >>>>> In case of the running guest, the host will also only read from the >>>>> cached mapping. Of course, at restore, the host will also write >>>>> through a cached mapping, but shouldn't the latter case be solvable by >>>>> having KVM clean the cache lines when faulting in any page? >>>>> >>>> >>>> We are still talking about the contents of the framebuffer, right? In >>>> that case, yes, afaict >>>> >>> >>> I was talking about normal RAM actually... not sure if that changes anything? >>> >> >> The main difference is that with a framebuffer BAR, it is pointless >> for the guest to map it cacheable, given that the purpose of a >> framebuffer is its side effects, which are not guaranteed to occur >> timely if the mapping is cacheable. >> >> If we are talking about normal RAM, then why are we discussing it here >> and not down there? >> > > Because I was trying to figure out how the challenge of accessing the > VGA framebuffer differs from the challenge of accessing guest RAM > which may have been written by the guest with the MMU off. > > First approximation, they are extremely similar because the guest is > writing uncached to memory, which the host now has to access via a > cached mapping. > > But I'm guessing that a "clean+invalidate before read on the host" > solution will result in terrible performance for a framebuffer and > therefore isn't a good solution for that problem... > That highly depends on where 'not working' resides on the performance scale. Currently, VGA on KVM simply does not work at all, and so working but slow would be a huge improvement over the current situation. Also, the performance hit is caused by the fact that the data needs to make a round trip to memory, and the invalidation (without cleaning) performed by the host shouldn't make that much worse than it fundamentally is to begin with. A paravirtualized framebuffer (as was proposed recently by Gerd I think?) would solve this, since the guest can just map it as cacheable. >> >> >>>>>> >>>>>> In the migration case, it is much more complicated, and I think >>>>>> capturing the state of the VM in a way that takes incoherency between >>>>>> caches and main memory into account is simply infeasible (i.e., the >>>>>> act of recording the state of guest RAM via a cached mapping may evict >>>>>> clean cachelines that are out of sync, and so it is impossible to >>>>>> record both the cached *and* the delta with the uncached state) >>>>> >>>>> This may be an incredibly stupid question (and I may have asked it >>>>> before), but why can't we clean+invalidate the guest page before >>>>> reading it and thereby obtain a coherent view of a page? >>>>> >>>> >>>> Because cleaning from the host will clobber whatever the guest wrote >>>> directly to memory with the MMU off, if there is a dirty cacheline >>>> shadowing that memory. >>> >>> If the host never wrote anything to that memory (it shouldn't mess >>> with the guest's memory) there will only be clean cache lines (even if >>> they contain content shadowing the memory) and cleaning them would be >>> equivalent to an invalidate. Am I misremembering how this works? >>> >> >> Cleaning doesn't actually invalidate, but it should be a no-op for >> clean cachelines. >> >>>> However, that same cacheline could be dirty >>>> because the guest itself wrote to memory with the MMU on. >>> >>> Yes, but the guest would have no control over when such a cache line >>> gets flushed to main memory by the hardware, and can have no >>> reasonable expectation that the cache lines don't get cleaned behind >>> its back. The fact that a migration triggers this, is reasonable. A >>> guest that wants hand-off from main memory that its accessing with the >>> MMU off, must invalidate the appropriate cache lines or ensure they're >>> clean. There's very likely some subtle aspect to all of this that I'm >>> forgetting. >>> >> >> OK, so if the only way cachelines covering guest memory could be dirty >> is after the guest wrote to that memory itself via a cacheable >> mapping, I guess it would be reasonable to do clean+invalidate before >> reading the memory. Then, the only way for the guest to lose anything >> is in cases where it could not reasonably expect it to be retained >> anyway. > > Right, that's what I'm thinking. > >> >> However, that does leave a window, between the invalidate and the >> read, where the guest could modify memory without it being visible to >> the host. > > Is that a problem specific to the coherency challenge? I thought this > problem was already addressed by dirty page tracking, but there's like > some interaction with the cache maintenance that we'd have to figure > out. > I don't know how dirty page tracking works exactly, but if it that can track direct writes to memory as easily as cached writes, it would probably cover this as well.
On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: > On 31 January 2018 at 19:12, Christoffer Dall > <christoffer.dall@linaro.org> wrote: >> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel >> <ard.biesheuvel@linaro.org> wrote: >>> On 31 January 2018 at 17:39, Christoffer Dall >>> <christoffer.dall@linaro.org> wrote: >>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel >>>> <ard.biesheuvel@linaro.org> wrote: >>>>> On 31 January 2018 at 16:53, Christoffer Dall >>>>> <christoffer.dall@linaro.org> wrote: >>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel >>>>>> <ard.biesheuvel@linaro.org> wrote: >>>>>>> On 31 January 2018 at 09:53, Christoffer Dall >>>>>>> <christoffer.dall@linaro.org> wrote: >>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote: >>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>>>> >>>>> I think the correct fix here is that your test code should turn >>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine. >>>>>>>>> >>>> >>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during >>>>>>>>> >>>> a VM boot? >>>>>>>>> >>> >>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>>>>>>>> >>> tried to provoke that problem... >>>>>>>>> >> >>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>>>>>>>> >> the migration if you can detect the cache is disabled in the guest. >>>>>>>>> > >>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>>>>>>>> > in the guest's system registers, and refuse migration if it's off... >>>>>>>>> > >>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>>>>>>>> > of the stick about how thin the ice is in the period before the >>>>>>>>> > guest turns on its MMU...) >>>>>>>>> >>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>>>>>>>> to have a consistent view of the memory. The trick is to prevent the >>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at >>>>>>>>> any given time if it needs to (and it is actually required on some HW if >>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that. >>>>>>>>> >>>>>>>> >>>>>>>> (Clarification: KVM can detect this is it bother to check the VCPU's >>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its >>>>>>>> caches, right?) >>>>>>>> >>>>>>>>> You may have to pause the vcpus before starting the migration, or >>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that >>>>>>>>> is trying to disable its MMU while the migration is on. This would >>>>>>>>> involve trapping all the virtual memory related system registers, with >>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to >>>>>>>>> migrate the memory, so maybe that's acceptable. >>>>>>>>> >>>>>>>> Is that even sufficient? >>>>>>>> >>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest >>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >>>>>>>> guest ram. QEMU's view of guest ram is now incorrect (stale, >>>>>>>> incoherent, ...). >>>>>>>> >>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its >>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this >>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU >>>>>>>> appears to be dead?). As a short-term 'fix' it's probably better to >>>>>>>> refuse migration if you detect that a VCPU had begun turning off its >>>>>>>> MMU. >>>>>>>> >>>>>>>> On the larger scale of thins; this appears to me to be another case of >>>>>>>> us really needing some way to coherently access memory between QEMU and >>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to >>>>>>>> migration, we don't even know where it may have written data, and I'm >>>>>>>> therefore not really sure what the 'proper' solution would be. >>>>>>>> >>>>>>>> (cc'ing Ard who has has thought about this problem before in the context >>>>>>>> of UEFI and VGA.) >>>>>>>> >>>>>>> >>>>>>> Actually, the VGA case is much simpler because the host is not >>>>>>> expected to write to the framebuffer, only read from it, and the guest >>>>>>> is not expected to create a cacheable mapping for it, so any >>>>>>> incoherency can be trivially solved by cache invalidation on the host >>>>>>> side. (Note that this has nothing to do with DMA coherency, but only >>>>>>> with PCI MMIO BARs that are backed by DRAM in the host) >>>>>> >>>>>> In case of the running guest, the host will also only read from the >>>>>> cached mapping. Of course, at restore, the host will also write >>>>>> through a cached mapping, but shouldn't the latter case be solvable by >>>>>> having KVM clean the cache lines when faulting in any page? >>>>>> >>>>> >>>>> We are still talking about the contents of the framebuffer, right? In >>>>> that case, yes, afaict >>>>> >>>> >>>> I was talking about normal RAM actually... not sure if that changes anything? >>>> >>> >>> The main difference is that with a framebuffer BAR, it is pointless >>> for the guest to map it cacheable, given that the purpose of a >>> framebuffer is its side effects, which are not guaranteed to occur >>> timely if the mapping is cacheable. >>> >>> If we are talking about normal RAM, then why are we discussing it here >>> and not down there? >>> >> >> Because I was trying to figure out how the challenge of accessing the >> VGA framebuffer differs from the challenge of accessing guest RAM >> which may have been written by the guest with the MMU off. >> >> First approximation, they are extremely similar because the guest is >> writing uncached to memory, which the host now has to access via a >> cached mapping. >> >> But I'm guessing that a "clean+invalidate before read on the host" >> solution will result in terrible performance for a framebuffer and >> therefore isn't a good solution for that problem... >> > > That highly depends on where 'not working' resides on the performance > scale. Currently, VGA on KVM simply does not work at all, and so > working but slow would be a huge improvement over the current > situation. > > Also, the performance hit is caused by the fact that the data needs to > make a round trip to memory, and the invalidation (without cleaning) > performed by the host shouldn't make that much worse than it > fundamentally is to begin with. > True, but Marc pointed out (on IRC) that the cache maintenance instructions are broadcast across all CPUs and may therefore adversely affect the performance of your entire system by running an emulated VGA framebuffer on a subset of the host CPUs. However, he also pointed out that the necessary cache maintenance can be done in EL0 and we wouldn't even need a new ioctl or anything else from KVM or the kernel to do this. I think we should measure the effect of this before dismissing it though. > A paravirtualized framebuffer (as was proposed recently by Gerd I > think?) would solve this, since the guest can just map it as > cacheable. > Yes, but it seems to me that the problem of incoherent views of memory between the guest and QEMU keeps coming up, and we therefore need to have a fix for this in software. >>> >>> >>>>>>> >>>>>>> In the migration case, it is much more complicated, and I think >>>>>>> capturing the state of the VM in a way that takes incoherency between >>>>>>> caches and main memory into account is simply infeasible (i.e., the >>>>>>> act of recording the state of guest RAM via a cached mapping may evict >>>>>>> clean cachelines that are out of sync, and so it is impossible to >>>>>>> record both the cached *and* the delta with the uncached state) >>>>>> >>>>>> This may be an incredibly stupid question (and I may have asked it >>>>>> before), but why can't we clean+invalidate the guest page before >>>>>> reading it and thereby obtain a coherent view of a page? >>>>>> >>>>> >>>>> Because cleaning from the host will clobber whatever the guest wrote >>>>> directly to memory with the MMU off, if there is a dirty cacheline >>>>> shadowing that memory. >>>> >>>> If the host never wrote anything to that memory (it shouldn't mess >>>> with the guest's memory) there will only be clean cache lines (even if >>>> they contain content shadowing the memory) and cleaning them would be >>>> equivalent to an invalidate. Am I misremembering how this works? >>>> >>> >>> Cleaning doesn't actually invalidate, but it should be a no-op for >>> clean cachelines. >>> >>>>> However, that same cacheline could be dirty >>>>> because the guest itself wrote to memory with the MMU on. >>>> >>>> Yes, but the guest would have no control over when such a cache line >>>> gets flushed to main memory by the hardware, and can have no >>>> reasonable expectation that the cache lines don't get cleaned behind >>>> its back. The fact that a migration triggers this, is reasonable. A >>>> guest that wants hand-off from main memory that its accessing with the >>>> MMU off, must invalidate the appropriate cache lines or ensure they're >>>> clean. There's very likely some subtle aspect to all of this that I'm >>>> forgetting. >>>> >>> >>> OK, so if the only way cachelines covering guest memory could be dirty >>> is after the guest wrote to that memory itself via a cacheable >>> mapping, I guess it would be reasonable to do clean+invalidate before >>> reading the memory. Then, the only way for the guest to lose anything >>> is in cases where it could not reasonably expect it to be retained >>> anyway. >> >> Right, that's what I'm thinking. >> >>> >>> However, that does leave a window, between the invalidate and the >>> read, where the guest could modify memory without it being visible to >>> the host. >> >> Is that a problem specific to the coherency challenge? I thought this >> problem was already addressed by dirty page tracking, but there's like >> some interaction with the cache maintenance that we'd have to figure >> out. >> > > I don't know how dirty page tracking works exactly, but if it that can > track direct writes to memory as easily as cached writes, it would > probably cover this as well. I don't see why it shouldn't. Guest pages get mapped as read-only in stage 2, and their memory attributes shouldn't be able to affect permission checks (one can only hope). Unless I'm missing something fundamental, I think we should add functionality in QEMU to clean+invalidate pages on aarch64 hosts and see if that solves both VGA and migration, and if so, what the potential performance impacts are. Thanks, -Christoffer
On 1 February 2018 at 09:17, Christoffer Dall <christoffer.dall@linaro.org> wrote: > On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel > <ard.biesheuvel@linaro.org> wrote: >> On 31 January 2018 at 19:12, Christoffer Dall >> <christoffer.dall@linaro.org> wrote: >>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel >>> <ard.biesheuvel@linaro.org> wrote: >>>> On 31 January 2018 at 17:39, Christoffer Dall >>>> <christoffer.dall@linaro.org> wrote: >>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel >>>>> <ard.biesheuvel@linaro.org> wrote: >>>>>> On 31 January 2018 at 16:53, Christoffer Dall >>>>>> <christoffer.dall@linaro.org> wrote: >>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel >>>>>>> <ard.biesheuvel@linaro.org> wrote: >>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall >>>>>>>> <christoffer.dall@linaro.org> wrote: >>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote: >>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn >>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine. >>>>>>>>>> >>>> >>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during >>>>>>>>>> >>>> a VM boot? >>>>>>>>>> >>> >>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>>>>>>>>> >>> tried to provoke that problem... >>>>>>>>>> >> >>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest. >>>>>>>>>> > >>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>>>>>>>>> > in the guest's system registers, and refuse migration if it's off... >>>>>>>>>> > >>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>>>>>>>>> > of the stick about how thin the ice is in the period before the >>>>>>>>>> > guest turns on its MMU...) >>>>>>>>>> >>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the >>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at >>>>>>>>>> any given time if it needs to (and it is actually required on some HW if >>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that. >>>>>>>>>> >>>>>>>>> >>>>>>>>> (Clarification: KVM can detect this is it bother to check the VCPU's >>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its >>>>>>>>> caches, right?) >>>>>>>>> >>>>>>>>>> You may have to pause the vcpus before starting the migration, or >>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that >>>>>>>>>> is trying to disable its MMU while the migration is on. This would >>>>>>>>>> involve trapping all the virtual memory related system registers, with >>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to >>>>>>>>>> migrate the memory, so maybe that's acceptable. >>>>>>>>>> >>>>>>>>> Is that even sufficient? >>>>>>>>> >>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest >>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >>>>>>>>> guest ram. QEMU's view of guest ram is now incorrect (stale, >>>>>>>>> incoherent, ...). >>>>>>>>> >>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its >>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this >>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU >>>>>>>>> appears to be dead?). As a short-term 'fix' it's probably better to >>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its >>>>>>>>> MMU. >>>>>>>>> >>>>>>>>> On the larger scale of thins; this appears to me to be another case of >>>>>>>>> us really needing some way to coherently access memory between QEMU and >>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to >>>>>>>>> migration, we don't even know where it may have written data, and I'm >>>>>>>>> therefore not really sure what the 'proper' solution would be. >>>>>>>>> >>>>>>>>> (cc'ing Ard who has has thought about this problem before in the context >>>>>>>>> of UEFI and VGA.) >>>>>>>>> >>>>>>>> >>>>>>>> Actually, the VGA case is much simpler because the host is not >>>>>>>> expected to write to the framebuffer, only read from it, and the guest >>>>>>>> is not expected to create a cacheable mapping for it, so any >>>>>>>> incoherency can be trivially solved by cache invalidation on the host >>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only >>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host) >>>>>>> >>>>>>> In case of the running guest, the host will also only read from the >>>>>>> cached mapping. Of course, at restore, the host will also write >>>>>>> through a cached mapping, but shouldn't the latter case be solvable by >>>>>>> having KVM clean the cache lines when faulting in any page? >>>>>>> >>>>>> >>>>>> We are still talking about the contents of the framebuffer, right? In >>>>>> that case, yes, afaict >>>>>> >>>>> >>>>> I was talking about normal RAM actually... not sure if that changes anything? >>>>> >>>> >>>> The main difference is that with a framebuffer BAR, it is pointless >>>> for the guest to map it cacheable, given that the purpose of a >>>> framebuffer is its side effects, which are not guaranteed to occur >>>> timely if the mapping is cacheable. >>>> >>>> If we are talking about normal RAM, then why are we discussing it here >>>> and not down there? >>>> >>> >>> Because I was trying to figure out how the challenge of accessing the >>> VGA framebuffer differs from the challenge of accessing guest RAM >>> which may have been written by the guest with the MMU off. >>> >>> First approximation, they are extremely similar because the guest is >>> writing uncached to memory, which the host now has to access via a >>> cached mapping. >>> >>> But I'm guessing that a "clean+invalidate before read on the host" >>> solution will result in terrible performance for a framebuffer and >>> therefore isn't a good solution for that problem... >>> >> >> That highly depends on where 'not working' resides on the performance >> scale. Currently, VGA on KVM simply does not work at all, and so >> working but slow would be a huge improvement over the current >> situation. >> >> Also, the performance hit is caused by the fact that the data needs to >> make a round trip to memory, and the invalidation (without cleaning) >> performed by the host shouldn't make that much worse than it >> fundamentally is to begin with. >> > > True, but Marc pointed out (on IRC) that the cache maintenance > instructions are broadcast across all CPUs and may therefore adversely > affect the performance of your entire system by running an emulated > VGA framebuffer on a subset of the host CPUs. However, he also > pointed out that the necessary cache maintenance can be done in EL0 > and we wouldn't even need a new ioctl or anything else from KVM or the > kernel to do this. I think we should measure the effect of this > before dismissing it though. > If you could use dirty page tracking to only perform the cache invalidation when the framebuffer memory has been updated, you can at least limit the impact to cases where the framebuffer is actually used, rather than sitting idle with a nice wallpaper image. >> A paravirtualized framebuffer (as was proposed recently by Gerd I >> think?) would solve this, since the guest can just map it as >> cacheable. >> > > Yes, but it seems to me that the problem of incoherent views of memory > between the guest and QEMU keeps coming up, and we therefore need to > have a fix for this in software. > Agreed. >>>> >>>> >>>>>>>> >>>>>>>> In the migration case, it is much more complicated, and I think >>>>>>>> capturing the state of the VM in a way that takes incoherency between >>>>>>>> caches and main memory into account is simply infeasible (i.e., the >>>>>>>> act of recording the state of guest RAM via a cached mapping may evict >>>>>>>> clean cachelines that are out of sync, and so it is impossible to >>>>>>>> record both the cached *and* the delta with the uncached state) >>>>>>> >>>>>>> This may be an incredibly stupid question (and I may have asked it >>>>>>> before), but why can't we clean+invalidate the guest page before >>>>>>> reading it and thereby obtain a coherent view of a page? >>>>>>> >>>>>> >>>>>> Because cleaning from the host will clobber whatever the guest wrote >>>>>> directly to memory with the MMU off, if there is a dirty cacheline >>>>>> shadowing that memory. >>>>> >>>>> If the host never wrote anything to that memory (it shouldn't mess >>>>> with the guest's memory) there will only be clean cache lines (even if >>>>> they contain content shadowing the memory) and cleaning them would be >>>>> equivalent to an invalidate. Am I misremembering how this works? >>>>> >>>> >>>> Cleaning doesn't actually invalidate, but it should be a no-op for >>>> clean cachelines. >>>> >>>>>> However, that same cacheline could be dirty >>>>>> because the guest itself wrote to memory with the MMU on. >>>>> >>>>> Yes, but the guest would have no control over when such a cache line >>>>> gets flushed to main memory by the hardware, and can have no >>>>> reasonable expectation that the cache lines don't get cleaned behind >>>>> its back. The fact that a migration triggers this, is reasonable. A >>>>> guest that wants hand-off from main memory that its accessing with the >>>>> MMU off, must invalidate the appropriate cache lines or ensure they're >>>>> clean. There's very likely some subtle aspect to all of this that I'm >>>>> forgetting. >>>>> >>>> >>>> OK, so if the only way cachelines covering guest memory could be dirty >>>> is after the guest wrote to that memory itself via a cacheable >>>> mapping, I guess it would be reasonable to do clean+invalidate before >>>> reading the memory. Then, the only way for the guest to lose anything >>>> is in cases where it could not reasonably expect it to be retained >>>> anyway. >>> >>> Right, that's what I'm thinking. >>> >>>> >>>> However, that does leave a window, between the invalidate and the >>>> read, where the guest could modify memory without it being visible to >>>> the host. >>> >>> Is that a problem specific to the coherency challenge? I thought this >>> problem was already addressed by dirty page tracking, but there's like >>> some interaction with the cache maintenance that we'd have to figure >>> out. >>> >> >> I don't know how dirty page tracking works exactly, but if it that can >> track direct writes to memory as easily as cached writes, it would >> probably cover this as well. > > I don't see why it shouldn't. Guest pages get mapped as read-only in > stage 2, and their memory attributes shouldn't be able to affect > permission checks (one can only hope). > > Unless I'm missing something fundamental, I think we should add > functionality in QEMU to clean+invalidate pages on aarch64 hosts and > see if that solves both VGA and migration, and if so, what the > potential performance impacts are. > Indeed. Identifying the framebuffer region is easy for QEMU, so there we can invalidate (without clean) selectively. However, although we'd probably need to capture the state of the MMU bit anyway when handling the stage 2 fault for the dirty page tracking, I don't think we can generally infer whether a faulting access was made via an uncached mapping while the MMU was on, and so this would require clean+invalidate on all dirty pages when doing a migration.
On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: > On 1 February 2018 at 09:17, Christoffer Dall > <christoffer.dall@linaro.org> wrote: >> On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel >> <ard.biesheuvel@linaro.org> wrote: >>> On 31 January 2018 at 19:12, Christoffer Dall >>> <christoffer.dall@linaro.org> wrote: >>>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel >>>> <ard.biesheuvel@linaro.org> wrote: >>>>> On 31 January 2018 at 17:39, Christoffer Dall >>>>> <christoffer.dall@linaro.org> wrote: >>>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel >>>>>> <ard.biesheuvel@linaro.org> wrote: >>>>>>> On 31 January 2018 at 16:53, Christoffer Dall >>>>>>> <christoffer.dall@linaro.org> wrote: >>>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel >>>>>>>> <ard.biesheuvel@linaro.org> wrote: >>>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall >>>>>>>>> <christoffer.dall@linaro.org> wrote: >>>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: >>>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote: >>>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: >>>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: >>>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn >>>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work >>>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory >>>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as >>>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine. >>>>>>>>>>> >>>> >>>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during >>>>>>>>>>> >>>> a VM boot? >>>>>>>>>>> >>> >>>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever >>>>>>>>>>> >>> tried to provoke that problem... >>>>>>>>>>> >> >>>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail >>>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest. >>>>>>>>>>> > >>>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit >>>>>>>>>>> > in the guest's system registers, and refuse migration if it's off... >>>>>>>>>>> > >>>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end >>>>>>>>>>> > of the stick about how thin the ice is in the period before the >>>>>>>>>>> > guest turns on its MMU...) >>>>>>>>>>> >>>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU >>>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the >>>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at >>>>>>>>>>> any given time if it needs to (and it is actually required on some HW if >>>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that. >>>>>>>>>>> >>>>>>>>>> >>>>>>>>>> (Clarification: KVM can detect this is it bother to check the VCPU's >>>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its >>>>>>>>>> caches, right?) >>>>>>>>>> >>>>>>>>>>> You may have to pause the vcpus before starting the migration, or >>>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that >>>>>>>>>>> is trying to disable its MMU while the migration is on. This would >>>>>>>>>>> involve trapping all the virtual memory related system registers, with >>>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to >>>>>>>>>>> migrate the memory, so maybe that's acceptable. >>>>>>>>>>> >>>>>>>>>> Is that even sufficient? >>>>>>>>>> >>>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest >>>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads >>>>>>>>>> guest ram. QEMU's view of guest ram is now incorrect (stale, >>>>>>>>>> incoherent, ...). >>>>>>>>>> >>>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its >>>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this >>>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU >>>>>>>>>> appears to be dead?). As a short-term 'fix' it's probably better to >>>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its >>>>>>>>>> MMU. >>>>>>>>>> >>>>>>>>>> On the larger scale of thins; this appears to me to be another case of >>>>>>>>>> us really needing some way to coherently access memory between QEMU and >>>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to >>>>>>>>>> migration, we don't even know where it may have written data, and I'm >>>>>>>>>> therefore not really sure what the 'proper' solution would be. >>>>>>>>>> >>>>>>>>>> (cc'ing Ard who has has thought about this problem before in the context >>>>>>>>>> of UEFI and VGA.) >>>>>>>>>> >>>>>>>>> >>>>>>>>> Actually, the VGA case is much simpler because the host is not >>>>>>>>> expected to write to the framebuffer, only read from it, and the guest >>>>>>>>> is not expected to create a cacheable mapping for it, so any >>>>>>>>> incoherency can be trivially solved by cache invalidation on the host >>>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only >>>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host) >>>>>>>> >>>>>>>> In case of the running guest, the host will also only read from the >>>>>>>> cached mapping. Of course, at restore, the host will also write >>>>>>>> through a cached mapping, but shouldn't the latter case be solvable by >>>>>>>> having KVM clean the cache lines when faulting in any page? >>>>>>>> >>>>>>> >>>>>>> We are still talking about the contents of the framebuffer, right? In >>>>>>> that case, yes, afaict >>>>>>> >>>>>> >>>>>> I was talking about normal RAM actually... not sure if that changes anything? >>>>>> >>>>> >>>>> The main difference is that with a framebuffer BAR, it is pointless >>>>> for the guest to map it cacheable, given that the purpose of a >>>>> framebuffer is its side effects, which are not guaranteed to occur >>>>> timely if the mapping is cacheable. >>>>> >>>>> If we are talking about normal RAM, then why are we discussing it here >>>>> and not down there? >>>>> >>>> >>>> Because I was trying to figure out how the challenge of accessing the >>>> VGA framebuffer differs from the challenge of accessing guest RAM >>>> which may have been written by the guest with the MMU off. >>>> >>>> First approximation, they are extremely similar because the guest is >>>> writing uncached to memory, which the host now has to access via a >>>> cached mapping. >>>> >>>> But I'm guessing that a "clean+invalidate before read on the host" >>>> solution will result in terrible performance for a framebuffer and >>>> therefore isn't a good solution for that problem... >>>> >>> >>> That highly depends on where 'not working' resides on the performance >>> scale. Currently, VGA on KVM simply does not work at all, and so >>> working but slow would be a huge improvement over the current >>> situation. >>> >>> Also, the performance hit is caused by the fact that the data needs to >>> make a round trip to memory, and the invalidation (without cleaning) >>> performed by the host shouldn't make that much worse than it >>> fundamentally is to begin with. >>> >> >> True, but Marc pointed out (on IRC) that the cache maintenance >> instructions are broadcast across all CPUs and may therefore adversely >> affect the performance of your entire system by running an emulated >> VGA framebuffer on a subset of the host CPUs. However, he also >> pointed out that the necessary cache maintenance can be done in EL0 >> and we wouldn't even need a new ioctl or anything else from KVM or the >> kernel to do this. I think we should measure the effect of this >> before dismissing it though. >> > > If you could use dirty page tracking to only perform the cache > invalidation when the framebuffer memory has been updated, you can at > least limit the impact to cases where the framebuffer is actually > used, rather than sitting idle with a nice wallpaper image. > > >>> A paravirtualized framebuffer (as was proposed recently by Gerd I >>> think?) would solve this, since the guest can just map it as >>> cacheable. >>> >> >> Yes, but it seems to me that the problem of incoherent views of memory >> between the guest and QEMU keeps coming up, and we therefore need to >> have a fix for this in software. >> > > Agreed. > >>>>> >>>>> >>>>>>>>> >>>>>>>>> In the migration case, it is much more complicated, and I think >>>>>>>>> capturing the state of the VM in a way that takes incoherency between >>>>>>>>> caches and main memory into account is simply infeasible (i.e., the >>>>>>>>> act of recording the state of guest RAM via a cached mapping may evict >>>>>>>>> clean cachelines that are out of sync, and so it is impossible to >>>>>>>>> record both the cached *and* the delta with the uncached state) >>>>>>>> >>>>>>>> This may be an incredibly stupid question (and I may have asked it >>>>>>>> before), but why can't we clean+invalidate the guest page before >>>>>>>> reading it and thereby obtain a coherent view of a page? >>>>>>>> >>>>>>> >>>>>>> Because cleaning from the host will clobber whatever the guest wrote >>>>>>> directly to memory with the MMU off, if there is a dirty cacheline >>>>>>> shadowing that memory. >>>>>> >>>>>> If the host never wrote anything to that memory (it shouldn't mess >>>>>> with the guest's memory) there will only be clean cache lines (even if >>>>>> they contain content shadowing the memory) and cleaning them would be >>>>>> equivalent to an invalidate. Am I misremembering how this works? >>>>>> >>>>> >>>>> Cleaning doesn't actually invalidate, but it should be a no-op for >>>>> clean cachelines. >>>>> >>>>>>> However, that same cacheline could be dirty >>>>>>> because the guest itself wrote to memory with the MMU on. >>>>>> >>>>>> Yes, but the guest would have no control over when such a cache line >>>>>> gets flushed to main memory by the hardware, and can have no >>>>>> reasonable expectation that the cache lines don't get cleaned behind >>>>>> its back. The fact that a migration triggers this, is reasonable. A >>>>>> guest that wants hand-off from main memory that its accessing with the >>>>>> MMU off, must invalidate the appropriate cache lines or ensure they're >>>>>> clean. There's very likely some subtle aspect to all of this that I'm >>>>>> forgetting. >>>>>> >>>>> >>>>> OK, so if the only way cachelines covering guest memory could be dirty >>>>> is after the guest wrote to that memory itself via a cacheable >>>>> mapping, I guess it would be reasonable to do clean+invalidate before >>>>> reading the memory. Then, the only way for the guest to lose anything >>>>> is in cases where it could not reasonably expect it to be retained >>>>> anyway. >>>> >>>> Right, that's what I'm thinking. >>>> >>>>> >>>>> However, that does leave a window, between the invalidate and the >>>>> read, where the guest could modify memory without it being visible to >>>>> the host. >>>> >>>> Is that a problem specific to the coherency challenge? I thought this >>>> problem was already addressed by dirty page tracking, but there's like >>>> some interaction with the cache maintenance that we'd have to figure >>>> out. >>>> >>> >>> I don't know how dirty page tracking works exactly, but if it that can >>> track direct writes to memory as easily as cached writes, it would >>> probably cover this as well. >> >> I don't see why it shouldn't. Guest pages get mapped as read-only in >> stage 2, and their memory attributes shouldn't be able to affect >> permission checks (one can only hope). >> >> Unless I'm missing something fundamental, I think we should add >> functionality in QEMU to clean+invalidate pages on aarch64 hosts and >> see if that solves both VGA and migration, and if so, what the >> potential performance impacts are. >> > > Indeed. > > Identifying the framebuffer region is easy for QEMU, so there we can > invalidate (without clean) selectively. Do you think there's a material win of doing invalidate rather than clean+invalidate or could we just aim for "access coherent" == "clean+invalidate" and we don't have to make any assumptions about how the guest is accessing a particular piece of memory? > However, although we'd > probably need to capture the state of the MMU bit anyway when handling > the stage 2 fault for the dirty page tracking, I don't think we can > generally infer whether a faulting access was made via an uncached > mapping while the MMU was on, and so this would require > clean+invalidate on all dirty pages when doing a migration. My gut feeling here is that there might be some very clever scheme to optimize cache maintenance of the 'hot pages' when dirty page tracking is enabled, but since the set of hot pages by virtue of migration algorithms has to be small, the win is not likely to impact anything real-life, and it's therefore not worth looking into. Thanks, -Christoffer
On 1 February 2018 at 09:59, Christoffer Dall <christoffer.dall@linaro.org> wrote: > On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel > <ard.biesheuvel@linaro.org> wrote: >> On 1 February 2018 at 09:17, Christoffer Dall >> <christoffer.dall@linaro.org> wrote: ... >>> >>> Unless I'm missing something fundamental, I think we should add >>> functionality in QEMU to clean+invalidate pages on aarch64 hosts and >>> see if that solves both VGA and migration, and if so, what the >>> potential performance impacts are. >>> >> >> Indeed. >> >> Identifying the framebuffer region is easy for QEMU, so there we can >> invalidate (without clean) selectively. > > Do you think there's a material win of doing invalidate rather than > clean+invalidate or could we just aim for "access coherent" == > "clean+invalidate" and we don't have to make any assumptions about how > the guest is accessing a particular piece of memory? > In general, invalidate should be cheaper than clean, given that just discarding the data involves less work than writing it back to memory. Whether that actually holds in current SMP designs, and whether it still applies when the memory can be assumed to be clean in the first place are things I can't really answer. >> However, although we'd >> probably need to capture the state of the MMU bit anyway when handling >> the stage 2 fault for the dirty page tracking, I don't think we can >> generally infer whether a faulting access was made via an uncached >> mapping while the MMU was on, and so this would require >> clean+invalidate on all dirty pages when doing a migration. > > My gut feeling here is that there might be some very clever scheme to > optimize cache maintenance of the 'hot pages' when dirty page tracking > is enabled, but since the set of hot pages by virtue of migration > algorithms has to be small, the win is not likely to impact anything > real-life, and it's therefore not worth looking into. > Sounds reasonable.
On Thu, Feb 01, 2018 at 10:59:54AM +0100, Christoffer Dall wrote: > On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel > <ard.biesheuvel@linaro.org> wrote: > > On 1 February 2018 at 09:17, Christoffer Dall > > <christoffer.dall@linaro.org> wrote: > >> On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel > >> <ard.biesheuvel@linaro.org> wrote: > >>> On 31 January 2018 at 19:12, Christoffer Dall > >>> <christoffer.dall@linaro.org> wrote: > >>>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel > >>>> <ard.biesheuvel@linaro.org> wrote: > >>>>> On 31 January 2018 at 17:39, Christoffer Dall > >>>>> <christoffer.dall@linaro.org> wrote: > >>>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel > >>>>>> <ard.biesheuvel@linaro.org> wrote: > >>>>>>> On 31 January 2018 at 16:53, Christoffer Dall > >>>>>>> <christoffer.dall@linaro.org> wrote: > >>>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel > >>>>>>>> <ard.biesheuvel@linaro.org> wrote: > >>>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall > >>>>>>>>> <christoffer.dall@linaro.org> wrote: > >>>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: > >>>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote: > >>>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > >>>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: > >>>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > >>>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: > >>>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn > >>>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work > >>>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory > >>>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as > >>>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine. > >>>>>>>>>>> >>>> > >>>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during > >>>>>>>>>>> >>>> a VM boot? > >>>>>>>>>>> >>> > >>>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever > >>>>>>>>>>> >>> tried to provoke that problem... > >>>>>>>>>>> >> > >>>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail > >>>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest. > >>>>>>>>>>> > > >>>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit > >>>>>>>>>>> > in the guest's system registers, and refuse migration if it's off... > >>>>>>>>>>> > > >>>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end > >>>>>>>>>>> > of the stick about how thin the ice is in the period before the > >>>>>>>>>>> > guest turns on its MMU...) > >>>>>>>>>>> > >>>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU > >>>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the > >>>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at > >>>>>>>>>>> any given time if it needs to (and it is actually required on some HW if > >>>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that. > >>>>>>>>>>> > >>>>>>>>>> > >>>>>>>>>> (Clarification: KVM can detect this is it bother to check the VCPU's > >>>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its > >>>>>>>>>> caches, right?) > >>>>>>>>>> > >>>>>>>>>>> You may have to pause the vcpus before starting the migration, or > >>>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that > >>>>>>>>>>> is trying to disable its MMU while the migration is on. This would > >>>>>>>>>>> involve trapping all the virtual memory related system registers, with > >>>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to > >>>>>>>>>>> migrate the memory, so maybe that's acceptable. > >>>>>>>>>>> > >>>>>>>>>> Is that even sufficient? > >>>>>>>>>> > >>>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest > >>>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads > >>>>>>>>>> guest ram. QEMU's view of guest ram is now incorrect (stale, > >>>>>>>>>> incoherent, ...). > >>>>>>>>>> > >>>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its > >>>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this > >>>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU > >>>>>>>>>> appears to be dead?). As a short-term 'fix' it's probably better to > >>>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its > >>>>>>>>>> MMU. > >>>>>>>>>> > >>>>>>>>>> On the larger scale of thins; this appears to me to be another case of > >>>>>>>>>> us really needing some way to coherently access memory between QEMU and > >>>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to > >>>>>>>>>> migration, we don't even know where it may have written data, and I'm > >>>>>>>>>> therefore not really sure what the 'proper' solution would be. > >>>>>>>>>> > >>>>>>>>>> (cc'ing Ard who has has thought about this problem before in the context > >>>>>>>>>> of UEFI and VGA.) > >>>>>>>>>> > >>>>>>>>> > >>>>>>>>> Actually, the VGA case is much simpler because the host is not > >>>>>>>>> expected to write to the framebuffer, only read from it, and the guest > >>>>>>>>> is not expected to create a cacheable mapping for it, so any > >>>>>>>>> incoherency can be trivially solved by cache invalidation on the host > >>>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only > >>>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host) > >>>>>>>> > >>>>>>>> In case of the running guest, the host will also only read from the > >>>>>>>> cached mapping. Of course, at restore, the host will also write > >>>>>>>> through a cached mapping, but shouldn't the latter case be solvable by > >>>>>>>> having KVM clean the cache lines when faulting in any page? > >>>>>>>> > >>>>>>> > >>>>>>> We are still talking about the contents of the framebuffer, right? In > >>>>>>> that case, yes, afaict > >>>>>>> > >>>>>> > >>>>>> I was talking about normal RAM actually... not sure if that changes anything? > >>>>>> > >>>>> > >>>>> The main difference is that with a framebuffer BAR, it is pointless > >>>>> for the guest to map it cacheable, given that the purpose of a > >>>>> framebuffer is its side effects, which are not guaranteed to occur > >>>>> timely if the mapping is cacheable. > >>>>> > >>>>> If we are talking about normal RAM, then why are we discussing it here > >>>>> and not down there? > >>>>> > >>>> > >>>> Because I was trying to figure out how the challenge of accessing the > >>>> VGA framebuffer differs from the challenge of accessing guest RAM > >>>> which may have been written by the guest with the MMU off. > >>>> > >>>> First approximation, they are extremely similar because the guest is > >>>> writing uncached to memory, which the host now has to access via a > >>>> cached mapping. > >>>> > >>>> But I'm guessing that a "clean+invalidate before read on the host" > >>>> solution will result in terrible performance for a framebuffer and > >>>> therefore isn't a good solution for that problem... > >>>> > >>> > >>> That highly depends on where 'not working' resides on the performance > >>> scale. Currently, VGA on KVM simply does not work at all, and so > >>> working but slow would be a huge improvement over the current > >>> situation. > >>> > >>> Also, the performance hit is caused by the fact that the data needs to > >>> make a round trip to memory, and the invalidation (without cleaning) > >>> performed by the host shouldn't make that much worse than it > >>> fundamentally is to begin with. > >>> > >> > >> True, but Marc pointed out (on IRC) that the cache maintenance > >> instructions are broadcast across all CPUs and may therefore adversely > >> affect the performance of your entire system by running an emulated > >> VGA framebuffer on a subset of the host CPUs. However, he also > >> pointed out that the necessary cache maintenance can be done in EL0 I had some memory of this not being the case, but I just checked and indeed 'dc civac' will work from el0 with sctlr_el1.uci=1. Note, I see that the gcc builtin __builtin___clear_cache() does not use that instruction (it uses 'dc cvau'), so we'd need to implement a wrapper ourselves (maybe that's why I thought it wasn't available...) > >> and we wouldn't even need a new ioctl or anything else from KVM or the > >> kernel to do this. I think we should measure the effect of this > >> before dismissing it though. > >> > > > > If you could use dirty page tracking to only perform the cache > > invalidation when the framebuffer memory has been updated, you can at > > least limit the impact to cases where the framebuffer is actually > > used, rather than sitting idle with a nice wallpaper image. Yes, this is the exact approach I took back when I experimented with this. I must have screwed up my PoC in some way (like using the gcc builtin), because it wasn't working for me... > > > > > >>> A paravirtualized framebuffer (as was proposed recently by Gerd I > >>> think?) would solve this, since the guest can just map it as > >>> cacheable. > >>> > >> > >> Yes, but it seems to me that the problem of incoherent views of memory > >> between the guest and QEMU keeps coming up, and we therefore need to > >> have a fix for this in software. > >> > > > > Agreed. > > > >>>>> > >>>>> > >>>>>>>>> > >>>>>>>>> In the migration case, it is much more complicated, and I think > >>>>>>>>> capturing the state of the VM in a way that takes incoherency between > >>>>>>>>> caches and main memory into account is simply infeasible (i.e., the > >>>>>>>>> act of recording the state of guest RAM via a cached mapping may evict > >>>>>>>>> clean cachelines that are out of sync, and so it is impossible to > >>>>>>>>> record both the cached *and* the delta with the uncached state) > >>>>>>>> > >>>>>>>> This may be an incredibly stupid question (and I may have asked it > >>>>>>>> before), but why can't we clean+invalidate the guest page before > >>>>>>>> reading it and thereby obtain a coherent view of a page? > >>>>>>>> > >>>>>>> > >>>>>>> Because cleaning from the host will clobber whatever the guest wrote > >>>>>>> directly to memory with the MMU off, if there is a dirty cacheline > >>>>>>> shadowing that memory. > >>>>>> > >>>>>> If the host never wrote anything to that memory (it shouldn't mess > >>>>>> with the guest's memory) there will only be clean cache lines (even if > >>>>>> they contain content shadowing the memory) and cleaning them would be > >>>>>> equivalent to an invalidate. Am I misremembering how this works? > >>>>>> > >>>>> > >>>>> Cleaning doesn't actually invalidate, but it should be a no-op for > >>>>> clean cachelines. > >>>>> > >>>>>>> However, that same cacheline could be dirty > >>>>>>> because the guest itself wrote to memory with the MMU on. > >>>>>> > >>>>>> Yes, but the guest would have no control over when such a cache line > >>>>>> gets flushed to main memory by the hardware, and can have no > >>>>>> reasonable expectation that the cache lines don't get cleaned behind > >>>>>> its back. The fact that a migration triggers this, is reasonable. A > >>>>>> guest that wants hand-off from main memory that its accessing with the > >>>>>> MMU off, must invalidate the appropriate cache lines or ensure they're > >>>>>> clean. There's very likely some subtle aspect to all of this that I'm > >>>>>> forgetting. > >>>>>> > >>>>> > >>>>> OK, so if the only way cachelines covering guest memory could be dirty > >>>>> is after the guest wrote to that memory itself via a cacheable > >>>>> mapping, I guess it would be reasonable to do clean+invalidate before > >>>>> reading the memory. Then, the only way for the guest to lose anything > >>>>> is in cases where it could not reasonably expect it to be retained > >>>>> anyway. > >>>> > >>>> Right, that's what I'm thinking. > >>>> > >>>>> > >>>>> However, that does leave a window, between the invalidate and the > >>>>> read, where the guest could modify memory without it being visible to > >>>>> the host. > >>>> > >>>> Is that a problem specific to the coherency challenge? I thought this > >>>> problem was already addressed by dirty page tracking, but there's like > >>>> some interaction with the cache maintenance that we'd have to figure > >>>> out. > >>>> > >>> > >>> I don't know how dirty page tracking works exactly, but if it that can > >>> track direct writes to memory as easily as cached writes, it would > >>> probably cover this as well. > >> > >> I don't see why it shouldn't. Guest pages get mapped as read-only in > >> stage 2, and their memory attributes shouldn't be able to affect > >> permission checks (one can only hope). > >> > >> Unless I'm missing something fundamental, I think we should add > >> functionality in QEMU to clean+invalidate pages on aarch64 hosts and > >> see if that solves both VGA and migration, and if so, what the > >> potential performance impacts are. > >> > > > > Indeed. > > > > Identifying the framebuffer region is easy for QEMU, so there we can > > invalidate (without clean) selectively. > > Do you think there's a material win of doing invalidate rather than > clean+invalidate or could we just aim for "access coherent" == > "clean+invalidate" and we don't have to make any assumptions about how > the guest is accessing a particular piece of memory? > > > However, although we'd > > probably need to capture the state of the MMU bit anyway when handling > > the stage 2 fault for the dirty page tracking, I don't think we can > > generally infer whether a faulting access was made via an uncached > > mapping while the MMU was on, and so this would require > > clean+invalidate on all dirty pages when doing a migration. > > My gut feeling here is that there might be some very clever scheme to > optimize cache maintenance of the 'hot pages' when dirty page tracking > is enabled, but since the set of hot pages by virtue of migration > algorithms has to be small, the win is not likely to impact anything > real-life, and it's therefore not worth looking into. > Thanks, drew
On Thu, Feb 01, 2018 at 11:42:22AM +0100, Andrew Jones wrote: > On Thu, Feb 01, 2018 at 10:59:54AM +0100, Christoffer Dall wrote: > > On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel > > <ard.biesheuvel@linaro.org> wrote: > > > On 1 February 2018 at 09:17, Christoffer Dall > > > <christoffer.dall@linaro.org> wrote: > > >> On Wed, Jan 31, 2018 at 9:15 PM, Ard Biesheuvel > > >> <ard.biesheuvel@linaro.org> wrote: > > >>> On 31 January 2018 at 19:12, Christoffer Dall > > >>> <christoffer.dall@linaro.org> wrote: > > >>>> On Wed, Jan 31, 2018 at 7:00 PM, Ard Biesheuvel > > >>>> <ard.biesheuvel@linaro.org> wrote: > > >>>>> On 31 January 2018 at 17:39, Christoffer Dall > > >>>>> <christoffer.dall@linaro.org> wrote: > > >>>>>> On Wed, Jan 31, 2018 at 5:59 PM, Ard Biesheuvel > > >>>>>> <ard.biesheuvel@linaro.org> wrote: > > >>>>>>> On 31 January 2018 at 16:53, Christoffer Dall > > >>>>>>> <christoffer.dall@linaro.org> wrote: > > >>>>>>>> On Wed, Jan 31, 2018 at 4:18 PM, Ard Biesheuvel > > >>>>>>>> <ard.biesheuvel@linaro.org> wrote: > > >>>>>>>>> On 31 January 2018 at 09:53, Christoffer Dall > > >>>>>>>>> <christoffer.dall@linaro.org> wrote: > > >>>>>>>>>> On Mon, Jan 29, 2018 at 10:32:12AM +0000, Marc Zyngier wrote: > > >>>>>>>>>>> On 29/01/18 10:04, Peter Maydell wrote: > > >>>>>>>>>>> > On 29 January 2018 at 09:53, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > > >>>>>>>>>>> >> * Peter Maydell (peter.maydell@linaro.org) wrote: > > >>>>>>>>>>> >>> On 26 January 2018 at 19:46, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote: > > >>>>>>>>>>> >>>> * Peter Maydell (peter.maydell@linaro.org) wrote: > > >>>>>>>>>>> >>>>> I think the correct fix here is that your test code should turn > > >>>>>>>>>>> >>>>> its MMU on. Trying to treat guest RAM as uncacheable doesn't work > > >>>>>>>>>>> >>>>> for Arm KVM guests (for the same reason that VGA device video memory > > >>>>>>>>>>> >>>>> doesn't work). If it's RAM your guest has to arrange to map it as > > >>>>>>>>>>> >>>>> Normal Cacheable, and then everything should work fine. > > >>>>>>>>>>> >>>> > > >>>>>>>>>>> >>>> Does this cause problems with migrating at just the wrong point during > > >>>>>>>>>>> >>>> a VM boot? > > >>>>>>>>>>> >>> > > >>>>>>>>>>> >>> It wouldn't surprise me if it did, but I don't think I've ever > > >>>>>>>>>>> >>> tried to provoke that problem... > > >>>>>>>>>>> >> > > >>>>>>>>>>> >> If you think it'll get the RAM contents wrong, it might be best to fail > > >>>>>>>>>>> >> the migration if you can detect the cache is disabled in the guest. > > >>>>>>>>>>> > > > >>>>>>>>>>> > I guess QEMU could look at the value of the "MMU disabled/enabled" bit > > >>>>>>>>>>> > in the guest's system registers, and refuse migration if it's off... > > >>>>>>>>>>> > > > >>>>>>>>>>> > (cc'd Marc, Christoffer to check that I don't have the wrong end > > >>>>>>>>>>> > of the stick about how thin the ice is in the period before the > > >>>>>>>>>>> > guest turns on its MMU...) > > >>>>>>>>>>> > > >>>>>>>>>>> Once MMU and caches are on, we should be in a reasonable place for QEMU > > >>>>>>>>>>> to have a consistent view of the memory. The trick is to prevent the > > >>>>>>>>>>> vcpus from changing that. A guest could perfectly turn off its MMU at > > >>>>>>>>>>> any given time if it needs to (and it is actually required on some HW if > > >>>>>>>>>>> you want to mitigate headlining CVEs), and KVM won't know about that. > > >>>>>>>>>>> > > >>>>>>>>>> > > >>>>>>>>>> (Clarification: KVM can detect this is it bother to check the VCPU's > > >>>>>>>>>> system registers, but we don't trap to KVM when the VCPU turns off its > > >>>>>>>>>> caches, right?) > > >>>>>>>>>> > > >>>>>>>>>>> You may have to pause the vcpus before starting the migration, or > > >>>>>>>>>>> introduce a new KVM feature that would automatically pause a vcpu that > > >>>>>>>>>>> is trying to disable its MMU while the migration is on. This would > > >>>>>>>>>>> involve trapping all the virtual memory related system registers, with > > >>>>>>>>>>> an obvious cost. But that cost would be limited to the time it takes to > > >>>>>>>>>>> migrate the memory, so maybe that's acceptable. > > >>>>>>>>>>> > > >>>>>>>>>> Is that even sufficient? > > >>>>>>>>>> > > >>>>>>>>>> What if the following happened. (1) guest turns off MMU, (2) guest > > >>>>>>>>>> writes some data directly to ram (3) qemu stops the vcpu (4) qemu reads > > >>>>>>>>>> guest ram. QEMU's view of guest ram is now incorrect (stale, > > >>>>>>>>>> incoherent, ...). > > >>>>>>>>>> > > >>>>>>>>>> I'm also not really sure if pausing one VCPU because it turned off its > > >>>>>>>>>> MMU will go very well when trying to migrate a large VM (wouldn't this > > >>>>>>>>>> ask for all the other VCPUs beginning to complain that the stopped VCPU > > >>>>>>>>>> appears to be dead?). As a short-term 'fix' it's probably better to > > >>>>>>>>>> refuse migration if you detect that a VCPU had begun turning off its > > >>>>>>>>>> MMU. > > >>>>>>>>>> > > >>>>>>>>>> On the larger scale of thins; this appears to me to be another case of > > >>>>>>>>>> us really needing some way to coherently access memory between QEMU and > > >>>>>>>>>> the VM, but in the case of the VCPU turning off the MMU prior to > > >>>>>>>>>> migration, we don't even know where it may have written data, and I'm > > >>>>>>>>>> therefore not really sure what the 'proper' solution would be. > > >>>>>>>>>> > > >>>>>>>>>> (cc'ing Ard who has has thought about this problem before in the context > > >>>>>>>>>> of UEFI and VGA.) > > >>>>>>>>>> > > >>>>>>>>> > > >>>>>>>>> Actually, the VGA case is much simpler because the host is not > > >>>>>>>>> expected to write to the framebuffer, only read from it, and the guest > > >>>>>>>>> is not expected to create a cacheable mapping for it, so any > > >>>>>>>>> incoherency can be trivially solved by cache invalidation on the host > > >>>>>>>>> side. (Note that this has nothing to do with DMA coherency, but only > > >>>>>>>>> with PCI MMIO BARs that are backed by DRAM in the host) > > >>>>>>>> > > >>>>>>>> In case of the running guest, the host will also only read from the > > >>>>>>>> cached mapping. Of course, at restore, the host will also write > > >>>>>>>> through a cached mapping, but shouldn't the latter case be solvable by > > >>>>>>>> having KVM clean the cache lines when faulting in any page? > > >>>>>>>> > > >>>>>>> > > >>>>>>> We are still talking about the contents of the framebuffer, right? In > > >>>>>>> that case, yes, afaict > > >>>>>>> > > >>>>>> > > >>>>>> I was talking about normal RAM actually... not sure if that changes anything? > > >>>>>> > > >>>>> > > >>>>> The main difference is that with a framebuffer BAR, it is pointless > > >>>>> for the guest to map it cacheable, given that the purpose of a > > >>>>> framebuffer is its side effects, which are not guaranteed to occur > > >>>>> timely if the mapping is cacheable. > > >>>>> > > >>>>> If we are talking about normal RAM, then why are we discussing it here > > >>>>> and not down there? > > >>>>> > > >>>> > > >>>> Because I was trying to figure out how the challenge of accessing the > > >>>> VGA framebuffer differs from the challenge of accessing guest RAM > > >>>> which may have been written by the guest with the MMU off. > > >>>> > > >>>> First approximation, they are extremely similar because the guest is > > >>>> writing uncached to memory, which the host now has to access via a > > >>>> cached mapping. > > >>>> > > >>>> But I'm guessing that a "clean+invalidate before read on the host" > > >>>> solution will result in terrible performance for a framebuffer and > > >>>> therefore isn't a good solution for that problem... > > >>>> > > >>> > > >>> That highly depends on where 'not working' resides on the performance > > >>> scale. Currently, VGA on KVM simply does not work at all, and so > > >>> working but slow would be a huge improvement over the current > > >>> situation. > > >>> > > >>> Also, the performance hit is caused by the fact that the data needs to > > >>> make a round trip to memory, and the invalidation (without cleaning) > > >>> performed by the host shouldn't make that much worse than it > > >>> fundamentally is to begin with. > > >>> > > >> > > >> True, but Marc pointed out (on IRC) that the cache maintenance > > >> instructions are broadcast across all CPUs and may therefore adversely > > >> affect the performance of your entire system by running an emulated > > >> VGA framebuffer on a subset of the host CPUs. However, he also > > >> pointed out that the necessary cache maintenance can be done in EL0 > > I had some memory of this not being the case, but I just checked and > indeed 'dc civac' will work from el0 with sctlr_el1.uci=1. Note, I > see that the gcc builtin __builtin___clear_cache() does not use that > instruction (it uses 'dc cvau'), so we'd need to implement a wrapper > ourselves (maybe that's why I thought it wasn't available...) > > > >> and we wouldn't even need a new ioctl or anything else from KVM or the > > >> kernel to do this. I think we should measure the effect of this > > >> before dismissing it though. > > >> > > > > > > If you could use dirty page tracking to only perform the cache > > > invalidation when the framebuffer memory has been updated, you can at > > > least limit the impact to cases where the framebuffer is actually > > > used, rather than sitting idle with a nice wallpaper image. > > Yes, this is the exact approach I took back when I experimented with > this. I must have screwed up my PoC in some way (like using the gcc > builtin), because it wasn't working for me... > Does that mean you have some code you feel like reviving and use to send out an RFC based on? ;) -Christoffer
On Thu, Feb 01, 2018 at 11:48:31AM +0100, Christoffer Dall wrote: > On Thu, Feb 01, 2018 at 11:42:22AM +0100, Andrew Jones wrote: > > On Thu, Feb 01, 2018 at 10:59:54AM +0100, Christoffer Dall wrote: > > > On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel > > > > If you could use dirty page tracking to only perform the cache > > > > invalidation when the framebuffer memory has been updated, you can at > > > > least limit the impact to cases where the framebuffer is actually > > > > used, rather than sitting idle with a nice wallpaper image. > > > > Yes, this is the exact approach I took back when I experimented with > > this. I must have screwed up my PoC in some way (like using the gcc > > builtin), because it wasn't working for me... > > > Does that mean you have some code you feel like reviving and use to send > out an RFC based on? ;) > I don't usually delete things, but I do do some sort of copy on use from old homedirs to new ones from time to time, letting stuff that gets really old completely drop at some point. I might be able to find the old work, or just hack a quick and ugly version from my memory to share - hopefully whatever I did to cause it to fail last time has been forgotten :-) drew
On Thu, Feb 01, 2018 at 01:25:20PM +0100, Andrew Jones wrote: > On Thu, Feb 01, 2018 at 11:48:31AM +0100, Christoffer Dall wrote: > > On Thu, Feb 01, 2018 at 11:42:22AM +0100, Andrew Jones wrote: > > > On Thu, Feb 01, 2018 at 10:59:54AM +0100, Christoffer Dall wrote: > > > > On Thu, Feb 1, 2018 at 10:33 AM, Ard Biesheuvel > > > > > If you could use dirty page tracking to only perform the cache > > > > > invalidation when the framebuffer memory has been updated, you can at > > > > > least limit the impact to cases where the framebuffer is actually > > > > > used, rather than sitting idle with a nice wallpaper image. > > > > > > Yes, this is the exact approach I took back when I experimented with > > > this. I must have screwed up my PoC in some way (like using the gcc > > > builtin), because it wasn't working for me... > > > > > Does that mean you have some code you feel like reviving and use to send > > out an RFC based on? ;) > > > > I don't usually delete things, but I do do some sort of copy on use > from old homedirs to new ones from time to time, letting stuff that > gets really old completely drop at some point. I might be able to > find the old work, or just hack a quick and ugly version from my > memory to share - hopefully whatever I did to cause it to fail last > time has been forgotten :-) > If you have bandwidth to look at this that's great. Otherwise let us know, and I'll see if I can devote some time to it. Thanks, -Christoffer
* Wei Huang (wei@redhat.com) wrote: > > > On 01/25/2018 02:05 PM, Dr. David Alan Gilbert wrote: > > * Wei Huang (wei@redhat.com) wrote: > >> This patch adds the migration test support for aarch64. The test code, > >> which implements the same functionality as x86, is compiled into a binary > >> and booted as a kernel in qemu. Here are the design ideas: > >> > >> * We choose this -kernel design because aarch64 QEMU doesn't provide a > >> built-in fw like x86 does. So instead of relying on a boot loader, we > >> use -kernel approach for aarch64. > >> * The serial output is sent to PL011 directly. > >> * The physical memory base for mach-virt machine is 0x40000000. We change > >> the start_address and end_address for aarch64. > >> > >> RFC->V1: > >> * aarch64 kernel blob is defined as an uint32_t array > >> * The test code is re-written to address a data caching issue under KVM. > >> Tests passed under both x86 and aarch64. > >> * Re-use init_bootfile_x86() for both x86 and aarch64 > >> * Other minor fixes > >> > >> Note that the test code is as the following: > >> > >> .section .text > >> > >> .globl start > >> > >> start: > >> /* disable MMU to use phys mem address */ > >> mrs x0, sctlr_el1 > >> bic x0, x0, #(1<<0) > >> msr sctlr_el1, x0 > >> isb > >> > >> /* output char 'A' to PL011 */ > >> mov w4, #65 > >> mov x5, #0x9000000 > >> strb w4, [x5] > >> > >> /* w6 keeps a counter so we can limit the output speed */ > >> mov w6, #0 > >> > >> /* phys mem base addr = 0x40000000 */ > >> mov x3, #(0x40000000 + 100 *1024*1024) /* traverse 1M-100M */ > >> mov x2, #(0x40000000 + 1 * 1024*1024) > >> > >> /* clean up memory first */ > >> mov w1, #0 > >> clean: > >> strb w1, [x2] > >> add x2, x2, #(4096) > >> cmp x2, x3 > >> ble clean > >> > >> /* main body */ > >> mainloop: > >> mov x2, #(0x40000000 + 1 * 1024*1024) > >> > >> innerloop: > >> /* clean cache because el2 might still cache guest data under KVM */ > >> dc civac, x2 > > > > Can you explain a bit more about that please; if it's guest > > visible acorss migration, doesn't that suggest we need the cache clear > > in KVM or QEMU? > > I think this is ARM specific. This is caused by the inconsistency > between guest VM's data accesses and userspace's accesses (in > check_guest_ram) at the destination: > > 1) Because uncacheable (guest) + cacheable (host) ==> uncacheable. So > the data accesses from guest VM go directly into memory. > 2) QEMU user space will use the cacheable version. So it is possible > that data can come from cache, instead of RAM. The difference between > (1) and (2) obviously can cause check_guest_ram() to fail sometimes. > > x86's EPT has the capability to ignore guest-provided memory type. So it > is possible to have consistent data views between guest and QEMU user-space. I'll admit to not quite understanding where this thread ended up; it seems to have fallen into other ARM consistency questions. Other than that it looks fine to me, but see the separate patch I posted yesterday that includes the x86 source. Peter: What's your view on the ight fix for hte memory consistency? Dave > > > > > Dave > > > >> ldrb w1, [x2] > >> add w1, w1, #1 > >> and w1, w1, #(0xff) > >> strb w1, [x2] > >> > >> add x2, x2, #(4096) > >> cmp x2, x3 > >> blt innerloop > >> > >> add w6, w6, #1 > >> and w6, w6, #(0xff) > >> cmp w6, #0 > >> bne mainloop > >> > >> /* output char 'B' to PL011 */ > >> mov w4, #66 > >> mov x5, #0x9000000 > >> strb w4, [x5] > >> > >> bl mainloop > >> > >> The code is compiled with the following commands: > >> > gcc -c -o fill.o fill.s > >> > gcc -O2 -o fill.elf -Wl,-T,/tmp/flat.lds,--build-id=none,-Ttext=40080000 \ > >> -nostdlib fill.o > >> > objcopy -O binary fill.elf fill.flat > >> > truncate -c -s 144 ./fill.flat > >> > xxd -g 4 -c 24 -e fill.flat | awk '{print "0x"$2 ", " "0x"$3 ", " "0x"C$4 ", " "0x"C$5 ", " "0x"$6 ", " "0x"C$7 "," }' > >> > >> The linker file (borrowed from KVM unit test) is defined as: > >> > >> SECTIONS > >> { > >> .text : { *(.init) *(.text) *(.text.*) } > >> . = ALIGN(64K); > >> etext = .; > >> .data : { > >> *(.data) > >> } > >> . = ALIGN(16); > >> .rodata : { *(.rodata) } > >> . = ALIGN(16); > >> .bss : { *(.bss) } > >> . = ALIGN(64K); > >> edata = .; > >> . += 64K; > >> . = ALIGN(64K); > >> /* > >> * stack depth is 16K for arm and PAGE_SIZE for arm64, see THREAD_SIZE > >> * sp must be 16 byte aligned for arm64, and 8 byte aligned for arm > >> * sp must always be strictly less than the true stacktop > >> */ > >> stackptr = . - 16; > >> stacktop = .; > >> } > >> > >> ENTRY(start) > >> > >> Signed-off-by: Wei Huang <wei@redhat.com> > >> --- > >> tests/Makefile.include | 1 + > >> tests/migration-test.c | 37 +++++++++++++++++++++++++++++++------ > >> 2 files changed, 32 insertions(+), 6 deletions(-) > >> > >> diff --git a/tests/Makefile.include b/tests/Makefile.include > >> index b4bcc872f2..2a520e53ab 100644 > >> --- a/tests/Makefile.include > >> +++ b/tests/Makefile.include > >> @@ -357,6 +357,7 @@ check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF) > >> gcov-files-arm-y += hw/timer/arm_mptimer.c > >> > >> check-qtest-aarch64-y = tests/numa-test$(EXESUF) > >> +check-qtest-aarch64-y += tests/migration-test$(EXESUF) > >> > >> check-qtest-microblazeel-y = $(check-qtest-microblaze-y) > >> > >> diff --git a/tests/migration-test.c b/tests/migration-test.c > >> index be598d3257..3237fe93b2 100644 > >> --- a/tests/migration-test.c > >> +++ b/tests/migration-test.c > >> @@ -22,8 +22,8 @@ > >> > >> #define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */ > >> > >> -const unsigned start_address = 1024 * 1024; > >> -const unsigned end_address = 100 * 1024 * 1024; > >> +unsigned start_address = 1024 * 1024; > >> +unsigned end_address = 100 * 1024 * 1024; > >> bool got_stop; > >> > >> #if defined(__linux__) > >> @@ -79,7 +79,7 @@ static const char *tmpfs; > >> /* A simple PC boot sector that modifies memory (1-100MB) quickly > >> * outputing a 'B' every so often if it's still running. > >> */ > >> -unsigned char bootsect[] = { > >> +unsigned char x86_bootsect[] = { > >> 0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00, > >> 0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00, > >> 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02, > >> @@ -125,11 +125,20 @@ unsigned char bootsect[] = { > >> 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa > >> }; > >> > >> -static void init_bootfile_x86(const char *bootpath) > >> +uint32_t aarch64_kernel[] = { > >> + 0xd5381000, 0x927ff800, 0xd5181000, 0xd5033fdf, 0x52800824, 0xd2a12005, > >> + 0x390000a4, 0x52800006, 0xd2a8c803, 0xd2a80202, 0x52800001, 0x39000041, > >> + 0x91400442, 0xeb03005f, 0x54ffffad, 0xd2a80202, 0xd50b7e22, 0x39400041, > >> + 0x11000421, 0x12001c21, 0x39000041, 0x91400442, 0xeb03005f, 0x54ffff2b, > >> + 0x110004c6, 0x12001cc6, 0x710000df, 0x54fffe81, 0x52800844, 0xd2a12005, > >> + 0x390000a4, 0x97fffff0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, > >> +}; > >> + > >> +static void init_bootfile(const char *bootpath, void *content) > >> { > >> FILE *bootfile = fopen(bootpath, "wb"); > >> > >> - g_assert_cmpint(fwrite(bootsect, 512, 1, bootfile), ==, 1); > >> + g_assert_cmpint(fwrite(content, 512, 1, bootfile), ==, 1); > >> fclose(bootfile); > >> } > >> > >> @@ -442,7 +451,7 @@ static void test_migrate_start(QTestState **from, QTestState **to, > >> got_stop = false; > >> > >> if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) { > >> - init_bootfile_x86(bootpath); > >> + init_bootfile(bootpath, x86_bootsect); > >> cmd_src = g_strdup_printf("-machine accel=kvm:tcg -m 150M" > >> " -name pcsource,debug-threads=on" > >> " -serial file:%s/src_serial" > >> @@ -470,6 +479,22 @@ static void test_migrate_start(QTestState **from, QTestState **to, > >> " -serial file:%s/dest_serial" > >> " -incoming %s", > >> accel, tmpfs, uri); > >> + } else if (strcmp(arch, "aarch64") == 0) { > >> + init_bootfile(bootpath, aarch64_kernel); > >> + cmd_src = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M " > >> + "-name vmsource,debug-threads=on -cpu host " > >> + "-serial file:%s/src_serial " > >> + "-kernel %s ", > >> + tmpfs, bootpath); > >> + cmd_dst = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M " > >> + "-name vmdest,debug-threads=on -cpu host " > >> + "-serial file:%s/dest_serial " > >> + "-kernel %s " > >> + "-incoming %s ", > >> + tmpfs, bootpath, uri); > >> + /* aarch64 virt machine physical mem started from 0x40000000 */ > >> + start_address += 0x40000000; > >> + end_address += 0x40000000; > >> } else { > >> g_assert_not_reached(); > >> } > >> -- > >> 2.14.3 > >> > > -- > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK > > -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff --git a/tests/Makefile.include b/tests/Makefile.include index b4bcc872f2..2a520e53ab 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -357,6 +357,7 @@ check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF) gcov-files-arm-y += hw/timer/arm_mptimer.c check-qtest-aarch64-y = tests/numa-test$(EXESUF) +check-qtest-aarch64-y += tests/migration-test$(EXESUF) check-qtest-microblazeel-y = $(check-qtest-microblaze-y) diff --git a/tests/migration-test.c b/tests/migration-test.c index be598d3257..3237fe93b2 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -22,8 +22,8 @@ #define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */ -const unsigned start_address = 1024 * 1024; -const unsigned end_address = 100 * 1024 * 1024; +unsigned start_address = 1024 * 1024; +unsigned end_address = 100 * 1024 * 1024; bool got_stop; #if defined(__linux__) @@ -79,7 +79,7 @@ static const char *tmpfs; /* A simple PC boot sector that modifies memory (1-100MB) quickly * outputing a 'B' every so often if it's still running. */ -unsigned char bootsect[] = { +unsigned char x86_bootsect[] = { 0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02, @@ -125,11 +125,20 @@ unsigned char bootsect[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa }; -static void init_bootfile_x86(const char *bootpath) +uint32_t aarch64_kernel[] = { + 0xd5381000, 0x927ff800, 0xd5181000, 0xd5033fdf, 0x52800824, 0xd2a12005, + 0x390000a4, 0x52800006, 0xd2a8c803, 0xd2a80202, 0x52800001, 0x39000041, + 0x91400442, 0xeb03005f, 0x54ffffad, 0xd2a80202, 0xd50b7e22, 0x39400041, + 0x11000421, 0x12001c21, 0x39000041, 0x91400442, 0xeb03005f, 0x54ffff2b, + 0x110004c6, 0x12001cc6, 0x710000df, 0x54fffe81, 0x52800844, 0xd2a12005, + 0x390000a4, 0x97fffff0, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static void init_bootfile(const char *bootpath, void *content) { FILE *bootfile = fopen(bootpath, "wb"); - g_assert_cmpint(fwrite(bootsect, 512, 1, bootfile), ==, 1); + g_assert_cmpint(fwrite(content, 512, 1, bootfile), ==, 1); fclose(bootfile); } @@ -442,7 +451,7 @@ static void test_migrate_start(QTestState **from, QTestState **to, got_stop = false; if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) { - init_bootfile_x86(bootpath); + init_bootfile(bootpath, x86_bootsect); cmd_src = g_strdup_printf("-machine accel=kvm:tcg -m 150M" " -name pcsource,debug-threads=on" " -serial file:%s/src_serial" @@ -470,6 +479,22 @@ static void test_migrate_start(QTestState **from, QTestState **to, " -serial file:%s/dest_serial" " -incoming %s", accel, tmpfs, uri); + } else if (strcmp(arch, "aarch64") == 0) { + init_bootfile(bootpath, aarch64_kernel); + cmd_src = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M " + "-name vmsource,debug-threads=on -cpu host " + "-serial file:%s/src_serial " + "-kernel %s ", + tmpfs, bootpath); + cmd_dst = g_strdup_printf("-machine virt,accel=kvm:tcg -m 150M " + "-name vmdest,debug-threads=on -cpu host " + "-serial file:%s/dest_serial " + "-kernel %s " + "-incoming %s ", + tmpfs, bootpath, uri); + /* aarch64 virt machine physical mem started from 0x40000000 */ + start_address += 0x40000000; + end_address += 0x40000000; } else { g_assert_not_reached(); }