Message ID | 20210318174611.293520-4-andrey.gruzdev@virtuozzo.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | migration: Fixes to the 'background-snapshot' code | expand |
> +/* > + * ram_block_populate_pages: populate memory in the RAM block by reading > + * an integer from the beginning of each page. > + * > + * Since it's solely used for userfault_fd WP feature, here we just > + * hardcode page size to TARGET_PAGE_SIZE. > + * > + * @bs: RAM block to populate > + */ > +volatile int ram_block_populate_pages__tmp; > +static void ram_block_populate_pages(RAMBlock *bs) > +{ > + ram_addr_t offset = 0; > + int tmp = 0; > + > + for (char *ptr = (char *) bs->host; offset < bs->used_length; > + ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) { You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE > + /* Try to do it without memory writes */ > + tmp += *(volatile int *) ptr; > + } The following is slightly simpler and doesn't rely on volatile semantics [1]. Should work on any arch I guess. static void ram_block_populate_pages(RAMBlock *bs) { char *ptr = (char *) bs->host; ram_addr_t offset; for (offset = 0; offset < bs->used_length; offset += qemu_real_host_page_size) { char tmp = *(volatile char *)(ptr + offset) /* Don't optimize the read out. */ asm volatile ("" : "+r" (tmp)); } Compiles to for (offset = 0; offset < bs->used_length; 316d: 48 8b 4b 30 mov 0x30(%rbx),%rcx char *ptr = (char *) bs->host; 3171: 48 8b 73 18 mov 0x18(%rbx),%rsi for (offset = 0; offset < bs->used_length; 3175: 48 85 c9 test %rcx,%rcx 3178: 74 ce je 3148 <ram_write_tracking_prepare+0x58> offset += qemu_real_host_page_size) { 317a: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 3181 <ram_write_tracking_prepare+0x91> 3181: 48 8b 38 mov (%rax),%rdi 3184: 0f 1f 40 00 nopl 0x0(%rax) char tmp = *(volatile char *)(ptr + offset); 3188: 48 8d 04 16 lea (%rsi,%rdx,1),%rax 318c: 0f b6 00 movzbl (%rax),%eax offset += qemu_real_host_page_size) { 318f: 48 01 fa add %rdi,%rdx for (offset = 0; offset < bs->used_length; 3192: 48 39 ca cmp %rcx,%rdx 3195: 72 f1 jb 3188 <ram_write_tracking_prepare+0x98> [1] https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/ I'll send patches soon to take care of virtio-mem via RamDiscardManager - to skip populating the parts that are supposed to remain discarded and not migrated. Unfortunately, the RamDiscardManager patches are still stuck waiting for acks ... and now we're in soft-freeze.
On 19.03.21 10:28, David Hildenbrand wrote: >> +/* >> + * ram_block_populate_pages: populate memory in the RAM block by reading >> + * an integer from the beginning of each page. >> + * >> + * Since it's solely used for userfault_fd WP feature, here we just >> + * hardcode page size to TARGET_PAGE_SIZE. >> + * >> + * @bs: RAM block to populate >> + */ >> +volatile int ram_block_populate_pages__tmp; >> +static void ram_block_populate_pages(RAMBlock *bs) >> +{ >> + ram_addr_t offset = 0; >> + int tmp = 0; >> + >> + for (char *ptr = (char *) bs->host; offset < bs->used_length; >> + ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) { > > You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE > >> + /* Try to do it without memory writes */ >> + tmp += *(volatile int *) ptr; >> + } > > > The following is slightly simpler and doesn't rely on volatile semantics [1]. > Should work on any arch I guess. > > static void ram_block_populate_pages(RAMBlock *bs) > { > char *ptr = (char *) bs->host; > ram_addr_t offset; > > for (offset = 0; offset < bs->used_length; > offset += qemu_real_host_page_size) { > char tmp = *(volatile char *)(ptr + offset) I wanted to do a "= *(ptr + offset)" here. > > /* Don't optimize the read out. */ > asm volatile ("" : "+r" (tmp)); So this is the only volatile thing that the compiler must guarantee to not optimize away.
On 19.03.2021 12:28, David Hildenbrand wrote: >> +/* >> + * ram_block_populate_pages: populate memory in the RAM block by >> reading >> + * an integer from the beginning of each page. >> + * >> + * Since it's solely used for userfault_fd WP feature, here we just >> + * hardcode page size to TARGET_PAGE_SIZE. >> + * >> + * @bs: RAM block to populate >> + */ >> +volatile int ram_block_populate_pages__tmp; >> +static void ram_block_populate_pages(RAMBlock *bs) >> +{ >> + ram_addr_t offset = 0; >> + int tmp = 0; >> + >> + for (char *ptr = (char *) bs->host; offset < bs->used_length; >> + ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) { > > You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE > Ok. >> + /* Try to do it without memory writes */ >> + tmp += *(volatile int *) ptr; >> + } > > > The following is slightly simpler and doesn't rely on volatile > semantics [1]. > Should work on any arch I guess. > > static void ram_block_populate_pages(RAMBlock *bs) > { > char *ptr = (char *) bs->host; > ram_addr_t offset; > > for (offset = 0; offset < bs->used_length; > offset += qemu_real_host_page_size) { > char tmp = *(volatile char *)(ptr + offset) > > /* Don't optimize the read out. */ > asm volatile ("" : "+r" (tmp)); > } > Thanks, good option, I'll change the code. > Compiles to > > for (offset = 0; offset < bs->used_length; > 316d: 48 8b 4b 30 mov 0x30(%rbx),%rcx > char *ptr = (char *) bs->host; > 3171: 48 8b 73 18 mov 0x18(%rbx),%rsi > for (offset = 0; offset < bs->used_length; > 3175: 48 85 c9 test %rcx,%rcx > 3178: 74 ce je 3148 > <ram_write_tracking_prepare+0x58> > offset += qemu_real_host_page_size) { > 317a: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # > 3181 <ram_write_tracking_prepare+0x91> > 3181: 48 8b 38 mov (%rax),%rdi > 3184: 0f 1f 40 00 nopl 0x0(%rax) > char tmp = *(volatile char *)(ptr + offset); > 3188: 48 8d 04 16 lea (%rsi,%rdx,1),%rax > 318c: 0f b6 00 movzbl (%rax),%eax > offset += qemu_real_host_page_size) { > 318f: 48 01 fa add %rdi,%rdx > for (offset = 0; offset < bs->used_length; > 3192: 48 39 ca cmp %rcx,%rdx > 3195: 72 f1 jb 3188 > <ram_write_tracking_prepare+0x98> > > > [1] > https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/ > > > I'll send patches soon to take care of virtio-mem via RamDiscardManager - > to skip populating the parts that are supposed to remain discarded and > not migrated. > Unfortunately, the RamDiscardManager patches are still stuck waiting for > acks ... and now we're in soft-freeze. > RamDiscardManager patches - do they also modify migration code? I mean which part is responsible of not migrating discarded ranges.
On 19.03.2021 12:32, David Hildenbrand wrote: > On 19.03.21 10:28, David Hildenbrand wrote: >>> +/* >>> + * ram_block_populate_pages: populate memory in the RAM block by >>> reading >>> + * an integer from the beginning of each page. >>> + * >>> + * Since it's solely used for userfault_fd WP feature, here we just >>> + * hardcode page size to TARGET_PAGE_SIZE. >>> + * >>> + * @bs: RAM block to populate >>> + */ >>> +volatile int ram_block_populate_pages__tmp; >>> +static void ram_block_populate_pages(RAMBlock *bs) >>> +{ >>> + ram_addr_t offset = 0; >>> + int tmp = 0; >>> + >>> + for (char *ptr = (char *) bs->host; offset < bs->used_length; >>> + ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) { >> >> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE >> >>> + /* Try to do it without memory writes */ >>> + tmp += *(volatile int *) ptr; >>> + } >> >> >> The following is slightly simpler and doesn't rely on volatile >> semantics [1]. >> Should work on any arch I guess. >> >> static void ram_block_populate_pages(RAMBlock *bs) >> { >> char *ptr = (char *) bs->host; >> ram_addr_t offset; >> >> for (offset = 0; offset < bs->used_length; >> offset += qemu_real_host_page_size) { >> char tmp = *(volatile char *)(ptr + offset) > > I wanted to do a "= *(ptr + offset)" here. > Yep >> >> /* Don't optimize the read out. */ >> asm volatile ("" : "+r" (tmp)); > > So this is the only volatile thing that the compiler must guarantee to > not optimize away. > >
On 19.03.21 12:05, Andrey Gruzdev wrote: > On 19.03.2021 12:28, David Hildenbrand wrote: >>> +/* >>> + * ram_block_populate_pages: populate memory in the RAM block by >>> reading >>> + * an integer from the beginning of each page. >>> + * >>> + * Since it's solely used for userfault_fd WP feature, here we just >>> + * hardcode page size to TARGET_PAGE_SIZE. >>> + * >>> + * @bs: RAM block to populate >>> + */ >>> +volatile int ram_block_populate_pages__tmp; >>> +static void ram_block_populate_pages(RAMBlock *bs) >>> +{ >>> + ram_addr_t offset = 0; >>> + int tmp = 0; >>> + >>> + for (char *ptr = (char *) bs->host; offset < bs->used_length; >>> + ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) { >> >> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE >> > Ok. >>> + /* Try to do it without memory writes */ >>> + tmp += *(volatile int *) ptr; >>> + } >> >> >> The following is slightly simpler and doesn't rely on volatile >> semantics [1]. >> Should work on any arch I guess. >> >> static void ram_block_populate_pages(RAMBlock *bs) >> { >> char *ptr = (char *) bs->host; >> ram_addr_t offset; >> >> for (offset = 0; offset < bs->used_length; >> offset += qemu_real_host_page_size) { >> char tmp = *(volatile char *)(ptr + offset) >> >> /* Don't optimize the read out. */ >> asm volatile ("" : "+r" (tmp)); >> } >> > Thanks, good option, I'll change the code. > >> Compiles to >> >> for (offset = 0; offset < bs->used_length; >> 316d: 48 8b 4b 30 mov 0x30(%rbx),%rcx >> char *ptr = (char *) bs->host; >> 3171: 48 8b 73 18 mov 0x18(%rbx),%rsi >> for (offset = 0; offset < bs->used_length; >> 3175: 48 85 c9 test %rcx,%rcx >> 3178: 74 ce je 3148 >> <ram_write_tracking_prepare+0x58> >> offset += qemu_real_host_page_size) { >> 317a: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # >> 3181 <ram_write_tracking_prepare+0x91> >> 3181: 48 8b 38 mov (%rax),%rdi >> 3184: 0f 1f 40 00 nopl 0x0(%rax) >> char tmp = *(volatile char *)(ptr + offset); >> 3188: 48 8d 04 16 lea (%rsi,%rdx,1),%rax >> 318c: 0f b6 00 movzbl (%rax),%eax >> offset += qemu_real_host_page_size) { >> 318f: 48 01 fa add %rdi,%rdx >> for (offset = 0; offset < bs->used_length; >> 3192: 48 39 ca cmp %rcx,%rdx >> 3195: 72 f1 jb 3188 >> <ram_write_tracking_prepare+0x98> >> >> >> [1] >> https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/ >> >> >> I'll send patches soon to take care of virtio-mem via RamDiscardManager - >> to skip populating the parts that are supposed to remain discarded and >> not migrated. >> Unfortunately, the RamDiscardManager patches are still stuck waiting for >> acks ... and now we're in soft-freeze. >> > RamDiscardManager patches - do they also modify migration code? > I mean which part is responsible of not migrating discarded ranges. I haven't shared relevant patches yet that touch migration code. I'm planning on doing that once the generic RamDiscardManager has all relevant acks. I'll put you on cc.
On 19.03.2021 14:27, David Hildenbrand wrote: > On 19.03.21 12:05, Andrey Gruzdev wrote: >> On 19.03.2021 12:28, David Hildenbrand wrote: >>>> +/* >>>> + * ram_block_populate_pages: populate memory in the RAM block by >>>> reading >>>> + * an integer from the beginning of each page. >>>> + * >>>> + * Since it's solely used for userfault_fd WP feature, here we just >>>> + * hardcode page size to TARGET_PAGE_SIZE. >>>> + * >>>> + * @bs: RAM block to populate >>>> + */ >>>> +volatile int ram_block_populate_pages__tmp; >>>> +static void ram_block_populate_pages(RAMBlock *bs) >>>> +{ >>>> + ram_addr_t offset = 0; >>>> + int tmp = 0; >>>> + >>>> + for (char *ptr = (char *) bs->host; offset < bs->used_length; >>>> + ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) { >>> >>> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE >>> >> Ok. >>>> + /* Try to do it without memory writes */ >>>> + tmp += *(volatile int *) ptr; >>>> + } >>> >>> >>> The following is slightly simpler and doesn't rely on volatile >>> semantics [1]. >>> Should work on any arch I guess. >>> >>> static void ram_block_populate_pages(RAMBlock *bs) >>> { >>> char *ptr = (char *) bs->host; >>> ram_addr_t offset; >>> >>> for (offset = 0; offset < bs->used_length; >>> offset += qemu_real_host_page_size) { >>> char tmp = *(volatile char *)(ptr + offset) >>> >>> /* Don't optimize the read out. */ >>> asm volatile ("" : "+r" (tmp)); >>> } >>> >> Thanks, good option, I'll change the code. >> >>> Compiles to >>> >>> for (offset = 0; offset < bs->used_length; >>> 316d: 48 8b 4b 30 mov 0x30(%rbx),%rcx >>> char *ptr = (char *) bs->host; >>> 3171: 48 8b 73 18 mov 0x18(%rbx),%rsi >>> for (offset = 0; offset < bs->used_length; >>> 3175: 48 85 c9 test %rcx,%rcx >>> 3178: 74 ce je 3148 >>> <ram_write_tracking_prepare+0x58> >>> offset += qemu_real_host_page_size) { >>> 317a: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # >>> 3181 <ram_write_tracking_prepare+0x91> >>> 3181: 48 8b 38 mov (%rax),%rdi >>> 3184: 0f 1f 40 00 nopl 0x0(%rax) >>> char tmp = *(volatile char *)(ptr + offset); >>> 3188: 48 8d 04 16 lea (%rsi,%rdx,1),%rax >>> 318c: 0f b6 00 movzbl (%rax),%eax >>> offset += qemu_real_host_page_size) { >>> 318f: 48 01 fa add %rdi,%rdx >>> for (offset = 0; offset < bs->used_length; >>> 3192: 48 39 ca cmp %rcx,%rdx >>> 3195: 72 f1 jb 3188 >>> <ram_write_tracking_prepare+0x98> >>> >>> >>> [1] >>> https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/ >>> >>> >>> >>> I'll send patches soon to take care of virtio-mem via >>> RamDiscardManager - >>> to skip populating the parts that are supposed to remain discarded and >>> not migrated. >>> Unfortunately, the RamDiscardManager patches are still stuck waiting >>> for >>> acks ... and now we're in soft-freeze. >>> >> RamDiscardManager patches - do they also modify migration code? >> I mean which part is responsible of not migrating discarded ranges. > > I haven't shared relevant patches yet that touch migration code. I'm > planning on doing that once the generic RamDiscardManager has all > relevant acks. I'll put you on cc. > Got it, thanks.
diff --git a/migration/migration.c b/migration/migration.c index 656d6249a6..496e88cbda 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3872,6 +3872,12 @@ static void *bg_migration_thread(void *opaque) update_iteration_initial_status(s); + /* + * Prepare for tracking memory writes with UFFD-WP - populate + * RAM pages before protecting. + */ + ram_write_tracking_prepare(); + qemu_savevm_state_header(s->to_dst_file); qemu_savevm_state_setup(s->to_dst_file); diff --git a/migration/ram.c b/migration/ram.c index 52537f14ac..825eb80030 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1560,6 +1560,57 @@ out: return ret; } +/* + * ram_block_populate_pages: populate memory in the RAM block by reading + * an integer from the beginning of each page. + * + * Since it's solely used for userfault_fd WP feature, here we just + * hardcode page size to TARGET_PAGE_SIZE. + * + * @bs: RAM block to populate + */ +volatile int ram_block_populate_pages__tmp; +static void ram_block_populate_pages(RAMBlock *bs) +{ + ram_addr_t offset = 0; + int tmp = 0; + + for (char *ptr = (char *) bs->host; offset < bs->used_length; + ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) { + /* Try to do it without memory writes */ + tmp += *(volatile int *) ptr; + } + /* Create dependency on 'extern volatile int' to avoid optimizing out */ + ram_block_populate_pages__tmp += tmp; +} + +/* + * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking + */ +void ram_write_tracking_prepare(void) +{ + RAMBlock *bs; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + + /* + * Populate pages of the RAM block before enabling userfault_fd + * write protection. + * + * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with + * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip + * pages with pte_none() entries in page table. + */ + ram_block_populate_pages(bs); + } +} + /* * ram_write_tracking_start: start UFFD-WP memory tracking * diff --git a/migration/ram.h b/migration/ram.h index 6378bb3ebc..4833e9fd5b 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -82,6 +82,7 @@ void colo_incoming_start_dirty_log(void); /* Background snapshot */ bool ram_write_tracking_available(void); bool ram_write_tracking_compatible(void); +void ram_write_tracking_prepare(void); int ram_write_tracking_start(void); void ram_write_tracking_stop(void);
This commit solves the issue with userfault_fd WP feature that background snapshot is based on. For any never poluated or discarded memory page, the UFFDIO_WRITEPROTECT ioctl() would skip updating PTE for that page, thereby loosing WP setting for it. So we need to pre-fault pages for each RAM block to be protected before making a userfault_fd wr-protect ioctl(). Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> --- migration/migration.c | 6 +++++ migration/ram.c | 51 +++++++++++++++++++++++++++++++++++++++++++ migration/ram.h | 1 + 3 files changed, 58 insertions(+)