Message ID | 20210106152120.31279-5-andrey.gruzdev@virtuozzo.com (mailing list archive)
---|---
State | New, archived
Series | UFFD write-tracking migration/snapshots
* Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > Introducing implementation of 'background' snapshot thread > which in overall follows the logic of precopy migration > while internally utilizes completely different mechanism > to 'freeze' vmstate at the start of snapshot creation. > > This mechanism is based on userfault_fd with wr-protection > support and is Linux-specific. I noticed there weren't any bdrv_ calls in here; I guess with a snapshot you still have the source running so still have it accessing the disk; do you do anything to try and wire the ram snapshotting up to disk snapshotting? > Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> > Acked-by: Peter Xu <peterx@redhat.com> > --- > migration/migration.c | 255 +++++++++++++++++++++++++++++++++++++++++- > migration/migration.h | 3 + > migration/ram.c | 2 + > migration/savevm.c | 1 - > migration/savevm.h | 2 + > 5 files changed, 260 insertions(+), 3 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index 2c2cb9ef01..0901a15ac5 100644 <snip> > - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, > - QEMU_THREAD_JOINABLE); > + > + if (migrate_background_snapshot()) { > + qemu_thread_create(&s->thread, "background_snapshot", Unfortunately that wont work - there's a 14 character limit on the thread name length; I guess we just shorten that to bg_snapshot Other than that, Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > + bg_migration_thread, s, QEMU_THREAD_JOINABLE); > + } else { > + qemu_thread_create(&s->thread, "live_migration", > + migration_thread, s, QEMU_THREAD_JOINABLE); > + } > s->migration_thread_running = true; > } > > diff --git a/migration/migration.h b/migration/migration.h > index f40338cfbf..0723955cd7 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -20,6 +20,7 @@ > #include "qemu/thread.h" > #include "qemu/coroutine_int.h" > #include "io/channel.h" > +#include "io/channel-buffer.h" > #include "net/announce.h" > #include "qom/object.h" > > @@ -147,8 +148,10 @@ struct MigrationState { > > /*< public >*/ > QemuThread thread; > + QEMUBH *vm_start_bh; > QEMUBH *cleanup_bh; > QEMUFile *to_dst_file; > + QIOChannelBuffer *bioc; > /* > * Protects to_dst_file pointer. We need to make sure we won't > * yield or hang during the critical section, since this lock will > diff --git a/migration/ram.c b/migration/ram.c > index 5707382db1..05fe0c8592 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) > page_address = (void *) uffd_msg.arg.pagefault.address; > bs = qemu_ram_block_from_host(page_address, false, offset); > assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); > + > return bs; > } > #endif /* CONFIG_LINUX */ > @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, > /* Un-protect memory range. */ > res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, > false, false); > + > /* We don't want to override existing error from ram_save_host_page(). 
*/ > if (res < 0 && *res_override >= 0) { > *res_override = res; > diff --git a/migration/savevm.c b/migration/savevm.c > index 27e842812e..dd4ad0aaaf 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) > return 0; > } > > -static > int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > bool in_postcopy, > bool inactivate_disks) > diff --git a/migration/savevm.h b/migration/savevm.h > index ba64a7e271..aaee2528ed 100644 > --- a/migration/savevm.h > +++ b/migration/savevm.h > @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); > void qemu_loadvm_state_cleanup(void); > int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); > int qemu_load_device_state(QEMUFile *f); > +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > + bool in_postcopy, bool inactivate_disks); > > #endif > -- > 2.25.1 >
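For reference, the thread-name limit Dave mentions comes from pthread_setname_np(), which QEMU uses (via qemu_thread_set_name()) when it creates a named thread on Linux: the name must fit in 16 bytes including the terminating NUL. A minimal standalone sketch — not QEMU code — showing why "background_snapshot" is rejected while "bg_snapshot" fits (build with `gcc -pthread`):

```c
/*
 * Standalone demo (not QEMU code): pthread_setname_np() on Linux rejects
 * names that do not fit in 16 bytes including the NUL, returning ERANGE,
 * so "background_snapshot" fails while "bg_snapshot" succeeds.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static void try_name(const char *name)
{
    int err = pthread_setname_np(pthread_self(), name);

    printf("%-20s -> %s\n", name, err ? strerror(err) : "OK");
}

int main(void)
{
    try_name("background_snapshot");  /* 19 characters: ERANGE */
    try_name("bg_snapshot");          /* 11 characters: OK     */
    return 0;
}
```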
On 19.01.2021 21:49, Dr. David Alan Gilbert wrote: > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >> Introducing implementation of 'background' snapshot thread >> which in overall follows the logic of precopy migration >> while internally utilizes completely different mechanism >> to 'freeze' vmstate at the start of snapshot creation. >> >> This mechanism is based on userfault_fd with wr-protection >> support and is Linux-specific. > I noticed there weren't any bdrv_ calls in here; I guess with a snapshot > you still have the source running so still have it accessing the disk; > do you do anything to try and wire the ram snapshotting up to disk > snapshotting? Block-related manipulations should be done externally, I think. So create backing images for RW nodes, then stop VM, switch block graph and start background snapshot. Something like create 'virsh snapshot-create-as' does, but in other sequence. // >> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> >> Acked-by: Peter Xu <peterx@redhat.com> >> --- >> migration/migration.c | 255 +++++++++++++++++++++++++++++++++++++++++- >> migration/migration.h | 3 + >> migration/ram.c | 2 + >> migration/savevm.c | 1 - >> migration/savevm.h | 2 + >> 5 files changed, 260 insertions(+), 3 deletions(-) >> >> diff --git a/migration/migration.c b/migration/migration.c >> index 2c2cb9ef01..0901a15ac5 100644 > <snip> > >> - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, >> - QEMU_THREAD_JOINABLE); >> + >> + if (migrate_background_snapshot()) { >> + qemu_thread_create(&s->thread, "background_snapshot", > Unfortunately that wont work - there's a 14 character limit on > the thread name length; I guess we just shorten that to bg_snapshot Yep, missed that pthread_set_name_np() has a length limit) > Other than that, > > > > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > >> + bg_migration_thread, s, QEMU_THREAD_JOINABLE); >> + } else { >> + qemu_thread_create(&s->thread, "live_migration", >> + migration_thread, s, QEMU_THREAD_JOINABLE); >> + } >> s->migration_thread_running = true; >> } >> >> diff --git a/migration/migration.h b/migration/migration.h >> index f40338cfbf..0723955cd7 100644 >> --- a/migration/migration.h >> +++ b/migration/migration.h >> @@ -20,6 +20,7 @@ >> #include "qemu/thread.h" >> #include "qemu/coroutine_int.h" >> #include "io/channel.h" >> +#include "io/channel-buffer.h" >> #include "net/announce.h" >> #include "qom/object.h" >> >> @@ -147,8 +148,10 @@ struct MigrationState { >> >> /*< public >*/ >> QemuThread thread; >> + QEMUBH *vm_start_bh; >> QEMUBH *cleanup_bh; >> QEMUFile *to_dst_file; >> + QIOChannelBuffer *bioc; >> /* >> * Protects to_dst_file pointer. We need to make sure we won't >> * yield or hang during the critical section, since this lock will >> diff --git a/migration/ram.c b/migration/ram.c >> index 5707382db1..05fe0c8592 100644 >> --- a/migration/ram.c >> +++ b/migration/ram.c >> @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) >> page_address = (void *) uffd_msg.arg.pagefault.address; >> bs = qemu_ram_block_from_host(page_address, false, offset); >> assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); >> + >> return bs; >> } >> #endif /* CONFIG_LINUX */ >> @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, >> /* Un-protect memory range. 
*/ >> res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, >> false, false); >> + >> /* We don't want to override existing error from ram_save_host_page(). */ >> if (res < 0 && *res_override >= 0) { >> *res_override = res; >> diff --git a/migration/savevm.c b/migration/savevm.c >> index 27e842812e..dd4ad0aaaf 100644 >> --- a/migration/savevm.c >> +++ b/migration/savevm.c >> @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) >> return 0; >> } >> >> -static >> int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >> bool in_postcopy, >> bool inactivate_disks) >> diff --git a/migration/savevm.h b/migration/savevm.h >> index ba64a7e271..aaee2528ed 100644 >> --- a/migration/savevm.h >> +++ b/migration/savevm.h >> @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); >> void qemu_loadvm_state_cleanup(void); >> int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); >> int qemu_load_device_state(QEMUFile *f); >> +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >> + bool in_postcopy, bool inactivate_disks); >> >> #endif >> -- >> 2.25.1 >>
* Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > On 19.01.2021 21:49, Dr. David Alan Gilbert wrote: > > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > > > Introducing implementation of 'background' snapshot thread > > > which in overall follows the logic of precopy migration > > > while internally utilizes completely different mechanism > > > to 'freeze' vmstate at the start of snapshot creation. > > > > > > This mechanism is based on userfault_fd with wr-protection > > > support and is Linux-specific. > > I noticed there weren't any bdrv_ calls in here; I guess with a snapshot > > you still have the source running so still have it accessing the disk; > > do you do anything to try and wire the ram snapshotting up to disk > > snapshotting? > > Block-related manipulations should be done externally, I think. > So create backing images for RW nodes, then stop VM, switch block graph > and start background snapshot. Something like create 'virsh snapshot-create-as' > does, but in other sequence. If you get a chance it would be great if you could put together an example of doing the combination RAM+block; that way we find out if there's anything silly missing. Dave > // > > > > Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> > > > Acked-by: Peter Xu <peterx@redhat.com> > > > --- > > > migration/migration.c | 255 +++++++++++++++++++++++++++++++++++++++++- > > > migration/migration.h | 3 + > > > migration/ram.c | 2 + > > > migration/savevm.c | 1 - > > > migration/savevm.h | 2 + > > > 5 files changed, 260 insertions(+), 3 deletions(-) > > > > > > diff --git a/migration/migration.c b/migration/migration.c > > > index 2c2cb9ef01..0901a15ac5 100644 > > <snip> > > > > > - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, > > > - QEMU_THREAD_JOINABLE); > > > + > > > + if (migrate_background_snapshot()) { > > > + qemu_thread_create(&s->thread, "background_snapshot", > > Unfortunately that wont work - there's a 14 character limit on > > the thread name length; I guess we just shorten that to bg_snapshot > > Yep, missed that pthread_set_name_np() has a length limit) > > > Other than that, > > > > > > > > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > > > > > + bg_migration_thread, s, QEMU_THREAD_JOINABLE); > > > + } else { > > > + qemu_thread_create(&s->thread, "live_migration", > > > + migration_thread, s, QEMU_THREAD_JOINABLE); > > > + } > > > s->migration_thread_running = true; > > > } > > > diff --git a/migration/migration.h b/migration/migration.h > > > index f40338cfbf..0723955cd7 100644 > > > --- a/migration/migration.h > > > +++ b/migration/migration.h > > > @@ -20,6 +20,7 @@ > > > #include "qemu/thread.h" > > > #include "qemu/coroutine_int.h" > > > #include "io/channel.h" > > > +#include "io/channel-buffer.h" > > > #include "net/announce.h" > > > #include "qom/object.h" > > > @@ -147,8 +148,10 @@ struct MigrationState { > > > /*< public >*/ > > > QemuThread thread; > > > + QEMUBH *vm_start_bh; > > > QEMUBH *cleanup_bh; > > > QEMUFile *to_dst_file; > > > + QIOChannelBuffer *bioc; > > > /* > > > * Protects to_dst_file pointer. 
We need to make sure we won't > > > * yield or hang during the critical section, since this lock will > > > diff --git a/migration/ram.c b/migration/ram.c > > > index 5707382db1..05fe0c8592 100644 > > > --- a/migration/ram.c > > > +++ b/migration/ram.c > > > @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) > > > page_address = (void *) uffd_msg.arg.pagefault.address; > > > bs = qemu_ram_block_from_host(page_address, false, offset); > > > assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); > > > + > > > return bs; > > > } > > > #endif /* CONFIG_LINUX */ > > > @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, > > > /* Un-protect memory range. */ > > > res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, > > > false, false); > > > + > > > /* We don't want to override existing error from ram_save_host_page(). */ > > > if (res < 0 && *res_override >= 0) { > > > *res_override = res; > > > diff --git a/migration/savevm.c b/migration/savevm.c > > > index 27e842812e..dd4ad0aaaf 100644 > > > --- a/migration/savevm.c > > > +++ b/migration/savevm.c > > > @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) > > > return 0; > > > } > > > -static > > > int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > > > bool in_postcopy, > > > bool inactivate_disks) > > > diff --git a/migration/savevm.h b/migration/savevm.h > > > index ba64a7e271..aaee2528ed 100644 > > > --- a/migration/savevm.h > > > +++ b/migration/savevm.h > > > @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); > > > void qemu_loadvm_state_cleanup(void); > > > int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); > > > int qemu_load_device_state(QEMUFile *f); > > > +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > > > + bool in_postcopy, bool inactivate_disks); > > > #endif > > > -- > > > 2.25.1 > > > > > -- > Andrey Gruzdev, Principal Engineer > Virtuozzo GmbH +7-903-247-6397 > virtuzzo.com >
On 21.01.2021 12:56, Dr. David Alan Gilbert wrote: > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >> On 19.01.2021 21:49, Dr. David Alan Gilbert wrote: >>> * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >>>> Introducing implementation of 'background' snapshot thread >>>> which in overall follows the logic of precopy migration >>>> while internally utilizes completely different mechanism >>>> to 'freeze' vmstate at the start of snapshot creation. >>>> >>>> This mechanism is based on userfault_fd with wr-protection >>>> support and is Linux-specific. >>> I noticed there weren't any bdrv_ calls in here; I guess with a snapshot >>> you still have the source running so still have it accessing the disk; >>> do you do anything to try and wire the ram snapshotting up to disk >>> snapshotting? >> Block-related manipulations should be done externally, I think. >> So create backing images for RW nodes, then stop VM, switch block graph >> and start background snapshot. Something like create 'virsh snapshot-create-as' >> does, but in other sequence. > If you get a chance it would be great if you could put together an > example of doing the combination RAM+block; that way we find out if there's > anything silly missing. > > Dave Yep, I'll take a look at the QMP command sequences, how it should look like in our case and prepare an example, hope we are not missing something serious. At least we know that block setup data won't go to snapshot. I've also checked starting background snapshot from the stopped VM state - looks OK, VM resumes operation, snapshot is saved, no apparent problems. Maybe it will take some time, since now I'm on task to create tool to store snapshots with RAM indexable by GPFNs, together with the rest of VMSTATE. Based on QCOW2 format. Also it should support snapshot revert in postcopy mode. Andrey >> // >> >>>> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> >>>> Acked-by: Peter Xu <peterx@redhat.com> >>>> --- >>>> migration/migration.c | 255 +++++++++++++++++++++++++++++++++++++++++- >>>> migration/migration.h | 3 + >>>> migration/ram.c | 2 + >>>> migration/savevm.c | 1 - >>>> migration/savevm.h | 2 + >>>> 5 files changed, 260 insertions(+), 3 deletions(-) >>>> >>>> diff --git a/migration/migration.c b/migration/migration.c >>>> index 2c2cb9ef01..0901a15ac5 100644 >>> <snip> >>> >>>> - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, >>>> - QEMU_THREAD_JOINABLE); >>>> + >>>> + if (migrate_background_snapshot()) { >>>> + qemu_thread_create(&s->thread, "background_snapshot", >>> Unfortunately that wont work - there's a 14 character limit on >>> the thread name length; I guess we just shorten that to bg_snapshot >> Yep, missed that pthread_set_name_np() has a length limit) >> >>> Other than that, >>> >>> >>> >>> Reviewed-by: Dr. 
David Alan Gilbert <dgilbert@redhat.com> >>> >>>> + bg_migration_thread, s, QEMU_THREAD_JOINABLE); >>>> + } else { >>>> + qemu_thread_create(&s->thread, "live_migration", >>>> + migration_thread, s, QEMU_THREAD_JOINABLE); >>>> + } >>>> s->migration_thread_running = true; >>>> } >>>> diff --git a/migration/migration.h b/migration/migration.h >>>> index f40338cfbf..0723955cd7 100644 >>>> --- a/migration/migration.h >>>> +++ b/migration/migration.h >>>> @@ -20,6 +20,7 @@ >>>> #include "qemu/thread.h" >>>> #include "qemu/coroutine_int.h" >>>> #include "io/channel.h" >>>> +#include "io/channel-buffer.h" >>>> #include "net/announce.h" >>>> #include "qom/object.h" >>>> @@ -147,8 +148,10 @@ struct MigrationState { >>>> /*< public >*/ >>>> QemuThread thread; >>>> + QEMUBH *vm_start_bh; >>>> QEMUBH *cleanup_bh; >>>> QEMUFile *to_dst_file; >>>> + QIOChannelBuffer *bioc; >>>> /* >>>> * Protects to_dst_file pointer. We need to make sure we won't >>>> * yield or hang during the critical section, since this lock will >>>> diff --git a/migration/ram.c b/migration/ram.c >>>> index 5707382db1..05fe0c8592 100644 >>>> --- a/migration/ram.c >>>> +++ b/migration/ram.c >>>> @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) >>>> page_address = (void *) uffd_msg.arg.pagefault.address; >>>> bs = qemu_ram_block_from_host(page_address, false, offset); >>>> assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); >>>> + >>>> return bs; >>>> } >>>> #endif /* CONFIG_LINUX */ >>>> @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, >>>> /* Un-protect memory range. */ >>>> res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, >>>> false, false); >>>> + >>>> /* We don't want to override existing error from ram_save_host_page(). */ >>>> if (res < 0 && *res_override >= 0) { >>>> *res_override = res; >>>> diff --git a/migration/savevm.c b/migration/savevm.c >>>> index 27e842812e..dd4ad0aaaf 100644 >>>> --- a/migration/savevm.c >>>> +++ b/migration/savevm.c >>>> @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) >>>> return 0; >>>> } >>>> -static >>>> int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >>>> bool in_postcopy, >>>> bool inactivate_disks) >>>> diff --git a/migration/savevm.h b/migration/savevm.h >>>> index ba64a7e271..aaee2528ed 100644 >>>> --- a/migration/savevm.h >>>> +++ b/migration/savevm.h >>>> @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); >>>> void qemu_loadvm_state_cleanup(void); >>>> int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); >>>> int qemu_load_device_state(QEMUFile *f); >>>> +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >>>> + bool in_postcopy, bool inactivate_disks); >>>> #endif >>>> -- >>>> 2.25.1 >>>> >> -- >> Andrey Gruzdev, Principal Engineer >> Virtuozzo GmbH +7-903-247-6397 >> virtuzzo.com >>
* Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > On 21.01.2021 12:56, Dr. David Alan Gilbert wrote: > > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > > > On 19.01.2021 21:49, Dr. David Alan Gilbert wrote: > > > > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > > > > > Introducing implementation of 'background' snapshot thread > > > > > which in overall follows the logic of precopy migration > > > > > while internally utilizes completely different mechanism > > > > > to 'freeze' vmstate at the start of snapshot creation. > > > > > > > > > > This mechanism is based on userfault_fd with wr-protection > > > > > support and is Linux-specific. > > > > I noticed there weren't any bdrv_ calls in here; I guess with a snapshot > > > > you still have the source running so still have it accessing the disk; > > > > do you do anything to try and wire the ram snapshotting up to disk > > > > snapshotting? > > > Block-related manipulations should be done externally, I think. > > > So create backing images for RW nodes, then stop VM, switch block graph > > > and start background snapshot. Something like create 'virsh snapshot-create-as' > > > does, but in other sequence. > > If you get a chance it would be great if you could put together an > > example of doing the combination RAM+block; that way we find out if there's > > anything silly missing. > > > > Dave > > Yep, I'll take a look at the QMP command sequences, how it should look > like in our case and prepare an example, hope we are not missing something serious. > At least we know that block setup data won't go to snapshot. > I've also checked starting background snapshot from the stopped VM state - > looks OK, VM resumes operation, snapshot is saved, no apparent problems. > > Maybe it will take some time, since now I'm on task to create tool to store > snapshots with RAM indexable by GPFNs, together with the rest of VMSTATE. If you want to make it indexable, why not just do a simple write(2) call for the whole of RAM rather than doing the thing like normal migration? Dave > Based on QCOW2 format. Also it should support snapshot revert in postcopy mode. > > Andrey > > > > // > > > > > > > > Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> > > > > > Acked-by: Peter Xu <peterx@redhat.com> > > > > > --- > > > > > migration/migration.c | 255 +++++++++++++++++++++++++++++++++++++++++- > > > > > migration/migration.h | 3 + > > > > > migration/ram.c | 2 + > > > > > migration/savevm.c | 1 - > > > > > migration/savevm.h | 2 + > > > > > 5 files changed, 260 insertions(+), 3 deletions(-) > > > > > > > > > > diff --git a/migration/migration.c b/migration/migration.c > > > > > index 2c2cb9ef01..0901a15ac5 100644 > > > > <snip> > > > > > > > > > - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, > > > > > - QEMU_THREAD_JOINABLE); > > > > > + > > > > > + if (migrate_background_snapshot()) { > > > > > + qemu_thread_create(&s->thread, "background_snapshot", > > > > Unfortunately that wont work - there's a 14 character limit on > > > > the thread name length; I guess we just shorten that to bg_snapshot > > > Yep, missed that pthread_set_name_np() has a length limit) > > > > > > > Other than that, > > > > > > > > > > > > > > > > Reviewed-by: Dr. 
David Alan Gilbert <dgilbert@redhat.com> > > > > > > > > > + bg_migration_thread, s, QEMU_THREAD_JOINABLE); > > > > > + } else { > > > > > + qemu_thread_create(&s->thread, "live_migration", > > > > > + migration_thread, s, QEMU_THREAD_JOINABLE); > > > > > + } > > > > > s->migration_thread_running = true; > > > > > } > > > > > diff --git a/migration/migration.h b/migration/migration.h > > > > > index f40338cfbf..0723955cd7 100644 > > > > > --- a/migration/migration.h > > > > > +++ b/migration/migration.h > > > > > @@ -20,6 +20,7 @@ > > > > > #include "qemu/thread.h" > > > > > #include "qemu/coroutine_int.h" > > > > > #include "io/channel.h" > > > > > +#include "io/channel-buffer.h" > > > > > #include "net/announce.h" > > > > > #include "qom/object.h" > > > > > @@ -147,8 +148,10 @@ struct MigrationState { > > > > > /*< public >*/ > > > > > QemuThread thread; > > > > > + QEMUBH *vm_start_bh; > > > > > QEMUBH *cleanup_bh; > > > > > QEMUFile *to_dst_file; > > > > > + QIOChannelBuffer *bioc; > > > > > /* > > > > > * Protects to_dst_file pointer. We need to make sure we won't > > > > > * yield or hang during the critical section, since this lock will > > > > > diff --git a/migration/ram.c b/migration/ram.c > > > > > index 5707382db1..05fe0c8592 100644 > > > > > --- a/migration/ram.c > > > > > +++ b/migration/ram.c > > > > > @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) > > > > > page_address = (void *) uffd_msg.arg.pagefault.address; > > > > > bs = qemu_ram_block_from_host(page_address, false, offset); > > > > > assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); > > > > > + > > > > > return bs; > > > > > } > > > > > #endif /* CONFIG_LINUX */ > > > > > @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, > > > > > /* Un-protect memory range. */ > > > > > res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, > > > > > false, false); > > > > > + > > > > > /* We don't want to override existing error from ram_save_host_page(). */ > > > > > if (res < 0 && *res_override >= 0) { > > > > > *res_override = res; > > > > > diff --git a/migration/savevm.c b/migration/savevm.c > > > > > index 27e842812e..dd4ad0aaaf 100644 > > > > > --- a/migration/savevm.c > > > > > +++ b/migration/savevm.c > > > > > @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) > > > > > return 0; > > > > > } > > > > > -static > > > > > int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > > > > > bool in_postcopy, > > > > > bool inactivate_disks) > > > > > diff --git a/migration/savevm.h b/migration/savevm.h > > > > > index ba64a7e271..aaee2528ed 100644 > > > > > --- a/migration/savevm.h > > > > > +++ b/migration/savevm.h > > > > > @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); > > > > > void qemu_loadvm_state_cleanup(void); > > > > > int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); > > > > > int qemu_load_device_state(QEMUFile *f); > > > > > +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > > > > > + bool in_postcopy, bool inactivate_disks); > > > > > #endif > > > > > -- > > > > > 2.25.1 > > > > > > > > -- > > > Andrey Gruzdev, Principal Engineer > > > Virtuozzo GmbH +7-903-247-6397 > > > virtuzzo.com > > > > -- > Andrey Gruzdev, Principal Engineer > Virtuozzo GmbH +7-903-247-6397 > virtuzzo.com >
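What Dave is suggesting — a flat file in which a page's offset is simply its guest physical address, so no index is needed — could look roughly like the sketch below. save_page()/load_page() are hypothetical helpers for illustration, not existing QEMU or migration API; zero pages are skipped so the file stays sparse on the filesystem.

```c
/*
 * Hedged sketch of a flat, GPFN-indexable RAM image: each page is written
 * at file offset == guest physical address; zero pages are skipped so the
 * unwritten ranges remain holes in a sparse file and read back as zeros.
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define PAGE_SIZE 4096

static bool page_is_zero(const uint8_t *page)
{
    static const uint8_t zero[PAGE_SIZE];

    return memcmp(page, zero, PAGE_SIZE) == 0;
}

/* Store one page; the file offset is simply the guest physical address. */
static int save_page(int fd, uint64_t gpa, const uint8_t *page)
{
    if (page_is_zero(page)) {
        return 0;   /* leave a hole: it reads back as a zero page */
    }
    return pwrite(fd, page, PAGE_SIZE, (off_t)gpa) == PAGE_SIZE ? 0 : -1;
}

/* Random access on restore: seek straight to the GPA, no lookup table. */
static int load_page(int fd, uint64_t gpa, uint8_t *page)
{
    return pread(fd, page, PAGE_SIZE, (off_t)gpa) == PAGE_SIZE ? 0 : -1;
}
```

This layout is exactly where the apparent-size concern raised below comes in: only touched pages allocate blocks, but the file's reported size grows to the highest written offset.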
On 21.01.2021 19:11, Dr. David Alan Gilbert wrote: > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >> On 21.01.2021 12:56, Dr. David Alan Gilbert wrote: >>> * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >>>> On 19.01.2021 21:49, Dr. David Alan Gilbert wrote: >>>>> * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >>>>>> Introducing implementation of 'background' snapshot thread >>>>>> which in overall follows the logic of precopy migration >>>>>> while internally utilizes completely different mechanism >>>>>> to 'freeze' vmstate at the start of snapshot creation. >>>>>> >>>>>> This mechanism is based on userfault_fd with wr-protection >>>>>> support and is Linux-specific. >>>>> I noticed there weren't any bdrv_ calls in here; I guess with a snapshot >>>>> you still have the source running so still have it accessing the disk; >>>>> do you do anything to try and wire the ram snapshotting up to disk >>>>> snapshotting? >>>> Block-related manipulations should be done externally, I think. >>>> So create backing images for RW nodes, then stop VM, switch block graph >>>> and start background snapshot. Something like create 'virsh snapshot-create-as' >>>> does, but in other sequence. >>> If you get a chance it would be great if you could put together an >>> example of doing the combination RAM+block; that way we find out if there's >>> anything silly missing. >>> >>> Dave >> Yep, I'll take a look at the QMP command sequences, how it should look >> like in our case and prepare an example, hope we are not missing something serious. >> At least we know that block setup data won't go to snapshot. >> I've also checked starting background snapshot from the stopped VM state - >> looks OK, VM resumes operation, snapshot is saved, no apparent problems. >> >> Maybe it will take some time, since now I'm on task to create tool to store >> snapshots with RAM indexable by GPFNs, together with the rest of VMSTATE. > If you want to make it indexable, why not just do a simple write(2) call > for the whole of RAM rather than doing the thing like normal migration? > > Dave For me the main reason is apparent file size.. While we can get the same allocation size when saving via write(2) on Linux, in many cases the apparent file size will be much bigger then if use QCOW2. Andrey >> Based on QCOW2 format. Also it should support snapshot revert in postcopy mode. >> >> Andrey >> >>>> // >>>> >>>>>> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> >>>>>> Acked-by: Peter Xu <peterx@redhat.com> >>>>>> --- >>>>>> migration/migration.c | 255 +++++++++++++++++++++++++++++++++++++++++- >>>>>> migration/migration.h | 3 + >>>>>> migration/ram.c | 2 + >>>>>> migration/savevm.c | 1 - >>>>>> migration/savevm.h | 2 + >>>>>> 5 files changed, 260 insertions(+), 3 deletions(-) >>>>>> >>>>>> diff --git a/migration/migration.c b/migration/migration.c >>>>>> index 2c2cb9ef01..0901a15ac5 100644 >>>>> <snip> >>>>> >>>>>> - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, >>>>>> - QEMU_THREAD_JOINABLE); >>>>>> + >>>>>> + if (migrate_background_snapshot()) { >>>>>> + qemu_thread_create(&s->thread, "background_snapshot", >>>>> Unfortunately that wont work - there's a 14 character limit on >>>>> the thread name length; I guess we just shorten that to bg_snapshot >>>> Yep, missed that pthread_set_name_np() has a length limit) >>>> >>>>> Other than that, >>>>> >>>>> >>>>> >>>>> Reviewed-by: Dr. 
David Alan Gilbert <dgilbert@redhat.com> >>>>> >>>>>> + bg_migration_thread, s, QEMU_THREAD_JOINABLE); >>>>>> + } else { >>>>>> + qemu_thread_create(&s->thread, "live_migration", >>>>>> + migration_thread, s, QEMU_THREAD_JOINABLE); >>>>>> + } >>>>>> s->migration_thread_running = true; >>>>>> } >>>>>> diff --git a/migration/migration.h b/migration/migration.h >>>>>> index f40338cfbf..0723955cd7 100644 >>>>>> --- a/migration/migration.h >>>>>> +++ b/migration/migration.h >>>>>> @@ -20,6 +20,7 @@ >>>>>> #include "qemu/thread.h" >>>>>> #include "qemu/coroutine_int.h" >>>>>> #include "io/channel.h" >>>>>> +#include "io/channel-buffer.h" >>>>>> #include "net/announce.h" >>>>>> #include "qom/object.h" >>>>>> @@ -147,8 +148,10 @@ struct MigrationState { >>>>>> /*< public >*/ >>>>>> QemuThread thread; >>>>>> + QEMUBH *vm_start_bh; >>>>>> QEMUBH *cleanup_bh; >>>>>> QEMUFile *to_dst_file; >>>>>> + QIOChannelBuffer *bioc; >>>>>> /* >>>>>> * Protects to_dst_file pointer. We need to make sure we won't >>>>>> * yield or hang during the critical section, since this lock will >>>>>> diff --git a/migration/ram.c b/migration/ram.c >>>>>> index 5707382db1..05fe0c8592 100644 >>>>>> --- a/migration/ram.c >>>>>> +++ b/migration/ram.c >>>>>> @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) >>>>>> page_address = (void *) uffd_msg.arg.pagefault.address; >>>>>> bs = qemu_ram_block_from_host(page_address, false, offset); >>>>>> assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); >>>>>> + >>>>>> return bs; >>>>>> } >>>>>> #endif /* CONFIG_LINUX */ >>>>>> @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, >>>>>> /* Un-protect memory range. */ >>>>>> res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, >>>>>> false, false); >>>>>> + >>>>>> /* We don't want to override existing error from ram_save_host_page(). */ >>>>>> if (res < 0 && *res_override >= 0) { >>>>>> *res_override = res; >>>>>> diff --git a/migration/savevm.c b/migration/savevm.c >>>>>> index 27e842812e..dd4ad0aaaf 100644 >>>>>> --- a/migration/savevm.c >>>>>> +++ b/migration/savevm.c >>>>>> @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) >>>>>> return 0; >>>>>> } >>>>>> -static >>>>>> int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >>>>>> bool in_postcopy, >>>>>> bool inactivate_disks) >>>>>> diff --git a/migration/savevm.h b/migration/savevm.h >>>>>> index ba64a7e271..aaee2528ed 100644 >>>>>> --- a/migration/savevm.h >>>>>> +++ b/migration/savevm.h >>>>>> @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); >>>>>> void qemu_loadvm_state_cleanup(void); >>>>>> int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); >>>>>> int qemu_load_device_state(QEMUFile *f); >>>>>> +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >>>>>> + bool in_postcopy, bool inactivate_disks); >>>>>> #endif >>>>>> -- >>>>>> 2.25.1 >>>>>> >>>> -- >>>> Andrey Gruzdev, Principal Engineer >>>> Virtuozzo GmbH +7-903-247-6397 >>>> virtuzzo.com >>>> >> -- >> Andrey Gruzdev, Principal Engineer >> Virtuozzo GmbH +7-903-247-6397 >> virtuzzo.com >>
* Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > On 21.01.2021 19:11, Dr. David Alan Gilbert wrote: > > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > > > On 21.01.2021 12:56, Dr. David Alan Gilbert wrote: > > > > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > > > > > On 19.01.2021 21:49, Dr. David Alan Gilbert wrote: > > > > > > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > > > > > > > Introducing implementation of 'background' snapshot thread > > > > > > > which in overall follows the logic of precopy migration > > > > > > > while internally utilizes completely different mechanism > > > > > > > to 'freeze' vmstate at the start of snapshot creation. > > > > > > > > > > > > > > This mechanism is based on userfault_fd with wr-protection > > > > > > > support and is Linux-specific. > > > > > > I noticed there weren't any bdrv_ calls in here; I guess with a snapshot > > > > > > you still have the source running so still have it accessing the disk; > > > > > > do you do anything to try and wire the ram snapshotting up to disk > > > > > > snapshotting? > > > > > Block-related manipulations should be done externally, I think. > > > > > So create backing images for RW nodes, then stop VM, switch block graph > > > > > and start background snapshot. Something like create 'virsh snapshot-create-as' > > > > > does, but in other sequence. > > > > If you get a chance it would be great if you could put together an > > > > example of doing the combination RAM+block; that way we find out if there's > > > > anything silly missing. > > > > > > > > Dave > > > Yep, I'll take a look at the QMP command sequences, how it should look > > > like in our case and prepare an example, hope we are not missing something serious. > > > At least we know that block setup data won't go to snapshot. > > > I've also checked starting background snapshot from the stopped VM state - > > > looks OK, VM resumes operation, snapshot is saved, no apparent problems. > > > > > > Maybe it will take some time, since now I'm on task to create tool to store > > > snapshots with RAM indexable by GPFNs, together with the rest of VMSTATE. > > If you want to make it indexable, why not just do a simple write(2) call > > for the whole of RAM rather than doing the thing like normal migration? > > > > Dave > > For me the main reason is apparent file size.. While we can get the same allocation > size when saving via write(2) on Linux, in many cases the apparent file size will > be much bigger then if use QCOW2. Do you mean because of zero pages or for some other reason? Dave > Andrey > > > > Based on QCOW2 format. Also it should support snapshot revert in postcopy mode. 
> > > > > > Andrey > > > > > > > > // > > > > > > > > > > > > Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> > > > > > > > Acked-by: Peter Xu <peterx@redhat.com> > > > > > > > --- > > > > > > > migration/migration.c | 255 +++++++++++++++++++++++++++++++++++++++++- > > > > > > > migration/migration.h | 3 + > > > > > > > migration/ram.c | 2 + > > > > > > > migration/savevm.c | 1 - > > > > > > > migration/savevm.h | 2 + > > > > > > > 5 files changed, 260 insertions(+), 3 deletions(-) > > > > > > > > > > > > > > diff --git a/migration/migration.c b/migration/migration.c > > > > > > > index 2c2cb9ef01..0901a15ac5 100644 > > > > > > <snip> > > > > > > > > > > > > > - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, > > > > > > > - QEMU_THREAD_JOINABLE); > > > > > > > + > > > > > > > + if (migrate_background_snapshot()) { > > > > > > > + qemu_thread_create(&s->thread, "background_snapshot", > > > > > > Unfortunately that wont work - there's a 14 character limit on > > > > > > the thread name length; I guess we just shorten that to bg_snapshot > > > > > Yep, missed that pthread_set_name_np() has a length limit) > > > > > > > > > > > Other than that, > > > > > > > > > > > > > > > > > > > > > > > > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > > > > > > > > > > > > > + bg_migration_thread, s, QEMU_THREAD_JOINABLE); > > > > > > > + } else { > > > > > > > + qemu_thread_create(&s->thread, "live_migration", > > > > > > > + migration_thread, s, QEMU_THREAD_JOINABLE); > > > > > > > + } > > > > > > > s->migration_thread_running = true; > > > > > > > } > > > > > > > diff --git a/migration/migration.h b/migration/migration.h > > > > > > > index f40338cfbf..0723955cd7 100644 > > > > > > > --- a/migration/migration.h > > > > > > > +++ b/migration/migration.h > > > > > > > @@ -20,6 +20,7 @@ > > > > > > > #include "qemu/thread.h" > > > > > > > #include "qemu/coroutine_int.h" > > > > > > > #include "io/channel.h" > > > > > > > +#include "io/channel-buffer.h" > > > > > > > #include "net/announce.h" > > > > > > > #include "qom/object.h" > > > > > > > @@ -147,8 +148,10 @@ struct MigrationState { > > > > > > > /*< public >*/ > > > > > > > QemuThread thread; > > > > > > > + QEMUBH *vm_start_bh; > > > > > > > QEMUBH *cleanup_bh; > > > > > > > QEMUFile *to_dst_file; > > > > > > > + QIOChannelBuffer *bioc; > > > > > > > /* > > > > > > > * Protects to_dst_file pointer. We need to make sure we won't > > > > > > > * yield or hang during the critical section, since this lock will > > > > > > > diff --git a/migration/ram.c b/migration/ram.c > > > > > > > index 5707382db1..05fe0c8592 100644 > > > > > > > --- a/migration/ram.c > > > > > > > +++ b/migration/ram.c > > > > > > > @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) > > > > > > > page_address = (void *) uffd_msg.arg.pagefault.address; > > > > > > > bs = qemu_ram_block_from_host(page_address, false, offset); > > > > > > > assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); > > > > > > > + > > > > > > > return bs; > > > > > > > } > > > > > > > #endif /* CONFIG_LINUX */ > > > > > > > @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, > > > > > > > /* Un-protect memory range. */ > > > > > > > res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, > > > > > > > false, false); > > > > > > > + > > > > > > > /* We don't want to override existing error from ram_save_host_page(). 
*/ > > > > > > > if (res < 0 && *res_override >= 0) { > > > > > > > *res_override = res; > > > > > > > diff --git a/migration/savevm.c b/migration/savevm.c > > > > > > > index 27e842812e..dd4ad0aaaf 100644 > > > > > > > --- a/migration/savevm.c > > > > > > > +++ b/migration/savevm.c > > > > > > > @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) > > > > > > > return 0; > > > > > > > } > > > > > > > -static > > > > > > > int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > > > > > > > bool in_postcopy, > > > > > > > bool inactivate_disks) > > > > > > > diff --git a/migration/savevm.h b/migration/savevm.h > > > > > > > index ba64a7e271..aaee2528ed 100644 > > > > > > > --- a/migration/savevm.h > > > > > > > +++ b/migration/savevm.h > > > > > > > @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); > > > > > > > void qemu_loadvm_state_cleanup(void); > > > > > > > int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); > > > > > > > int qemu_load_device_state(QEMUFile *f); > > > > > > > +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > > > > > > > + bool in_postcopy, bool inactivate_disks); > > > > > > > #endif > > > > > > > -- > > > > > > > 2.25.1 > > > > > > > > > > > > -- > > > > > Andrey Gruzdev, Principal Engineer > > > > > Virtuozzo GmbH +7-903-247-6397 > > > > > virtuzzo.com > > > > > > > > -- > > > Andrey Gruzdev, Principal Engineer > > > Virtuozzo GmbH +7-903-247-6397 > > > virtuzzo.com > > > > > -- > Andrey Gruzdev, Principal Engineer > Virtuozzo GmbH +7-903-247-6397 > virtuzzo.com >
On 21.01.2021 20:48, Dr. David Alan Gilbert wrote: > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >> On 21.01.2021 19:11, Dr. David Alan Gilbert wrote: >>> * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >>>> On 21.01.2021 12:56, Dr. David Alan Gilbert wrote: >>>>> * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >>>>>> On 19.01.2021 21:49, Dr. David Alan Gilbert wrote: >>>>>>> * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >>>>>>>> Introducing implementation of 'background' snapshot thread >>>>>>>> which in overall follows the logic of precopy migration >>>>>>>> while internally utilizes completely different mechanism >>>>>>>> to 'freeze' vmstate at the start of snapshot creation. >>>>>>>> >>>>>>>> This mechanism is based on userfault_fd with wr-protection >>>>>>>> support and is Linux-specific. >>>>>>> I noticed there weren't any bdrv_ calls in here; I guess with a snapshot >>>>>>> you still have the source running so still have it accessing the disk; >>>>>>> do you do anything to try and wire the ram snapshotting up to disk >>>>>>> snapshotting? >>>>>> Block-related manipulations should be done externally, I think. >>>>>> So create backing images for RW nodes, then stop VM, switch block graph >>>>>> and start background snapshot. Something like create 'virsh snapshot-create-as' >>>>>> does, but in other sequence. >>>>> If you get a chance it would be great if you could put together an >>>>> example of doing the combination RAM+block; that way we find out if there's >>>>> anything silly missing. >>>>> >>>>> Dave >>>> Yep, I'll take a look at the QMP command sequences, how it should look >>>> like in our case and prepare an example, hope we are not missing something serious. >>>> At least we know that block setup data won't go to snapshot. >>>> I've also checked starting background snapshot from the stopped VM state - >>>> looks OK, VM resumes operation, snapshot is saved, no apparent problems. >>>> >>>> Maybe it will take some time, since now I'm on task to create tool to store >>>> snapshots with RAM indexable by GPFNs, together with the rest of VMSTATE. >>> If you want to make it indexable, why not just do a simple write(2) call >>> for the whole of RAM rather than doing the thing like normal migration? >>> >>> Dave >> For me the main reason is apparent file size.. While we can get the same allocation >> size when saving via write(2) on Linux, in many cases the apparent file size will >> be much bigger then if use QCOW2. > Do you mean because of zero pages or for some other reason? > > Dave Yes. So plain sparse file on ext4 would grow to apparent size equal to highest non-zero GPA. While QCOW2 won't. It's important from the point of user experience, since desktop workload often show very small non-zero RSS. When I start Win10 on QEMU with a single Firefox tab with some Youtube HD video I have only 2-5GB of migration data on a 16GB VM. Andrey >> Andrey >> >>>> Based on QCOW2 format. Also it should support snapshot revert in postcopy mode. 
>>>> >>>> Andrey >>>> >>>>>> // >>>>>> >>>>>>>> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> >>>>>>>> Acked-by: Peter Xu <peterx@redhat.com> >>>>>>>> --- >>>>>>>> migration/migration.c | 255 +++++++++++++++++++++++++++++++++++++++++- >>>>>>>> migration/migration.h | 3 + >>>>>>>> migration/ram.c | 2 + >>>>>>>> migration/savevm.c | 1 - >>>>>>>> migration/savevm.h | 2 + >>>>>>>> 5 files changed, 260 insertions(+), 3 deletions(-) >>>>>>>> >>>>>>>> diff --git a/migration/migration.c b/migration/migration.c >>>>>>>> index 2c2cb9ef01..0901a15ac5 100644 >>>>>>> <snip> >>>>>>> >>>>>>>> - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, >>>>>>>> - QEMU_THREAD_JOINABLE); >>>>>>>> + >>>>>>>> + if (migrate_background_snapshot()) { >>>>>>>> + qemu_thread_create(&s->thread, "background_snapshot", >>>>>>> Unfortunately that wont work - there's a 14 character limit on >>>>>>> the thread name length; I guess we just shorten that to bg_snapshot >>>>>> Yep, missed that pthread_set_name_np() has a length limit) >>>>>> >>>>>>> Other than that, >>>>>>> >>>>>>> >>>>>>> >>>>>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> >>>>>>> >>>>>>>> + bg_migration_thread, s, QEMU_THREAD_JOINABLE); >>>>>>>> + } else { >>>>>>>> + qemu_thread_create(&s->thread, "live_migration", >>>>>>>> + migration_thread, s, QEMU_THREAD_JOINABLE); >>>>>>>> + } >>>>>>>> s->migration_thread_running = true; >>>>>>>> } >>>>>>>> diff --git a/migration/migration.h b/migration/migration.h >>>>>>>> index f40338cfbf..0723955cd7 100644 >>>>>>>> --- a/migration/migration.h >>>>>>>> +++ b/migration/migration.h >>>>>>>> @@ -20,6 +20,7 @@ >>>>>>>> #include "qemu/thread.h" >>>>>>>> #include "qemu/coroutine_int.h" >>>>>>>> #include "io/channel.h" >>>>>>>> +#include "io/channel-buffer.h" >>>>>>>> #include "net/announce.h" >>>>>>>> #include "qom/object.h" >>>>>>>> @@ -147,8 +148,10 @@ struct MigrationState { >>>>>>>> /*< public >*/ >>>>>>>> QemuThread thread; >>>>>>>> + QEMUBH *vm_start_bh; >>>>>>>> QEMUBH *cleanup_bh; >>>>>>>> QEMUFile *to_dst_file; >>>>>>>> + QIOChannelBuffer *bioc; >>>>>>>> /* >>>>>>>> * Protects to_dst_file pointer. We need to make sure we won't >>>>>>>> * yield or hang during the critical section, since this lock will >>>>>>>> diff --git a/migration/ram.c b/migration/ram.c >>>>>>>> index 5707382db1..05fe0c8592 100644 >>>>>>>> --- a/migration/ram.c >>>>>>>> +++ b/migration/ram.c >>>>>>>> @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) >>>>>>>> page_address = (void *) uffd_msg.arg.pagefault.address; >>>>>>>> bs = qemu_ram_block_from_host(page_address, false, offset); >>>>>>>> assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); >>>>>>>> + >>>>>>>> return bs; >>>>>>>> } >>>>>>>> #endif /* CONFIG_LINUX */ >>>>>>>> @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, >>>>>>>> /* Un-protect memory range. */ >>>>>>>> res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, >>>>>>>> false, false); >>>>>>>> + >>>>>>>> /* We don't want to override existing error from ram_save_host_page(). 
*/ >>>>>>>> if (res < 0 && *res_override >= 0) { >>>>>>>> *res_override = res; >>>>>>>> diff --git a/migration/savevm.c b/migration/savevm.c >>>>>>>> index 27e842812e..dd4ad0aaaf 100644 >>>>>>>> --- a/migration/savevm.c >>>>>>>> +++ b/migration/savevm.c >>>>>>>> @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) >>>>>>>> return 0; >>>>>>>> } >>>>>>>> -static >>>>>>>> int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >>>>>>>> bool in_postcopy, >>>>>>>> bool inactivate_disks) >>>>>>>> diff --git a/migration/savevm.h b/migration/savevm.h >>>>>>>> index ba64a7e271..aaee2528ed 100644 >>>>>>>> --- a/migration/savevm.h >>>>>>>> +++ b/migration/savevm.h >>>>>>>> @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); >>>>>>>> void qemu_loadvm_state_cleanup(void); >>>>>>>> int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); >>>>>>>> int qemu_load_device_state(QEMUFile *f); >>>>>>>> +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >>>>>>>> + bool in_postcopy, bool inactivate_disks); >>>>>>>> #endif >>>>>>>> -- >>>>>>>> 2.25.1 >>>>>>>> >>>>>> -- >>>>>> Andrey Gruzdev, Principal Engineer >>>>>> Virtuozzo GmbH +7-903-247-6397 >>>>>> virtuzzo.com >>>>>> >>>> -- >>>> Andrey Gruzdev, Principal Engineer >>>> Virtuozzo GmbH +7-903-247-6397 >>>> virtuzzo.com >>>> >> -- >> Andrey Gruzdev, Principal Engineer >> Virtuozzo GmbH +7-903-247-6397 >> virtuzzo.com >>
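Andrey's apparent-size point can be seen directly with stat(2): a sparse file's st_size reflects the highest written offset, while st_blocks reflects what is actually allocated. A standalone demo, assuming a filesystem with sparse-file support such as ext4:

```c
/*
 * Standalone demo of "apparent size" vs. allocation: a sparse file whose
 * only data is one page written near a 16 GiB offset reports st_size of
 * roughly 16 GiB, but only a few allocated 512-byte blocks (st_blocks).
 */
#define _FILE_OFFSET_BITS 64
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
    const off_t high_gpa = 16ULL * 1024 * 1024 * 1024;  /* highest non-zero "GPA" */
    char page[4096] = { 1 };
    struct stat st;
    int fd = open("sparse-ram.img", O_RDWR | O_CREAT | O_TRUNC, 0600);

    if (fd < 0 || pwrite(fd, page, sizeof(page), high_gpa) != sizeof(page)) {
        perror("sparse-ram.img");
        return EXIT_FAILURE;
    }
    fstat(fd, &st);
    printf("apparent size: %lld bytes\n", (long long)st.st_size);
    printf("allocated:     %lld bytes\n", (long long)st.st_blocks * 512);
    close(fd);
    return 0;
}
```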
diff --git a/migration/migration.c b/migration/migration.c index 2c2cb9ef01..0901a15ac5 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2007,6 +2007,7 @@ void migrate_init(MigrationState *s) * locks. */ s->cleanup_bh = 0; + s->vm_start_bh = 0; s->to_dst_file = NULL; s->state = MIGRATION_STATUS_NONE; s->rp_state.from_dst_file = NULL; @@ -3211,6 +3212,50 @@ fail: MIGRATION_STATUS_FAILED); } +/** + * bg_migration_completion: Used by bg_migration_thread when after all the + * RAM has been saved. The caller 'breaks' the loop when this returns. + * + * @s: Current migration state + */ +static void bg_migration_completion(MigrationState *s) +{ + int current_active_state = s->state; + + /* + * Stop tracking RAM writes - un-protect memory, un-register UFFD + * memory ranges, flush kernel wait queues and wake up threads + * waiting for write fault to be resolved. + */ + ram_write_tracking_stop(); + + if (s->state == MIGRATION_STATUS_ACTIVE) { + /* + * By this moment we have RAM content saved into the migration stream. + * The next step is to flush the non-RAM content (device state) + * right after the ram content. The device state has been stored into + * the temporary buffer before RAM saving started. + */ + qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage); + qemu_fflush(s->to_dst_file); + } else if (s->state == MIGRATION_STATUS_CANCELLING) { + goto fail; + } + + if (qemu_file_get_error(s->to_dst_file)) { + trace_migration_completion_file_err(); + goto fail; + } + + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_COMPLETED); + return; + +fail: + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_FAILED); +} + bool migrate_colo_enabled(void) { MigrationState *s = migrate_get_current(); @@ -3551,6 +3596,47 @@ static void migration_iteration_finish(MigrationState *s) qemu_mutex_unlock_iothread(); } +static void bg_migration_iteration_finish(MigrationState *s) +{ + qemu_mutex_lock_iothread(); + switch (s->state) { + case MIGRATION_STATUS_COMPLETED: + migration_calculate_complete(s); + break; + + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_CANCELLING: + break; + + default: + /* Should not reach here, but if so, forgive the VM. */ + error_report("%s: Unknown ending state %d", __func__, s->state); + break; + } + + migrate_fd_cleanup_schedule(s); + qemu_mutex_unlock_iothread(); +} + +/* + * Return true if continue to the next iteration directly, false + * otherwise. + */ +static MigIterateState bg_migration_iteration_run(MigrationState *s) +{ + int res; + + res = qemu_savevm_state_iterate(s->to_dst_file, false); + if (res > 0) { + bg_migration_completion(s); + return MIG_ITERATE_BREAK; + } + + return MIG_ITERATE_RESUME; +} + void migration_make_urgent_request(void) { qemu_sem_post(&migrate_get_current()->rate_limit_sem); @@ -3698,6 +3784,165 @@ static void *migration_thread(void *opaque) return NULL; } +static void bg_migration_vm_start_bh(void *opaque) +{ + MigrationState *s = opaque; + + qemu_bh_delete(s->vm_start_bh); + s->vm_start_bh = NULL; + + vm_start(); + s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start; +} + +/** + * Background snapshot thread, based on live migration code. + * This is an alternative implementation of live migration mechanism + * introduced specifically to support background snapshots. + * + * It takes advantage of userfault_fd write protection mechanism introduced + * in v5.7 kernel. 
Compared to existing dirty page logging migration much + * lesser stream traffic is produced resulting in smaller snapshot images, + * simply cause of no page duplicates can get into the stream. + * + * Another key point is that generated vmstate stream reflects machine state + * 'frozen' at the beginning of snapshot creation compared to dirty page logging + * mechanism, which effectively results in that saved snapshot is the state of VM + * at the end of the process. + */ +static void *bg_migration_thread(void *opaque) +{ + MigrationState *s = opaque; + int64_t setup_start; + MigThrError thr_error; + QEMUFile *fb; + bool early_fail = true; + + rcu_register_thread(); + object_ref(OBJECT(s)); + + qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); + + setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); + /* + * We want to save vmstate for the moment when migration has been + * initiated but also we want to save RAM content while VM is running. + * The RAM content should appear first in the vmstate. So, we first + * stash the non-RAM part of the vmstate to the temporary buffer, + * then write RAM part of the vmstate to the migration stream + * with vCPUs running and, finally, write stashed non-RAM part of + * the vmstate from the buffer to the migration stream. + */ + s->bioc = qio_channel_buffer_new(128 * 1024); + qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer"); + fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc)); + object_unref(OBJECT(s->bioc)); + + update_iteration_initial_status(s); + + qemu_savevm_state_header(s->to_dst_file); + qemu_savevm_state_setup(s->to_dst_file); + + if (qemu_savevm_state_guest_unplug_pending()) { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_WAIT_UNPLUG); + + while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && + qemu_savevm_state_guest_unplug_pending()) { + qemu_sem_timedwait(&s->wait_unplug_sem, 250); + } + + migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, + MIGRATION_STATUS_ACTIVE); + } else { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_ACTIVE); + } + s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; + + trace_migration_thread_setup_complete(); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + qemu_mutex_lock_iothread(); + + /* + * If VM is currently in suspended state, then, to make a valid runstate + * transition in vm_stop_force_state() we need to wakeup it up. + */ + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); + s->vm_was_running = runstate_is_running(); + + if (global_state_store()) { + goto fail; + } + /* Forcibly stop VM before saving state of vCPUs and devices */ + if (vm_stop_force_state(RUN_STATE_PAUSED)) { + goto fail; + } + /* + * Put vCPUs in sync with shadow context structures, then + * save their state to channel-buffer along with devices. + */ + cpu_synchronize_all_states(); + if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) { + goto fail; + } + /* Now initialize UFFD context and start tracking RAM writes */ + if (ram_write_tracking_start()) { + goto fail; + } + early_fail = false; + + /* + * Start VM from BH handler to avoid write-fault lock here. + * UFFD-WP protection for the whole RAM is already enabled so + * calling VM state change notifiers from vm_start() would initiate + * writes to virtio VQs memory which is in write-protected region. 
+ */ + s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s); + qemu_bh_schedule(s->vm_start_bh); + + qemu_mutex_unlock_iothread(); + + while (migration_is_active(s)) { + MigIterateState iter_state = bg_migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { + continue; + } else if (iter_state == MIG_ITERATE_BREAK) { + break; + } + + /* + * Try to detect any kind of failures, and see whether we + * should stop the migration now. + */ + thr_error = migration_detect_error(s); + if (thr_error == MIG_THR_ERR_FATAL) { + /* Stop migration */ + break; + } + + migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + } + + trace_migration_thread_after_loop(); + +fail: + if (early_fail) { + migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_FAILED); + qemu_mutex_unlock_iothread(); + } + + bg_migration_iteration_finish(s); + + qemu_fclose(fb); + object_unref(OBJECT(s)); + rcu_unregister_thread(); + + return NULL; +} + void migrate_fd_connect(MigrationState *s, Error *error_in) { Error *local_err = NULL; @@ -3761,8 +4006,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) migrate_fd_cleanup(s); return; } - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, - QEMU_THREAD_JOINABLE); + + if (migrate_background_snapshot()) { + qemu_thread_create(&s->thread, "background_snapshot", + bg_migration_thread, s, QEMU_THREAD_JOINABLE); + } else { + qemu_thread_create(&s->thread, "live_migration", + migration_thread, s, QEMU_THREAD_JOINABLE); + } s->migration_thread_running = true; } diff --git a/migration/migration.h b/migration/migration.h index f40338cfbf..0723955cd7 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -20,6 +20,7 @@ #include "qemu/thread.h" #include "qemu/coroutine_int.h" #include "io/channel.h" +#include "io/channel-buffer.h" #include "net/announce.h" #include "qom/object.h" @@ -147,8 +148,10 @@ struct MigrationState { /*< public >*/ QemuThread thread; + QEMUBH *vm_start_bh; QEMUBH *cleanup_bh; QEMUFile *to_dst_file; + QIOChannelBuffer *bioc; /* * Protects to_dst_file pointer. We need to make sure we won't * yield or hang during the critical section, since this lock will diff --git a/migration/ram.c b/migration/ram.c index 5707382db1..05fe0c8592 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1471,6 +1471,7 @@ static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) page_address = (void *) uffd_msg.arg.pagefault.address; bs = qemu_ram_block_from_host(page_address, false, offset); assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); + return bs; } #endif /* CONFIG_LINUX */ @@ -1836,6 +1837,7 @@ static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, /* Un-protect memory range. */ res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, false, false); + /* We don't want to override existing error from ram_save_host_page(). 
*/ if (res < 0 && *res_override >= 0) { *res_override = res; diff --git a/migration/savevm.c b/migration/savevm.c index 27e842812e..dd4ad0aaaf 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1354,7 +1354,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) return 0; } -static int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy, bool inactivate_disks) diff --git a/migration/savevm.h b/migration/savevm.h index ba64a7e271..aaee2528ed 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); void qemu_loadvm_state_cleanup(void); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_load_device_state(QEMUFile *f); +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, + bool in_postcopy, bool inactivate_disks); #endif
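For context, below is a rough standalone sketch of the userfaultfd write-protect sequence that the patch's ram_write_tracking_start() and uffd_change_protection() helpers presumably wrap: open a uffd, register the region in write-protect mode, arm protection, and later resolve a write fault by clearing protection on the faulting page after it has been saved. This is the raw Linux 5.7+ kernel API, not QEMU's wrappers, and error handling is omitted for brevity.

```c
/*
 * Hedged sketch of the userfaultfd write-protect flow the background
 * snapshot relies on (Linux 5.7+). Not QEMU code: QEMU's uffd_* helpers
 * in migration/ram.c drive the same ioctls with proper error handling.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int uffd_wp_start(void *addr, size_t len)
{
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

    /* Negotiate the API and ask for write-protect fault reporting. */
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
    };
    ioctl(uffd, UFFDIO_API, &api);

    /* Register the RAM range for write-protect tracking. */
    struct uffdio_register reg = {
        .range = { .start = (__u64)(uintptr_t)addr, .len = len },
        .mode  = UFFDIO_REGISTER_MODE_WP,
    };
    ioctl(uffd, UFFDIO_REGISTER, &reg);

    /* Arm write protection for the whole range. */
    struct uffdio_writeprotect wp = {
        .range = { .start = (__u64)(uintptr_t)addr, .len = len },
        .mode  = UFFDIO_WRITEPROTECT_MODE_WP,
    };
    ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
    return uffd;
}

/* Resolve one write fault: save the page first, then drop protection. */
static void uffd_wp_resolve(int uffd, void *page_addr, size_t len)
{
    struct uffdio_writeprotect wp = {
        .range = { .start = (__u64)(uintptr_t)page_addr, .len = len },
        .mode  = 0,   /* clear WP; this also wakes the faulting thread */
    };
    ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
}
```

Write faults themselves are delivered as uffd_msg records read from the descriptor, which is what poll_fault_page() in the patch consumes before un-protecting the page.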