Message ID | 1429fc59079d99ca035b31303892d807868dc6c0.1724701542.git.maciej.szmigiero@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Multifd | expand |
On 27/08/2024 20:54, Maciej S. Szmigiero wrote: > External email: Use caution opening links or attachments > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > Implement the multifd device state transfer via additional per-device > thread inside save_live_complete_precopy_thread handler. > > Switch between doing the data transfer in the new handler and doing it > in the old save_state handler depending on the > x-migration-multifd-transfer device property value. > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> > --- > hw/vfio/migration.c | 169 ++++++++++++++++++++++++++++++++++ > hw/vfio/trace-events | 2 + > include/hw/vfio/vfio-common.h | 1 + > 3 files changed, 172 insertions(+) > > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > index 57c1542528dc..67996aa2df8b 100644 > --- a/hw/vfio/migration.c > +++ b/hw/vfio/migration.c > @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp) > uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; > int ret; > > + /* Make a copy of this setting at the start in case it is changed mid-migration */ > + migration->multifd_transfer = vbasedev->migration_multifd_transfer; Should VFIO multifd be controlled by main migration multifd capability, and let the per VFIO device migration_multifd_transfer property be immutable and enabled by default? Then we would have a single point of configuration (and an extra one per VFIO device just to disable for backward compatibility). Unless there are other benefits to have this property configurable? > + > + if (migration->multifd_transfer && !migration_has_device_state_support()) { > + error_setg(errp, > + "%s: Multifd device transfer requested but unsupported in the current config", > + vbasedev->name); > + return -EINVAL; > + } > + > qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); > > vfio_query_stop_copy_size(vbasedev, &stop_copy_size); > @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque) > static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) > { > VFIODevice *vbasedev = opaque; > + VFIOMigration *migration = vbasedev->migration; > ssize_t data_size; > int ret; > Error *local_err = NULL; > > + if (migration->multifd_transfer) { > + /* > + * Emit dummy NOP data, vfio_save_complete_precopy_thread() > + * does the actual transfer. > + */ > + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); There are three places where we send this dummy end of state, maybe worth extracting it to a helper? I.e., vfio_send_end_of_state() and then document there the rationale. > + return 0; > + } > + > trace_vfio_save_complete_precopy_started(vbasedev->name); > > /* We reach here with device state STOP or STOP_COPY only */ > @@ -864,12 +884,159 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) > return ret; > } > > +static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbasedev, > + char *idstr, > + uint32_t instance_id, > + uint32_t idx) > +{ > + g_autoptr(QIOChannelBuffer) bioc = NULL; > + QEMUFile *f = NULL; > + int ret; > + g_autofree VFIODeviceStatePacket *packet = NULL; > + size_t packet_len; > + > + bioc = qio_channel_buffer_new(0); > + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); > + > + f = qemu_file_new_output(QIO_CHANNEL(bioc)); > + > + ret = vfio_save_device_config_state(f, vbasedev, NULL); > + if (ret) { > + return ret; Need to close f in this case. > + } > + > + ret = qemu_fflush(f); > + if (ret) { > + goto ret_close_file; > + } > + > + packet_len = sizeof(*packet) + bioc->usage; > + packet = g_malloc0(packet_len); > + packet->idx = idx; > + packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; > + memcpy(&packet->data, bioc->data, bioc->usage); > + > + if (!multifd_queue_device_state(idstr, instance_id, > + (char *)packet, packet_len)) { > + ret = -1; goto ret_close_file? > + } > + > + bytes_transferred += packet_len; bytes_transferred is a global variable. Now that we access it from multiple threads it should be protected. Note that now the VFIO device data is reported also in multifd stats (if I am not mistaken), is this the behavior we want? Maybe we should enhance multifd stats to distinguish between RAM data and device data? > + > +ret_close_file: Rename to "out" as we only have one exit point? > + g_clear_pointer(&f, qemu_fclose); f is a local variable, wouldn't qemu_fclose(f) be enough here? > + return ret; > +} > + > +static int vfio_save_complete_precopy_thread(char *idstr, > + uint32_t instance_id, > + bool *abort_flag, > + void *opaque) > +{ > + VFIODevice *vbasedev = opaque; > + VFIOMigration *migration = vbasedev->migration; > + int ret; > + g_autofree VFIODeviceStatePacket *packet = NULL; > + uint32_t idx; > + > + if (!migration->multifd_transfer) { > + /* Nothing to do, vfio_save_complete_precopy() does the transfer. */ > + return 0; > + } > + > + trace_vfio_save_complete_precopy_thread_started(vbasedev->name, > + idstr, instance_id); > + > + /* We reach here with device state STOP or STOP_COPY only */ > + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, > + VFIO_DEVICE_STATE_STOP, NULL); > + if (ret) { > + goto ret_finish; > + } > + > + packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); > + > + for (idx = 0; ; idx++) { > + ssize_t data_size; > + size_t packet_size; > + > + if (qatomic_read(abort_flag)) { > + ret = -ECANCELED; > + goto ret_finish; > + } > + > + data_size = read(migration->data_fd, &packet->data, > + migration->data_buffer_size); > + if (data_size < 0) { > + if (errno != ENOMSG) { > + ret = -errno; > + goto ret_finish; > + } > + > + /* > + * Pre-copy emptied all the device state for now. For more information, > + * please refer to the Linux kernel VFIO uAPI. > + */ > + data_size = 0; According to VFIO uAPI, ENOMSG can only be returned in the PRE_COPY device states. Here we are in STOP_COPY, so we can drop the ENOMSG handling. Thanks. > + } > + > + if (data_size == 0) > + break; > + > + packet->idx = idx; > + packet_size = sizeof(*packet) + data_size; > + > + if (!multifd_queue_device_state(idstr, instance_id, > + (char *)packet, packet_size)) { > + ret = -1; > + goto ret_finish; > + } > + > + bytes_transferred += packet_size; > + } > + > + ret = vfio_save_complete_precopy_async_thread_config_state(vbasedev, idstr, > + instance_id, > + idx); > + > +ret_finish: > + trace_vfio_save_complete_precopy_thread_finished(vbasedev->name, ret); > + > + return ret; > +} > + > +static int vfio_save_complete_precopy_begin(QEMUFile *f, > + char *idstr, uint32_t instance_id, > + void *opaque) > +{ > + VFIODevice *vbasedev = opaque; > + VFIOMigration *migration = vbasedev->migration; > + > + if (!migration->multifd_transfer) { > + /* Emit dummy NOP data */ > + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); > + return 0; > + } > + > + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE); > + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); > + > + return qemu_fflush(f); > +} > + > static void vfio_save_state(QEMUFile *f, void *opaque) > { > VFIODevice *vbasedev = opaque; > + VFIOMigration *migration = vbasedev->migration; > Error *local_err = NULL; > int ret; > > + if (migration->multifd_transfer) { > + /* Emit dummy NOP data */ > + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); > + return; > + } > + > ret = vfio_save_device_config_state(f, opaque, &local_err); > if (ret) { > error_prepend(&local_err, > @@ -1119,7 +1286,9 @@ static const SaveVMHandlers savevm_vfio_handlers = { > .state_pending_exact = vfio_state_pending_exact, > .is_active_iterate = vfio_is_active_iterate, > .save_live_iterate = vfio_save_iterate, > + .save_live_complete_precopy_begin = vfio_save_complete_precopy_begin, > .save_live_complete_precopy = vfio_save_complete_precopy, > + .save_live_complete_precopy_thread = vfio_save_complete_precopy_thread, > .save_state = vfio_save_state, > .load_setup = vfio_load_setup, > .load_cleanup = vfio_load_cleanup, > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index 9d2519a28a7e..b1d9c9d5f2e1 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -167,6 +167,8 @@ vfio_save_block(const char *name, int data_size) " (%s) data_size %d" > vfio_save_cleanup(const char *name) " (%s)" > vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d" > vfio_save_complete_precopy_started(const char *name) " (%s)" > +vfio_save_complete_precopy_thread_started(const char *name, const char *idstr, uint32_t instance_id) " (%s) idstr %s instance %"PRIu32 > +vfio_save_complete_precopy_thread_finished(const char *name, int ret) " (%s) ret %d" > vfio_save_device_config_state(const char *name) " (%s)" > vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 > vfio_save_iterate_started(const char *name) " (%s)" > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index fe05acb9a5d1..4578a0ca6a5c 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -72,6 +72,7 @@ typedef struct VFIOMigration { > uint64_t mig_flags; > uint64_t precopy_init_size; > uint64_t precopy_dirty_size; > + bool multifd_transfer; > bool initial_data_sent; > > bool save_iterate_run;
On 9.09.2024 13:41, Avihai Horon wrote: > > On 27/08/2024 20:54, Maciej S. Szmigiero wrote: >> External email: Use caution opening links or attachments >> >> >> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >> >> Implement the multifd device state transfer via additional per-device >> thread inside save_live_complete_precopy_thread handler. >> >> Switch between doing the data transfer in the new handler and doing it >> in the old save_state handler depending on the >> x-migration-multifd-transfer device property value. >> >> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >> --- >> hw/vfio/migration.c | 169 ++++++++++++++++++++++++++++++++++ >> hw/vfio/trace-events | 2 + >> include/hw/vfio/vfio-common.h | 1 + >> 3 files changed, 172 insertions(+) >> >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c >> index 57c1542528dc..67996aa2df8b 100644 >> --- a/hw/vfio/migration.c >> +++ b/hw/vfio/migration.c >> @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp) >> uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; >> int ret; >> >> + /* Make a copy of this setting at the start in case it is changed mid-migration */ >> + migration->multifd_transfer = vbasedev->migration_multifd_transfer; > > Should VFIO multifd be controlled by main migration multifd capability, and let the per VFIO device migration_multifd_transfer property be immutable and enabled by default? > Then we would have a single point of configuration (and an extra one per VFIO device just to disable for backward compatibility). > Unless there are other benefits to have this property configurable? We want multifd device state transfer property to be configurable per-device in case in the future we add another device type (besides VFIO) that supports multifd device state transfer. In this case, we might need to enable the multifd device state transfer just for VFIO devices, but not for this new device type when we are migrating to a QEMU target that supports just the VFIO multifd device state transfer. TBH, I'm not opposed to adding a additional global multifd device state transfer switch (if we keep the per-device ones too) but I am not sure what value it adds. >> + >> + if (migration->multifd_transfer && !migration_has_device_state_support()) { >> + error_setg(errp, >> + "%s: Multifd device transfer requested but unsupported in the current config", >> + vbasedev->name); >> + return -EINVAL; >> + } >> + >> qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); >> >> vfio_query_stop_copy_size(vbasedev, &stop_copy_size); >> @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque) >> static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) >> { >> VFIODevice *vbasedev = opaque; >> + VFIOMigration *migration = vbasedev->migration; >> ssize_t data_size; >> int ret; >> Error *local_err = NULL; >> >> + if (migration->multifd_transfer) { >> + /* >> + * Emit dummy NOP data, vfio_save_complete_precopy_thread() >> + * does the actual transfer. >> + */ >> + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); > > There are three places where we send this dummy end of state, maybe worth extracting it to a helper? I.e., vfio_send_end_of_state() and then document there the rationale. I'm not totally against it but it's wrapping just a single line of code in a separate function? >> + return 0; >> + } >> + >> trace_vfio_save_complete_precopy_started(vbasedev->name); >> >> /* We reach here with device state STOP or STOP_COPY only */ >> @@ -864,12 +884,159 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) >> return ret; >> } >> >> +static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbasedev, >> + char *idstr, >> + uint32_t instance_id, >> + uint32_t idx) >> +{ >> + g_autoptr(QIOChannelBuffer) bioc = NULL; >> + QEMUFile *f = NULL; >> + int ret; >> + g_autofree VFIODeviceStatePacket *packet = NULL; >> + size_t packet_len; >> + >> + bioc = qio_channel_buffer_new(0); >> + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); >> + >> + f = qemu_file_new_output(QIO_CHANNEL(bioc)); >> + >> + ret = vfio_save_device_config_state(f, vbasedev, NULL); >> + if (ret) { >> + return ret; > > Need to close f in this case. Right - by the way, that's a good example why RAII helps avoid such mistakes. >> + } >> + >> + ret = qemu_fflush(f); >> + if (ret) { >> + goto ret_close_file; >> + } >> + >> + packet_len = sizeof(*packet) + bioc->usage; >> + packet = g_malloc0(packet_len); >> + packet->idx = idx; >> + packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; >> + memcpy(&packet->data, bioc->data, bioc->usage); >> + >> + if (!multifd_queue_device_state(idstr, instance_id, >> + (char *)packet, packet_len)) { >> + ret = -1; > > goto ret_close_file? Right, it would be better not to increment the counter in this case. >> + } >> + >> + bytes_transferred += packet_len; > > bytes_transferred is a global variable. Now that we access it from multiple threads it should be protected. Right, this stat needs some concurrent access protection. > Note that now the VFIO device data is reported also in multifd stats (if I am not mistaken), is this the behavior we want? Maybe we should enhance multifd stats to distinguish between RAM data and device data? Multifd stats report total size of data transferred via multifd so they should include device state too. It may make sense to add a dedicated device state transfer counter at some time though. >> + >> +ret_close_file: > > Rename to "out" as we only have one exit point? > >> + g_clear_pointer(&f, qemu_fclose); > > f is a local variable, wouldn't qemu_fclose(f) be enough here? Sure, but why leave a dangling pointer? Currently, it is obviously a NOP (probably deleted by dead store elimination anyway) but the code might get refactored at some point and I think it's good practice to always NULL pointers after freeing them where possible and so be on the safe side. >> + return ret; >> +} >> + >> +static int vfio_save_complete_precopy_thread(char *idstr, >> + uint32_t instance_id, >> + bool *abort_flag, >> + void *opaque) >> +{ >> + VFIODevice *vbasedev = opaque; >> + VFIOMigration *migration = vbasedev->migration; >> + int ret; >> + g_autofree VFIODeviceStatePacket *packet = NULL; >> + uint32_t idx; >> + >> + if (!migration->multifd_transfer) { >> + /* Nothing to do, vfio_save_complete_precopy() does the transfer. */ >> + return 0; >> + } >> + >> + trace_vfio_save_complete_precopy_thread_started(vbasedev->name, >> + idstr, instance_id); >> + >> + /* We reach here with device state STOP or STOP_COPY only */ >> + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, >> + VFIO_DEVICE_STATE_STOP, NULL); >> + if (ret) { >> + goto ret_finish; >> + } >> + >> + packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); >> + >> + for (idx = 0; ; idx++) { >> + ssize_t data_size; >> + size_t packet_size; >> + >> + if (qatomic_read(abort_flag)) { >> + ret = -ECANCELED; >> + goto ret_finish; >> + } >> + >> + data_size = read(migration->data_fd, &packet->data, >> + migration->data_buffer_size); >> + if (data_size < 0) { >> + if (errno != ENOMSG) { >> + ret = -errno; >> + goto ret_finish; >> + } >> + >> + /* >> + * Pre-copy emptied all the device state for now. For more information, >> + * please refer to the Linux kernel VFIO uAPI. >> + */ >> + data_size = 0; > > According to VFIO uAPI, ENOMSG can only be returned in the PRE_COPY device states. > Here we are in STOP_COPY, so we can drop the ENOMSG handling. Will drop this ENOMSG handling. > Thanks. Thanks, Maciej
On 09/09/2024 21:07, Maciej S. Szmigiero wrote: > External email: Use caution opening links or attachments > > > On 9.09.2024 13:41, Avihai Horon wrote: >> >> On 27/08/2024 20:54, Maciej S. Szmigiero wrote: >>> External email: Use caution opening links or attachments >>> >>> >>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >>> >>> Implement the multifd device state transfer via additional per-device >>> thread inside save_live_complete_precopy_thread handler. >>> >>> Switch between doing the data transfer in the new handler and doing it >>> in the old save_state handler depending on the >>> x-migration-multifd-transfer device property value. >>> >>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >>> --- >>> hw/vfio/migration.c | 169 >>> ++++++++++++++++++++++++++++++++++ >>> hw/vfio/trace-events | 2 + >>> include/hw/vfio/vfio-common.h | 1 + >>> 3 files changed, 172 insertions(+) >>> >>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c >>> index 57c1542528dc..67996aa2df8b 100644 >>> --- a/hw/vfio/migration.c >>> +++ b/hw/vfio/migration.c >>> @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void >>> *opaque, Error **errp) >>> uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; >>> int ret; >>> >>> + /* Make a copy of this setting at the start in case it is >>> changed mid-migration */ >>> + migration->multifd_transfer = >>> vbasedev->migration_multifd_transfer; >> >> Should VFIO multifd be controlled by main migration multifd >> capability, and let the per VFIO device migration_multifd_transfer >> property be immutable and enabled by default? >> Then we would have a single point of configuration (and an extra one >> per VFIO device just to disable for backward compatibility). >> Unless there are other benefits to have this property configurable? > > We want multifd device state transfer property to be configurable > per-device > in case in the future we add another device type (besides VFIO) that > supports > multifd device state transfer. > > In this case, we might need to enable the multifd device state > transfer just > for VFIO devices, but not for this new device type when we are > migrating to a > QEMU target that supports just the VFIO multifd device state transfer. I think for this case we can use hw/core/machine.c:hw_compat_X_Y arrays [1]. [1] https://www.qemu.org/docs/master/devel/migration/compatibility.html#how-backwards-compatibility-works > > TBH, I'm not opposed to adding a additional global multifd device > state transfer > switch (if we keep the per-device ones too) but I am not sure what > value it adds. > >>> + >>> + if (migration->multifd_transfer && >>> !migration_has_device_state_support()) { >>> + error_setg(errp, >>> + "%s: Multifd device transfer requested but >>> unsupported in the current config", >>> + vbasedev->name); >>> + return -EINVAL; >>> + } >>> + >>> qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); >>> >>> vfio_query_stop_copy_size(vbasedev, &stop_copy_size); >>> @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void >>> *opaque) >>> static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) >>> { >>> VFIODevice *vbasedev = opaque; >>> + VFIOMigration *migration = vbasedev->migration; >>> ssize_t data_size; >>> int ret; >>> Error *local_err = NULL; >>> >>> + if (migration->multifd_transfer) { >>> + /* >>> + * Emit dummy NOP data, vfio_save_complete_precopy_thread() >>> + * does the actual transfer. >>> + */ >>> + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); >> >> There are three places where we send this dummy end of state, maybe >> worth extracting it to a helper? I.e., vfio_send_end_of_state() and >> then document there the rationale. > > I'm not totally against it but it's wrapping just a single line of > code in > a separate function? Yes, it's more for self-documentation purpose and for not duplicating comments. I guess it's a matter of taste, so we can go either way and let maintainer decide. > >>> + return 0; >>> + } >>> + >>> trace_vfio_save_complete_precopy_started(vbasedev->name); >>> >>> /* We reach here with device state STOP or STOP_COPY only */ >>> @@ -864,12 +884,159 @@ static int >>> vfio_save_complete_precopy(QEMUFile *f, void *opaque) >>> return ret; >>> } >>> >>> +static int >>> vfio_save_complete_precopy_async_thread_config_state(VFIODevice >>> *vbasedev, >>> + >>> char *idstr, >>> + uint32_t instance_id, >>> + uint32_t idx) >>> +{ >>> + g_autoptr(QIOChannelBuffer) bioc = NULL; >>> + QEMUFile *f = NULL; >>> + int ret; >>> + g_autofree VFIODeviceStatePacket *packet = NULL; >>> + size_t packet_len; >>> + >>> + bioc = qio_channel_buffer_new(0); >>> + qio_channel_set_name(QIO_CHANNEL(bioc), >>> "vfio-device-config-save"); >>> + >>> + f = qemu_file_new_output(QIO_CHANNEL(bioc)); >>> + >>> + ret = vfio_save_device_config_state(f, vbasedev, NULL); >>> + if (ret) { >>> + return ret; >> >> Need to close f in this case. > > Right - by the way, that's a good example why RAII > helps avoid such mistakes. Agreed :) > >>> + } >>> + >>> + ret = qemu_fflush(f); >>> + if (ret) { >>> + goto ret_close_file; >>> + } >>> + >>> + packet_len = sizeof(*packet) + bioc->usage; >>> + packet = g_malloc0(packet_len); >>> + packet->idx = idx; >>> + packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; >>> + memcpy(&packet->data, bioc->data, bioc->usage); >>> + >>> + if (!multifd_queue_device_state(idstr, instance_id, >>> + (char *)packet, packet_len)) { >>> + ret = -1; >> >> goto ret_close_file? > > Right, it would be better not to increment the counter in this case. > >>> + } >>> + >>> + bytes_transferred += packet_len; >> >> bytes_transferred is a global variable. Now that we access it from >> multiple threads it should be protected. > > Right, this stat needs some concurrent access protection. > >> Note that now the VFIO device data is reported also in multifd stats >> (if I am not mistaken), is this the behavior we want? Maybe we should >> enhance multifd stats to distinguish between RAM data and device data? > > Multifd stats report total size of data transferred via multifd so > they should include device state too. Yes I agree. But now we are reporting double the amount of VFIO data that we actually transfer (once in "vfio device transferred" and another in multifd stats) and this may be misleading. So maybe we should add a dedicated multifd device state counter and report VFIO multifd bytes there instead of in bytes_transferred? We can wait for other people's opinion about that. > > It may make sense to add a dedicated device state transfer counter > at some time though. > >>> + >>> +ret_close_file: >> >> Rename to "out" as we only have one exit point? >> >>> + g_clear_pointer(&f, qemu_fclose); >> >> f is a local variable, wouldn't qemu_fclose(f) be enough here? > > Sure, but why leave a dangling pointer? > > Currently, it is obviously a NOP (probably deleted by dead store > elimination anyway) but the code might get refactored at some point > and I think it's good practice to always NULL pointers after freeing > them where possible and so be on the safe side. Ack. Thanks.
On 9/12/24 10:26, Avihai Horon wrote: > > On 09/09/2024 21:07, Maciej S. Szmigiero wrote: >> External email: Use caution opening links or attachments >> >> >> On 9.09.2024 13:41, Avihai Horon wrote: >>> >>> On 27/08/2024 20:54, Maciej S. Szmigiero wrote: >>>> External email: Use caution opening links or attachments >>>> >>>> >>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >>>> >>>> Implement the multifd device state transfer via additional per-device >>>> thread inside save_live_complete_precopy_thread handler. >>>> >>>> Switch between doing the data transfer in the new handler and doing it >>>> in the old save_state handler depending on the >>>> x-migration-multifd-transfer device property value. >>>> >>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >>>> --- >>>> hw/vfio/migration.c | 169 ++++++++++++++++++++++++++++++++++ >>>> hw/vfio/trace-events | 2 + >>>> include/hw/vfio/vfio-common.h | 1 + >>>> 3 files changed, 172 insertions(+) >>>> >>>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c >>>> index 57c1542528dc..67996aa2df8b 100644 >>>> --- a/hw/vfio/migration.c >>>> +++ b/hw/vfio/migration.c >>>> @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp) >>>> uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; >>>> int ret; >>>> >>>> + /* Make a copy of this setting at the start in case it is changed mid-migration */ >>>> + migration->multifd_transfer = vbasedev->migration_multifd_transfer; >>> >>> Should VFIO multifd be controlled by main migration multifd capability, and let the per VFIO device migration_multifd_transfer property be immutable and enabled by default? >>> Then we would have a single point of configuration (and an extra one per VFIO device just to disable for backward compatibility). >>> Unless there are other benefits to have this property configurable? >> >> We want multifd device state transfer property to be configurable per-device >> in case in the future we add another device type (besides VFIO) that supports >> multifd device state transfer. >> >> In this case, we might need to enable the multifd device state transfer just >> for VFIO devices, but not for this new device type when we are migrating to a >> QEMU target that supports just the VFIO multifd device state transfer. > > I think for this case we can use hw/core/machine.c:hw_compat_X_Y arrays [1]. > > [1] https://www.qemu.org/docs/master/devel/migration/compatibility.html#how-backwards-compatibility-works > >> >> TBH, I'm not opposed to adding a additional global multifd device state transfer >> switch (if we keep the per-device ones too) but I am not sure what value it adds. >> >>>> + >>>> + if (migration->multifd_transfer && !migration_has_device_state_support()) { >>>> + error_setg(errp, >>>> + "%s: Multifd device transfer requested but unsupported in the current config", >>>> + vbasedev->name); >>>> + return -EINVAL; >>>> + } >>>> + >>>> qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); >>>> >>>> vfio_query_stop_copy_size(vbasedev, &stop_copy_size); >>>> @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque) >>>> static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) >>>> { >>>> VFIODevice *vbasedev = opaque; >>>> + VFIOMigration *migration = vbasedev->migration; >>>> ssize_t data_size; >>>> int ret; >>>> Error *local_err = NULL; >>>> >>>> + if (migration->multifd_transfer) { >>>> + /* >>>> + * Emit dummy NOP data, vfio_save_complete_precopy_thread() >>>> + * does the actual transfer. >>>> + */ >>>> + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); >>> >>> There are three places where we send this dummy end of state, maybe worth extracting it to a helper? I.e., vfio_send_end_of_state() and then document there the rationale. >> >> I'm not totally against it but it's wrapping just a single line of code in >> a separate function? > > Yes, it's more for self-documentation purpose and for not duplicating comments. > I guess it's a matter of taste, so we can go either way and let maintainer decide. I'd prefer an helper too. This comment applies to all additions in pre-existing code. Ideally new routines should have a 'vfio_{migration,save,load}_multifd' prefix so that the reader understands what the code is for. Thanks, C. > >> >>>> + return 0; >>>> + } >>>> + >>>> trace_vfio_save_complete_precopy_started(vbasedev->name); >>>> >>>> /* We reach here with device state STOP or STOP_COPY only */ >>>> @@ -864,12 +884,159 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) >>>> return ret; >>>> } >>>> >>>> +static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbasedev, >>>> + char *idstr, >>>> + uint32_t instance_id, >>>> + uint32_t idx) >>>> +{ >>>> + g_autoptr(QIOChannelBuffer) bioc = NULL; >>>> + QEMUFile *f = NULL; >>>> + int ret; >>>> + g_autofree VFIODeviceStatePacket *packet = NULL; >>>> + size_t packet_len; >>>> + >>>> + bioc = qio_channel_buffer_new(0); >>>> + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); >>>> + >>>> + f = qemu_file_new_output(QIO_CHANNEL(bioc)); >>>> + >>>> + ret = vfio_save_device_config_state(f, vbasedev, NULL); >>>> + if (ret) { >>>> + return ret; >>> >>> Need to close f in this case. >> >> Right - by the way, that's a good example why RAII >> helps avoid such mistakes. > > Agreed :) > >> >>>> + } >>>> + >>>> + ret = qemu_fflush(f); >>>> + if (ret) { >>>> + goto ret_close_file; >>>> + } >>>> + >>>> + packet_len = sizeof(*packet) + bioc->usage; >>>> + packet = g_malloc0(packet_len); >>>> + packet->idx = idx; >>>> + packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; >>>> + memcpy(&packet->data, bioc->data, bioc->usage); >>>> + >>>> + if (!multifd_queue_device_state(idstr, instance_id, >>>> + (char *)packet, packet_len)) { >>>> + ret = -1; >>> >>> goto ret_close_file? >> >> Right, it would be better not to increment the counter in this case. >> >>>> + } >>>> + >>>> + bytes_transferred += packet_len; >>> >>> bytes_transferred is a global variable. Now that we access it from multiple threads it should be protected. >> >> Right, this stat needs some concurrent access protection. >> >>> Note that now the VFIO device data is reported also in multifd stats (if I am not mistaken), is this the behavior we want? Maybe we should enhance multifd stats to distinguish between RAM data and device data? >> >> Multifd stats report total size of data transferred via multifd so >> they should include device state too. > > Yes I agree. But now we are reporting double the amount of VFIO data that we actually transfer (once in "vfio device transferred" and another in multifd stats) and this may be misleading. > So maybe we should add a dedicated multifd device state counter and report VFIO multifd bytes there instead of in bytes_transferred? > We can wait for other people's opinion about that. > >> >> It may make sense to add a dedicated device state transfer counter >> at some time though. >> >>>> + >>>> +ret_close_file: >>> >>> Rename to "out" as we only have one exit point? >>> >>>> + g_clear_pointer(&f, qemu_fclose); >>> >>> f is a local variable, wouldn't qemu_fclose(f) be enough here? >> >> Sure, but why leave a dangling pointer? >> >> Currently, it is obviously a NOP (probably deleted by dead store >> elimination anyway) but the code might get refactored at some point >> and I think it's good practice to always NULL pointers after freeing >> them where possible and so be on the safe side. > > Ack. > > Thanks. >
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 57c1542528dc..67996aa2df8b 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp) uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; int ret; + /* Make a copy of this setting at the start in case it is changed mid-migration */ + migration->multifd_transfer = vbasedev->migration_multifd_transfer; + + if (migration->multifd_transfer && !migration_has_device_state_support()) { + error_setg(errp, + "%s: Multifd device transfer requested but unsupported in the current config", + vbasedev->name); + return -EINVAL; + } + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); vfio_query_stop_copy_size(vbasedev, &stop_copy_size); @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque) static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; ssize_t data_size; int ret; Error *local_err = NULL; + if (migration->multifd_transfer) { + /* + * Emit dummy NOP data, vfio_save_complete_precopy_thread() + * does the actual transfer. + */ + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + return 0; + } + trace_vfio_save_complete_precopy_started(vbasedev->name); /* We reach here with device state STOP or STOP_COPY only */ @@ -864,12 +884,159 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) return ret; } +static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbasedev, + char *idstr, + uint32_t instance_id, + uint32_t idx) +{ + g_autoptr(QIOChannelBuffer) bioc = NULL; + QEMUFile *f = NULL; + int ret; + g_autofree VFIODeviceStatePacket *packet = NULL; + size_t packet_len; + + bioc = qio_channel_buffer_new(0); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); + + f = qemu_file_new_output(QIO_CHANNEL(bioc)); + + ret = vfio_save_device_config_state(f, vbasedev, NULL); + if (ret) { + return ret; + } + + ret = qemu_fflush(f); + if (ret) { + goto ret_close_file; + } + + packet_len = sizeof(*packet) + bioc->usage; + packet = g_malloc0(packet_len); + packet->idx = idx; + packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; + memcpy(&packet->data, bioc->data, bioc->usage); + + if (!multifd_queue_device_state(idstr, instance_id, + (char *)packet, packet_len)) { + ret = -1; + } + + bytes_transferred += packet_len; + +ret_close_file: + g_clear_pointer(&f, qemu_fclose); + return ret; +} + +static int vfio_save_complete_precopy_thread(char *idstr, + uint32_t instance_id, + bool *abort_flag, + void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + g_autofree VFIODeviceStatePacket *packet = NULL; + uint32_t idx; + + if (!migration->multifd_transfer) { + /* Nothing to do, vfio_save_complete_precopy() does the transfer. */ + return 0; + } + + trace_vfio_save_complete_precopy_thread_started(vbasedev->name, + idstr, instance_id); + + /* We reach here with device state STOP or STOP_COPY only */ + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, + VFIO_DEVICE_STATE_STOP, NULL); + if (ret) { + goto ret_finish; + } + + packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); + + for (idx = 0; ; idx++) { + ssize_t data_size; + size_t packet_size; + + if (qatomic_read(abort_flag)) { + ret = -ECANCELED; + goto ret_finish; + } + + data_size = read(migration->data_fd, &packet->data, + migration->data_buffer_size); + if (data_size < 0) { + if (errno != ENOMSG) { + ret = -errno; + goto ret_finish; + } + + /* + * Pre-copy emptied all the device state for now. For more information, + * please refer to the Linux kernel VFIO uAPI. + */ + data_size = 0; + } + + if (data_size == 0) + break; + + packet->idx = idx; + packet_size = sizeof(*packet) + data_size; + + if (!multifd_queue_device_state(idstr, instance_id, + (char *)packet, packet_size)) { + ret = -1; + goto ret_finish; + } + + bytes_transferred += packet_size; + } + + ret = vfio_save_complete_precopy_async_thread_config_state(vbasedev, idstr, + instance_id, + idx); + +ret_finish: + trace_vfio_save_complete_precopy_thread_finished(vbasedev->name, ret); + + return ret; +} + +static int vfio_save_complete_precopy_begin(QEMUFile *f, + char *idstr, uint32_t instance_id, + void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + + if (!migration->multifd_transfer) { + /* Emit dummy NOP data */ + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + return 0; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE); + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + return qemu_fflush(f); +} + static void vfio_save_state(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; Error *local_err = NULL; int ret; + if (migration->multifd_transfer) { + /* Emit dummy NOP data */ + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + return; + } + ret = vfio_save_device_config_state(f, opaque, &local_err); if (ret) { error_prepend(&local_err, @@ -1119,7 +1286,9 @@ static const SaveVMHandlers savevm_vfio_handlers = { .state_pending_exact = vfio_state_pending_exact, .is_active_iterate = vfio_is_active_iterate, .save_live_iterate = vfio_save_iterate, + .save_live_complete_precopy_begin = vfio_save_complete_precopy_begin, .save_live_complete_precopy = vfio_save_complete_precopy, + .save_live_complete_precopy_thread = vfio_save_complete_precopy_thread, .save_state = vfio_save_state, .load_setup = vfio_load_setup, .load_cleanup = vfio_load_cleanup, diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9d2519a28a7e..b1d9c9d5f2e1 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -167,6 +167,8 @@ vfio_save_block(const char *name, int data_size) " (%s) data_size %d" vfio_save_cleanup(const char *name) " (%s)" vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d" vfio_save_complete_precopy_started(const char *name) " (%s)" +vfio_save_complete_precopy_thread_started(const char *name, const char *idstr, uint32_t instance_id) " (%s) idstr %s instance %"PRIu32 +vfio_save_complete_precopy_thread_finished(const char *name, int ret) " (%s) ret %d" vfio_save_device_config_state(const char *name) " (%s)" vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 vfio_save_iterate_started(const char *name) " (%s)" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index fe05acb9a5d1..4578a0ca6a5c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -72,6 +72,7 @@ typedef struct VFIOMigration { uint64_t mig_flags; uint64_t precopy_init_size; uint64_t precopy_dirty_size; + bool multifd_transfer; bool initial_data_sent; bool save_iterate_run;