Message ID | fdcfd68dfcf3b20278a4495eb639905b2a8e8ff3.1724701542.git.maciej.szmigiero@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Multifd | expand |
"Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes: > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> > > A new function multifd_queue_device_state() is provided for device to queue > its state for transmission via a multifd channel. > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> > --- > include/migration/misc.h | 4 ++ > migration/meson.build | 1 + > migration/multifd-device-state.c | 99 ++++++++++++++++++++++++++++++++ > migration/multifd-nocomp.c | 6 +- > migration/multifd-qpl.c | 2 +- > migration/multifd-uadk.c | 2 +- > migration/multifd-zlib.c | 2 +- > migration/multifd-zstd.c | 2 +- > migration/multifd.c | 65 +++++++++++++++------ > migration/multifd.h | 29 +++++++++- > 10 files changed, 184 insertions(+), 28 deletions(-) > create mode 100644 migration/multifd-device-state.c > > diff --git a/include/migration/misc.h b/include/migration/misc.h > index bfadc5613bac..7266b1b77d1f 100644 > --- a/include/migration/misc.h > +++ b/include/migration/misc.h > @@ -111,4 +111,8 @@ bool migration_in_bg_snapshot(void); > /* migration/block-dirty-bitmap.c */ > void dirty_bitmap_mig_init(void); > > +/* migration/multifd-device-state.c */ > +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > + char *data, size_t len); > + > #endif > diff --git a/migration/meson.build b/migration/meson.build > index 77f3abf08eb1..00853595894f 100644 > --- a/migration/meson.build > +++ b/migration/meson.build > @@ -21,6 +21,7 @@ system_ss.add(files( > 'migration-hmp-cmds.c', > 'migration.c', > 'multifd.c', > + 'multifd-device-state.c', > 'multifd-nocomp.c', > 'multifd-zlib.c', > 'multifd-zero-page.c', > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c > new file mode 100644 > index 000000000000..c9b44f0b5ab9 > --- /dev/null > +++ b/migration/multifd-device-state.c > @@ -0,0 +1,99 @@ > +/* > + * Multifd device state migration > + * > + * Copyright (C) 2024 Oracle and/or its affiliates. 
> + * > + * This work is licensed under the terms of the GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + */ > + > +#include "qemu/osdep.h" > +#include "qemu/lockable.h" > +#include "migration/misc.h" > +#include "multifd.h" > + > +static QemuMutex queue_job_mutex; > + > +static MultiFDSendData *device_state_send; > + > +size_t multifd_device_state_payload_size(void) > +{ > + return sizeof(MultiFDDeviceState_t); > +} This will not be necessary because the payload size is the same as the data type. We only need it for the special case where the MultiFDPages_t is smaller than the total ram payload size. > + > +void multifd_device_state_save_setup(void) s/save/send/. The ram ones are only called "save" because they're called from ram_save_setup(), but we then have the proper nocomp_send_setup hook. > +{ > + qemu_mutex_init(&queue_job_mutex); > + > + device_state_send = multifd_send_data_alloc(); > +} > + > +void multifd_device_state_clear(MultiFDDeviceState_t *device_state) > +{ > + g_clear_pointer(&device_state->idstr, g_free); > + g_clear_pointer(&device_state->buf, g_free); > +} > + > +void multifd_device_state_save_cleanup(void) s/save/send/ > +{ > + g_clear_pointer(&device_state_send, multifd_send_data_free); > + > + qemu_mutex_destroy(&queue_job_mutex); > +} > + > +static void multifd_device_state_fill_packet(MultiFDSendParams *p) > +{ > + MultiFDDeviceState_t *device_state = &p->data->u.device_state; > + MultiFDPacketDeviceState_t *packet = p->packet_device_state; > + > + packet->hdr.flags = cpu_to_be32(p->flags); > + strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr)); > + packet->instance_id = cpu_to_be32(device_state->instance_id); > + packet->next_packet_size = cpu_to_be32(p->next_packet_size); > +} > + > +void multifd_device_state_send_prepare(MultiFDSendParams *p) > +{ > + MultiFDDeviceState_t *device_state = &p->data->u.device_state; > + > + assert(multifd_payload_device_state(p->data)); > + > + 
multifd_send_prepare_header_device_state(p); > + > + assert(!(p->flags & MULTIFD_FLAG_SYNC)); > + > + p->next_packet_size = device_state->buf_len; > + if (p->next_packet_size > 0) { > + p->iov[p->iovs_num].iov_base = device_state->buf; > + p->iov[p->iovs_num].iov_len = p->next_packet_size; > + p->iovs_num++; > + } > + > + p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE; > + > + multifd_device_state_fill_packet(p); > +} > + > +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > + char *data, size_t len) > +{ > + /* Device state submissions can come from multiple threads */ > + QEMU_LOCK_GUARD(&queue_job_mutex); > + MultiFDDeviceState_t *device_state; > + > + assert(multifd_payload_empty(device_state_send)); > + > + multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); > + device_state = &device_state_send->u.device_state; > + device_state->idstr = g_strdup(idstr); > + device_state->instance_id = instance_id; > + device_state->buf = g_memdup2(data, len); > + device_state->buf_len = len; > + > + if (!multifd_send(&device_state_send)) { > + multifd_send_data_clear(device_state_send); > + return false; > + } > + > + return true; > +} > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c > index 39eb77c9b3b7..0b7b543f44db 100644 > --- a/migration/multifd-nocomp.c > +++ b/migration/multifd-nocomp.c > @@ -116,13 +116,13 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp) > * Only !zerocopy needs the header in IOV; zerocopy will > * send it separately. 
> */ > - multifd_send_prepare_header(p); > + multifd_send_prepare_header_ram(p); > } > > multifd_send_prepare_iovs(p); > p->flags |= MULTIFD_FLAG_NOCOMP; > > - multifd_send_fill_packet(p); > + multifd_send_fill_packet_ram(p); > > if (use_zero_copy_send) { > /* Send header first, without zerocopy */ > @@ -371,7 +371,7 @@ bool multifd_send_prepare_common(MultiFDSendParams *p) > return false; > } > > - multifd_send_prepare_header(p); > + multifd_send_prepare_header_ram(p); > > return true; > } > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c > index 75041a4c4dfe..bd6b5b6a3868 100644 > --- a/migration/multifd-qpl.c > +++ b/migration/multifd-qpl.c > @@ -490,7 +490,7 @@ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) > > out: > p->flags |= MULTIFD_FLAG_QPL; > - multifd_send_fill_packet(p); > + multifd_send_fill_packet_ram(p); > return 0; > } > > diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c > index db2549f59bfe..6e2d26010742 100644 > --- a/migration/multifd-uadk.c > +++ b/migration/multifd-uadk.c > @@ -198,7 +198,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) > } > out: > p->flags |= MULTIFD_FLAG_UADK; > - multifd_send_fill_packet(p); > + multifd_send_fill_packet_ram(p); > return 0; > } > > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c > index 6787538762d2..62a1fe59ad3e 100644 > --- a/migration/multifd-zlib.c > +++ b/migration/multifd-zlib.c > @@ -156,7 +156,7 @@ static int multifd_zlib_send_prepare(MultiFDSendParams *p, Error **errp) > > out: > p->flags |= MULTIFD_FLAG_ZLIB; > - multifd_send_fill_packet(p); > + multifd_send_fill_packet_ram(p); > return 0; > } > > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c > index 1576b1e2adc6..f98b07e7f9f5 100644 > --- a/migration/multifd-zstd.c > +++ b/migration/multifd-zstd.c > @@ -143,7 +143,7 @@ static int multifd_zstd_send_prepare(MultiFDSendParams *p, Error **errp) > > out: > p->flags |= 
MULTIFD_FLAG_ZSTD; > - multifd_send_fill_packet(p); > + multifd_send_fill_packet_ram(p); > return 0; > } > > diff --git a/migration/multifd.c b/migration/multifd.c > index a74e8a5cc891..bebe5b5a9b9c 100644 > --- a/migration/multifd.c > +++ b/migration/multifd.c > @@ -12,6 +12,7 @@ > > #include "qemu/osdep.h" > #include "qemu/cutils.h" > +#include "qemu/iov.h" > #include "qemu/rcu.h" > #include "exec/target_page.h" > #include "sysemu/sysemu.h" > @@ -19,6 +20,7 @@ > #include "qemu/error-report.h" > #include "qapi/error.h" > #include "file.h" > +#include "migration/misc.h" > #include "migration.h" > #include "migration-stats.h" > #include "savevm.h" > @@ -107,7 +109,9 @@ MultiFDSendData *multifd_send_data_alloc(void) > * added to the union in the future are larger than > * (MultiFDPages_t + flex array). > */ > - max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload)); > + max_payload_size = MAX(multifd_ram_payload_size(), > + multifd_device_state_payload_size()); This is not needed, the sizeof(MultiFDPayload) below already has the same effect. > + max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); > > /* > * Account for any holes the compiler might insert. We can't pack > @@ -126,6 +130,9 @@ void multifd_send_data_clear(MultiFDSendData *data) > } > > switch (data->type) { > + case MULTIFD_PAYLOAD_DEVICE_STATE: > + multifd_device_state_clear(&data->u.device_state); > + break; > default: > /* Nothing to do */ > break; > @@ -228,7 +235,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) > return msg.id; > } > > -void multifd_send_fill_packet(MultiFDSendParams *p) > +void multifd_send_fill_packet_ram(MultiFDSendParams *p) Do we need this change if there's no counterpart for device_state? It might be less confusing to just leave this one as it is. 
> { > MultiFDPacket_t *packet = p->packet; > uint64_t packet_num; > @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data) > > p = &multifd_send_state->params[i]; > /* > - * Lockless read to p->pending_job is safe, because only multifd > - * sender thread can clear it. > + * Lockless RMW on p->pending_job_preparing is safe, because only multifd > + * sender thread can clear it after it had seen p->pending_job being set. > + * > + * Pairs with qatomic_store_release() in multifd_send_thread(). > */ > - if (qatomic_read(&p->pending_job) == false) { > + if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) { What's the motivation for this change? It would be better to have it in a separate patch with a proper justification. > break; > } > } > > - /* > - * Make sure we read p->pending_job before all the rest. Pairs with > - * qatomic_store_release() in multifd_send_thread(). > - */ > - smp_mb_acquire(); > - > assert(multifd_payload_empty(p->data)); > > /* > @@ -534,6 +537,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) > p->name = NULL; > g_clear_pointer(&p->data, multifd_send_data_free); > p->packet_len = 0; > + g_clear_pointer(&p->packet_device_state, g_free); > g_free(p->packet); > p->packet = NULL; > multifd_send_state->ops->send_cleanup(p, errp); > @@ -545,6 +549,7 @@ static void multifd_send_cleanup_state(void) > { > file_cleanup_outgoing_migration(); > socket_cleanup_outgoing_migration(); > + multifd_device_state_save_cleanup(); > qemu_sem_destroy(&multifd_send_state->channels_created); > qemu_sem_destroy(&multifd_send_state->channels_ready); > g_free(multifd_send_state->params); > @@ -670,19 +675,29 @@ static void *multifd_send_thread(void *opaque) > * qatomic_store_release() in multifd_send(). 
> */ > if (qatomic_load_acquire(&p->pending_job)) { > + bool is_device_state = multifd_payload_device_state(p->data); > + size_t total_size; > + > p->flags = 0; > p->iovs_num = 0; > assert(!multifd_payload_empty(p->data)); > > - ret = multifd_send_state->ops->send_prepare(p, &local_err); > - if (ret != 0) { > - break; > + if (is_device_state) { > + multifd_device_state_send_prepare(p); > + } else { > + ret = multifd_send_state->ops->send_prepare(p, &local_err); > + if (ret != 0) { > + break; > + } > } > > if (migrate_mapped_ram()) { > + assert(!is_device_state); > + > ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, > &p->data->u.ram, &local_err); > } else { > + total_size = iov_size(p->iov, p->iovs_num); > ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, > NULL, 0, p->write_flags, > &local_err); > @@ -692,18 +707,27 @@ static void *multifd_send_thread(void *opaque) > break; > } > > - stat64_add(&mig_stats.multifd_bytes, > - p->next_packet_size + p->packet_len); > + if (is_device_state) { > + stat64_add(&mig_stats.multifd_bytes, total_size); > + } else { > + /* > + * Can't just always add total_size since IOVs do not include > + * packet header in the zerocopy RAM case. > + */ > + stat64_add(&mig_stats.multifd_bytes, > + p->next_packet_size + p->packet_len); You could set total_size for both branches after send_prepare and use it here unconditionally. > + } > > p->next_packet_size = 0; > multifd_send_data_clear(p->data); > > /* > * Making sure p->data is published before saying "we're > - * free". Pairs with the smp_mb_acquire() in > + * free". Pairs with the qatomic_cmpxchg() in > * multifd_send(). > */ > qatomic_store_release(&p->pending_job, false); > + qatomic_store_release(&p->pending_job_preparing, false); > } else { > /* > * If not a normal job, must be a sync request. 
Note that > @@ -714,7 +738,7 @@ static void *multifd_send_thread(void *opaque) > > if (use_packets) { > p->flags = MULTIFD_FLAG_SYNC; > - multifd_send_fill_packet(p); > + multifd_send_fill_packet_ram(p); > ret = qio_channel_write_all(p->c, (void *)p->packet, > p->packet_len, &local_err); > if (ret != 0) { > @@ -910,6 +934,9 @@ bool multifd_send_setup(void) > p->packet_len = sizeof(MultiFDPacket_t) > + sizeof(uint64_t) * page_count; > p->packet = g_malloc0(p->packet_len); > + p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state)); > + p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); > + p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION); > } > p->name = g_strdup_printf("mig/src/send_%d", i); > p->write_flags = 0; > @@ -944,6 +971,8 @@ bool multifd_send_setup(void) > } > } > > + multifd_device_state_save_setup(); > + > return true; > > err: > diff --git a/migration/multifd.h b/migration/multifd.h > index a0853622153e..c15c83104c8b 100644 > --- a/migration/multifd.h > +++ b/migration/multifd.h > @@ -120,10 +120,12 @@ typedef struct { > typedef enum { > MULTIFD_PAYLOAD_NONE, > MULTIFD_PAYLOAD_RAM, > + MULTIFD_PAYLOAD_DEVICE_STATE, > } MultiFDPayloadType; > > typedef union MultiFDPayload { > MultiFDPages_t ram; > + MultiFDDeviceState_t device_state; > } MultiFDPayload; > > struct MultiFDSendData { > @@ -136,6 +138,11 @@ static inline bool multifd_payload_empty(MultiFDSendData *data) > return data->type == MULTIFD_PAYLOAD_NONE; > } > > +static inline bool multifd_payload_device_state(MultiFDSendData *data) > +{ > + return data->type == MULTIFD_PAYLOAD_DEVICE_STATE; > +} > + > static inline void multifd_set_payload_type(MultiFDSendData *data, > MultiFDPayloadType type) > { > @@ -182,13 +189,15 @@ typedef struct { > * cleared by the multifd sender threads. > */ > bool pending_job; > + bool pending_job_preparing; > bool pending_sync; > MultiFDSendData *data; > > /* thread local variables. 
No locking required */ > > - /* pointer to the packet */ > + /* pointers to the possible packet types */ > MultiFDPacket_t *packet; > + MultiFDPacketDeviceState_t *packet_device_state; > /* size of the next packet that contains pages */ > uint32_t next_packet_size; > /* packets sent through this channel */ > @@ -276,18 +285,25 @@ typedef struct { > } MultiFDMethods; > > void multifd_register_ops(int method, MultiFDMethods *ops); > -void multifd_send_fill_packet(MultiFDSendParams *p); > +void multifd_send_fill_packet_ram(MultiFDSendParams *p); > bool multifd_send_prepare_common(MultiFDSendParams *p); > void multifd_send_zero_page_detect(MultiFDSendParams *p); > void multifd_recv_zero_page_process(MultiFDRecvParams *p); > > -static inline void multifd_send_prepare_header(MultiFDSendParams *p) > +static inline void multifd_send_prepare_header_ram(MultiFDSendParams *p) This could instead go to multifd-nocomp.c and become multifd_ram_prepare_header. > { > p->iov[0].iov_len = p->packet_len; > p->iov[0].iov_base = p->packet; > p->iovs_num++; > } > > +static inline void multifd_send_prepare_header_device_state(MultiFDSendParams *p) Seem like this could also move to multifd-device-state.c and drop the "send" part. 
> +{ > + p->iov[0].iov_len = sizeof(*p->packet_device_state); > + p->iov[0].iov_base = p->packet_device_state; > + p->iovs_num++; > +} > + > void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); > bool multifd_send(MultiFDSendData **send_data); > MultiFDSendData *multifd_send_data_alloc(void); > @@ -310,4 +326,11 @@ int multifd_ram_flush_and_sync(void); > size_t multifd_ram_payload_size(void); > void multifd_ram_fill_packet(MultiFDSendParams *p); > int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); > + > +size_t multifd_device_state_payload_size(void); > +void multifd_device_state_save_setup(void); > +void multifd_device_state_clear(MultiFDDeviceState_t *device_state); > +void multifd_device_state_save_cleanup(void); > +void multifd_device_state_send_prepare(MultiFDSendParams *p); > + > #endif
On 29.08.2024 02:41, Fabiano Rosas wrote: > "Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes: > >> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >> >> A new function multifd_queue_device_state() is provided for device to queue >> its state for transmission via a multifd channel. >> >> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >> --- >> include/migration/misc.h | 4 ++ >> migration/meson.build | 1 + >> migration/multifd-device-state.c | 99 ++++++++++++++++++++++++++++++++ >> migration/multifd-nocomp.c | 6 +- >> migration/multifd-qpl.c | 2 +- >> migration/multifd-uadk.c | 2 +- >> migration/multifd-zlib.c | 2 +- >> migration/multifd-zstd.c | 2 +- >> migration/multifd.c | 65 +++++++++++++++------ >> migration/multifd.h | 29 +++++++++- >> 10 files changed, 184 insertions(+), 28 deletions(-) >> create mode 100644 migration/multifd-device-state.c >> >> diff --git a/include/migration/misc.h b/include/migration/misc.h >> index bfadc5613bac..7266b1b77d1f 100644 >> --- a/include/migration/misc.h >> +++ b/include/migration/misc.h >> @@ -111,4 +111,8 @@ bool migration_in_bg_snapshot(void); >> /* migration/block-dirty-bitmap.c */ >> void dirty_bitmap_mig_init(void); >> >> +/* migration/multifd-device-state.c */ >> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, >> + char *data, size_t len); >> + >> #endif >> diff --git a/migration/meson.build b/migration/meson.build >> index 77f3abf08eb1..00853595894f 100644 >> --- a/migration/meson.build >> +++ b/migration/meson.build >> @@ -21,6 +21,7 @@ system_ss.add(files( >> 'migration-hmp-cmds.c', >> 'migration.c', >> 'multifd.c', >> + 'multifd-device-state.c', >> 'multifd-nocomp.c', >> 'multifd-zlib.c', >> 'multifd-zero-page.c', >> diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c >> new file mode 100644 >> index 000000000000..c9b44f0b5ab9 >> --- /dev/null >> +++ b/migration/multifd-device-state.c >> @@ -0,0 +1,99 @@ >> +/* >> + * Multifd device 
state migration >> + * >> + * Copyright (C) 2024 Oracle and/or its affiliates. >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2 or later. >> + * See the COPYING file in the top-level directory. >> + */ >> + >> +#include "qemu/osdep.h" >> +#include "qemu/lockable.h" >> +#include "migration/misc.h" >> +#include "multifd.h" >> + >> +static QemuMutex queue_job_mutex; >> + >> +static MultiFDSendData *device_state_send; >> + >> +size_t multifd_device_state_payload_size(void) >> +{ >> + return sizeof(MultiFDDeviceState_t); >> +} > > This will not be necessary because the payload size is the same as the > data type. We only need it for the special case where the MultiFDPages_t > is smaller than the total ram payload size. I know - I just wanted to make the API consistent with the one RAM handler provides since these multifd_send_data_alloc() calls are done just once per migration - it isn't any kind of a hot path. >> + >> +void multifd_device_state_save_setup(void) > > s/save/send/. The ram ones are only called "save" because they're called > from ram_save_setup(), but we then have the proper nocomp_send_setup > hook. Ack. >> +{ >> + qemu_mutex_init(&queue_job_mutex); >> + >> + device_state_send = multifd_send_data_alloc(); >> +} >> + >> +void multifd_device_state_clear(MultiFDDeviceState_t *device_state) >> +{ >> + g_clear_pointer(&device_state->idstr, g_free); >> + g_clear_pointer(&device_state->buf, g_free); >> +} >> + >> +void multifd_device_state_save_cleanup(void) > > s/save/send/ Ack. 
>> +{ >> + g_clear_pointer(&device_state_send, multifd_send_data_free); >> + >> + qemu_mutex_destroy(&queue_job_mutex); >> +} >> + >> +static void multifd_device_state_fill_packet(MultiFDSendParams *p) >> +{ >> + MultiFDDeviceState_t *device_state = &p->data->u.device_state; >> + MultiFDPacketDeviceState_t *packet = p->packet_device_state; >> + >> + packet->hdr.flags = cpu_to_be32(p->flags); >> + strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr)); >> + packet->instance_id = cpu_to_be32(device_state->instance_id); >> + packet->next_packet_size = cpu_to_be32(p->next_packet_size); >> +} >> + >> +void multifd_device_state_send_prepare(MultiFDSendParams *p) >> +{ >> + MultiFDDeviceState_t *device_state = &p->data->u.device_state; >> + >> + assert(multifd_payload_device_state(p->data)); >> + >> + multifd_send_prepare_header_device_state(p); >> + >> + assert(!(p->flags & MULTIFD_FLAG_SYNC)); >> + >> + p->next_packet_size = device_state->buf_len; >> + if (p->next_packet_size > 0) { >> + p->iov[p->iovs_num].iov_base = device_state->buf; >> + p->iov[p->iovs_num].iov_len = p->next_packet_size; >> + p->iovs_num++; >> + } >> + >> + p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE; >> + >> + multifd_device_state_fill_packet(p); >> +} >> + >> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, >> + char *data, size_t len) >> +{ >> + /* Device state submissions can come from multiple threads */ >> + QEMU_LOCK_GUARD(&queue_job_mutex); >> + MultiFDDeviceState_t *device_state; >> + >> + assert(multifd_payload_empty(device_state_send)); >> + >> + multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); >> + device_state = &device_state_send->u.device_state; >> + device_state->idstr = g_strdup(idstr); >> + device_state->instance_id = instance_id; >> + device_state->buf = g_memdup2(data, len); >> + device_state->buf_len = len; >> + >> + if (!multifd_send(&device_state_send)) { >> + multifd_send_data_clear(device_state_send); 
>> + return false; >> + } >> + >> + return true; >> +} >> diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c >> index 39eb77c9b3b7..0b7b543f44db 100644 >> --- a/migration/multifd-nocomp.c >> +++ b/migration/multifd-nocomp.c >> @@ -116,13 +116,13 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp) >> * Only !zerocopy needs the header in IOV; zerocopy will >> * send it separately. >> */ >> - multifd_send_prepare_header(p); >> + multifd_send_prepare_header_ram(p); >> } >> >> multifd_send_prepare_iovs(p); >> p->flags |= MULTIFD_FLAG_NOCOMP; >> >> - multifd_send_fill_packet(p); >> + multifd_send_fill_packet_ram(p); >> >> if (use_zero_copy_send) { >> /* Send header first, without zerocopy */ >> @@ -371,7 +371,7 @@ bool multifd_send_prepare_common(MultiFDSendParams *p) >> return false; >> } >> >> - multifd_send_prepare_header(p); >> + multifd_send_prepare_header_ram(p); >> >> return true; >> } >> diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c >> index 75041a4c4dfe..bd6b5b6a3868 100644 >> --- a/migration/multifd-qpl.c >> +++ b/migration/multifd-qpl.c >> @@ -490,7 +490,7 @@ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) >> >> out: >> p->flags |= MULTIFD_FLAG_QPL; >> - multifd_send_fill_packet(p); >> + multifd_send_fill_packet_ram(p); >> return 0; >> } >> >> diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c >> index db2549f59bfe..6e2d26010742 100644 >> --- a/migration/multifd-uadk.c >> +++ b/migration/multifd-uadk.c >> @@ -198,7 +198,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) >> } >> out: >> p->flags |= MULTIFD_FLAG_UADK; >> - multifd_send_fill_packet(p); >> + multifd_send_fill_packet_ram(p); >> return 0; >> } >> >> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c >> index 6787538762d2..62a1fe59ad3e 100644 >> --- a/migration/multifd-zlib.c >> +++ b/migration/multifd-zlib.c >> @@ -156,7 +156,7 @@ static int 
multifd_zlib_send_prepare(MultiFDSendParams *p, Error **errp) >> >> out: >> p->flags |= MULTIFD_FLAG_ZLIB; >> - multifd_send_fill_packet(p); >> + multifd_send_fill_packet_ram(p); >> return 0; >> } >> >> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c >> index 1576b1e2adc6..f98b07e7f9f5 100644 >> --- a/migration/multifd-zstd.c >> +++ b/migration/multifd-zstd.c >> @@ -143,7 +143,7 @@ static int multifd_zstd_send_prepare(MultiFDSendParams *p, Error **errp) >> >> out: >> p->flags |= MULTIFD_FLAG_ZSTD; >> - multifd_send_fill_packet(p); >> + multifd_send_fill_packet_ram(p); >> return 0; >> } >> >> diff --git a/migration/multifd.c b/migration/multifd.c >> index a74e8a5cc891..bebe5b5a9b9c 100644 >> --- a/migration/multifd.c >> +++ b/migration/multifd.c >> @@ -12,6 +12,7 @@ >> >> #include "qemu/osdep.h" >> #include "qemu/cutils.h" >> +#include "qemu/iov.h" >> #include "qemu/rcu.h" >> #include "exec/target_page.h" >> #include "sysemu/sysemu.h" >> @@ -19,6 +20,7 @@ >> #include "qemu/error-report.h" >> #include "qapi/error.h" >> #include "file.h" >> +#include "migration/misc.h" >> #include "migration.h" >> #include "migration-stats.h" >> #include "savevm.h" >> @@ -107,7 +109,9 @@ MultiFDSendData *multifd_send_data_alloc(void) >> * added to the union in the future are larger than >> * (MultiFDPages_t + flex array). >> */ >> - max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload)); >> + max_payload_size = MAX(multifd_ram_payload_size(), >> + multifd_device_state_payload_size()); > > This is not needed, the sizeof(MultiFDPayload) below already has the > same effect. Same as above, I think it's good for consistency, but I don't mind removing it either (maybe by replacing it with a comment describing that it isn't currently needed). >> + max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); >> >> /* >> * Account for any holes the compiler might insert. 
>> We can't pack >> @@ -126,6 +130,9 @@ void multifd_send_data_clear(MultiFDSendData *data) >> { >> } >> >> switch (data->type) { >> + case MULTIFD_PAYLOAD_DEVICE_STATE: >> + multifd_device_state_clear(&data->u.device_state); >> + break; >> default: >> /* Nothing to do */ >> break; >> @@ -228,7 +235,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) >> return msg.id; >> } >> >> -void multifd_send_fill_packet(MultiFDSendParams *p) >> +void multifd_send_fill_packet_ram(MultiFDSendParams *p) > > Do we need this change if there's no counterpart for device_state? It > might be less confusing to just leave this one as it is. Not really, will drop this change in the next patch set version. >> { >> MultiFDPacket_t *packet = p->packet; >> uint64_t packet_num; >> @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data) >> >> p = &multifd_send_state->params[i]; >> /* >> - * Lockless read to p->pending_job is safe, because only multifd >> - * sender thread can clear it. >> + * Lockless RMW on p->pending_job_preparing is safe, because only multifd >> + * sender thread can clear it after it had seen p->pending_job being set. >> + * >> + * Pairs with qatomic_store_release() in multifd_send_thread(). >> */ >> - if (qatomic_read(&p->pending_job) == false) { >> + if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) { > > What's the motivation for this change? It would be better to have it in > a separate patch with a proper justification. The original RFC patch set used dedicated device state multifd channels. Peter and other people wanted this functionality removed; however, this caused a performance (downtime) regression. One of the things that seemed to help mitigate this regression was making the multifd channel selection more fair via this change. But I can split it out to a separate commit in the next patch set version and then see what performance improvement it currently brings. 
>> break; >> } >> } >> >> - /* >> - * Make sure we read p->pending_job before all the rest. Pairs with >> - * qatomic_store_release() in multifd_send_thread(). >> - */ >> - smp_mb_acquire(); >> - >> assert(multifd_payload_empty(p->data)); >> >> /* >> @@ -534,6 +537,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) >> p->name = NULL; >> g_clear_pointer(&p->data, multifd_send_data_free); >> p->packet_len = 0; >> + g_clear_pointer(&p->packet_device_state, g_free); >> g_free(p->packet); >> p->packet = NULL; >> multifd_send_state->ops->send_cleanup(p, errp); >> @@ -545,6 +549,7 @@ static void multifd_send_cleanup_state(void) >> { >> file_cleanup_outgoing_migration(); >> socket_cleanup_outgoing_migration(); >> + multifd_device_state_save_cleanup(); >> qemu_sem_destroy(&multifd_send_state->channels_created); >> qemu_sem_destroy(&multifd_send_state->channels_ready); >> g_free(multifd_send_state->params); >> @@ -670,19 +675,29 @@ static void *multifd_send_thread(void *opaque) >> * qatomic_store_release() in multifd_send(). 
>> */ >> if (qatomic_load_acquire(&p->pending_job)) { >> + bool is_device_state = multifd_payload_device_state(p->data); >> + size_t total_size; >> + >> p->flags = 0; >> p->iovs_num = 0; >> assert(!multifd_payload_empty(p->data)); >> >> - ret = multifd_send_state->ops->send_prepare(p, &local_err); >> - if (ret != 0) { >> - break; >> + if (is_device_state) { >> + multifd_device_state_send_prepare(p); >> + } else { >> + ret = multifd_send_state->ops->send_prepare(p, &local_err); >> + if (ret != 0) { >> + break; >> + } >> } >> >> if (migrate_mapped_ram()) { >> + assert(!is_device_state); >> + >> ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, >> &p->data->u.ram, &local_err); >> } else { >> + total_size = iov_size(p->iov, p->iovs_num); >> ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, >> NULL, 0, p->write_flags, >> &local_err); >> @@ -692,18 +707,27 @@ static void *multifd_send_thread(void *opaque) >> break; >> } >> >> - stat64_add(&mig_stats.multifd_bytes, >> - p->next_packet_size + p->packet_len); >> + if (is_device_state) { >> + stat64_add(&mig_stats.multifd_bytes, total_size); >> + } else { >> + /* >> + * Can't just always add total_size since IOVs do not include >> + * packet header in the zerocopy RAM case. >> + */ >> + stat64_add(&mig_stats.multifd_bytes, >> + p->next_packet_size + p->packet_len); > > You could set total_size for both branches after send_prepare and use it > here unconditionally. Ack. >> + } >> >> p->next_packet_size = 0; >> multifd_send_data_clear(p->data); >> >> /* >> * Making sure p->data is published before saying "we're >> - * free". Pairs with the smp_mb_acquire() in >> + * free". Pairs with the qatomic_cmpxchg() in >> * multifd_send(). >> */ >> qatomic_store_release(&p->pending_job, false); >> + qatomic_store_release(&p->pending_job_preparing, false); >> } else { >> /* >> * If not a normal job, must be a sync request. 
Note that >> @@ -714,7 +738,7 @@ static void *multifd_send_thread(void *opaque) >> >> if (use_packets) { >> p->flags = MULTIFD_FLAG_SYNC; >> - multifd_send_fill_packet(p); >> + multifd_send_fill_packet_ram(p); >> ret = qio_channel_write_all(p->c, (void *)p->packet, >> p->packet_len, &local_err); >> if (ret != 0) { >> @@ -910,6 +934,9 @@ bool multifd_send_setup(void) >> p->packet_len = sizeof(MultiFDPacket_t) >> + sizeof(uint64_t) * page_count; >> p->packet = g_malloc0(p->packet_len); >> + p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state)); >> + p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); >> + p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION); >> } >> p->name = g_strdup_printf("mig/src/send_%d", i); >> p->write_flags = 0; >> @@ -944,6 +971,8 @@ bool multifd_send_setup(void) >> } >> } >> >> + multifd_device_state_save_setup(); >> + >> return true; >> >> err: >> diff --git a/migration/multifd.h b/migration/multifd.h >> index a0853622153e..c15c83104c8b 100644 >> --- a/migration/multifd.h >> +++ b/migration/multifd.h >> @@ -120,10 +120,12 @@ typedef struct { >> typedef enum { >> MULTIFD_PAYLOAD_NONE, >> MULTIFD_PAYLOAD_RAM, >> + MULTIFD_PAYLOAD_DEVICE_STATE, >> } MultiFDPayloadType; >> >> typedef union MultiFDPayload { >> MultiFDPages_t ram; >> + MultiFDDeviceState_t device_state; >> } MultiFDPayload; >> >> struct MultiFDSendData { >> @@ -136,6 +138,11 @@ static inline bool multifd_payload_empty(MultiFDSendData *data) >> return data->type == MULTIFD_PAYLOAD_NONE; >> } >> >> +static inline bool multifd_payload_device_state(MultiFDSendData *data) >> +{ >> + return data->type == MULTIFD_PAYLOAD_DEVICE_STATE; >> +} >> + >> static inline void multifd_set_payload_type(MultiFDSendData *data, >> MultiFDPayloadType type) >> { >> @@ -182,13 +189,15 @@ typedef struct { >> * cleared by the multifd sender threads. 
>> */ >> bool pending_job; >> + bool pending_job_preparing; >> bool pending_sync; >> MultiFDSendData *data; >> >> /* thread local variables. No locking required */ >> >> - /* pointer to the packet */ >> + /* pointers to the possible packet types */ >> MultiFDPacket_t *packet; >> + MultiFDPacketDeviceState_t *packet_device_state; >> /* size of the next packet that contains pages */ >> uint32_t next_packet_size; >> /* packets sent through this channel */ >> @@ -276,18 +285,25 @@ typedef struct { >> } MultiFDMethods; >> >> void multifd_register_ops(int method, MultiFDMethods *ops); >> -void multifd_send_fill_packet(MultiFDSendParams *p); >> +void multifd_send_fill_packet_ram(MultiFDSendParams *p); >> bool multifd_send_prepare_common(MultiFDSendParams *p); >> void multifd_send_zero_page_detect(MultiFDSendParams *p); >> void multifd_recv_zero_page_process(MultiFDRecvParams *p); >> >> -static inline void multifd_send_prepare_header(MultiFDSendParams *p) >> +static inline void multifd_send_prepare_header_ram(MultiFDSendParams *p) > > This could instead go to multifd-nocomp.c and become multifd_ram_prepare_header. Ack. >> { >> p->iov[0].iov_len = p->packet_len; >> p->iov[0].iov_base = p->packet; >> p->iovs_num++; >> } >> >> +static inline void multifd_send_prepare_header_device_state(MultiFDSendParams *p) > > Seem like this could also move to multifd-device-state.c and drop the > "send" part. > Ack. Thanks, Maciej
"Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes: > On 29.08.2024 02:41, Fabiano Rosas wrote: >> "Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes: >> >>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com> >>> >>> A new function multifd_queue_device_state() is provided for device to queue >>> its state for transmission via a multifd channel. >>> >>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> >>> --- >>> include/migration/misc.h | 4 ++ >>> migration/meson.build | 1 + >>> migration/multifd-device-state.c | 99 ++++++++++++++++++++++++++++++++ >>> migration/multifd-nocomp.c | 6 +- >>> migration/multifd-qpl.c | 2 +- >>> migration/multifd-uadk.c | 2 +- >>> migration/multifd-zlib.c | 2 +- >>> migration/multifd-zstd.c | 2 +- >>> migration/multifd.c | 65 +++++++++++++++------ >>> migration/multifd.h | 29 +++++++++- >>> 10 files changed, 184 insertions(+), 28 deletions(-) >>> create mode 100644 migration/multifd-device-state.c >>> >>> diff --git a/include/migration/misc.h b/include/migration/misc.h >>> index bfadc5613bac..7266b1b77d1f 100644 >>> --- a/include/migration/misc.h >>> +++ b/include/migration/misc.h >>> @@ -111,4 +111,8 @@ bool migration_in_bg_snapshot(void); >>> /* migration/block-dirty-bitmap.c */ >>> void dirty_bitmap_mig_init(void); >>> >>> +/* migration/multifd-device-state.c */ >>> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, >>> + char *data, size_t len); >>> + >>> #endif >>> diff --git a/migration/meson.build b/migration/meson.build >>> index 77f3abf08eb1..00853595894f 100644 >>> --- a/migration/meson.build >>> +++ b/migration/meson.build >>> @@ -21,6 +21,7 @@ system_ss.add(files( >>> 'migration-hmp-cmds.c', >>> 'migration.c', >>> 'multifd.c', >>> + 'multifd-device-state.c', >>> 'multifd-nocomp.c', >>> 'multifd-zlib.c', >>> 'multifd-zero-page.c', >>> diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c >>> new file mode 100644 >>> index 
000000000000..c9b44f0b5ab9 >>> --- /dev/null >>> +++ b/migration/multifd-device-state.c >>> @@ -0,0 +1,99 @@ >>> +/* >>> + * Multifd device state migration >>> + * >>> + * Copyright (C) 2024 Oracle and/or its affiliates. >>> + * >>> + * This work is licensed under the terms of the GNU GPL, version 2 or later. >>> + * See the COPYING file in the top-level directory. >>> + */ >>> + >>> +#include "qemu/osdep.h" >>> +#include "qemu/lockable.h" >>> +#include "migration/misc.h" >>> +#include "multifd.h" >>> + >>> +static QemuMutex queue_job_mutex; >>> + >>> +static MultiFDSendData *device_state_send; >>> + >>> +size_t multifd_device_state_payload_size(void) >>> +{ >>> + return sizeof(MultiFDDeviceState_t); >>> +} >> >> This will not be necessary because the payload size is the same as the >> data type. We only need it for the special case where the MultiFDPages_t >> is smaller than the total ram payload size. > > I know - I just wanted to make the API consistent with the one RAM > handler provides since these multifd_send_data_alloc() calls are done > just once per migration - it isn't any kind of a hot path. > I think the array at the end of MultiFDPages_t should be considered enough of a hack that we might want to keep anything related to it outside of the interface. Other clients shouldn't have to think about that at all. >>> @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data) >>> >>> p = &multifd_send_state->params[i]; >>> /* >>> - * Lockless read to p->pending_job is safe, because only multifd >>> - * sender thread can clear it. >>> + * Lockless RMW on p->pending_job_preparing is safe, because only multifd >>> + * sender thread can clear it after it had seen p->pending_job being set. >>> + * >>> + * Pairs with qatomic_store_release() in multifd_send_thread(). >>> */ >>> - if (qatomic_read(&p->pending_job) == false) { >>> + if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) { >> >> What's the motivation for this change? 
It would be better to have it in >> a separate patch with a proper justification. > > The original RFC patch set used dedicated device state multifd channels. > > Peter and other people wanted this functionality removed, however this caused > a performance (downtime) regression. > > One of the things that seemed to help mitigate this regression was making > the multifd channel selection more fair via this change. > > But I can split out it to a separate commit in the next patch set version and > then see what performance improvement it currently brings. Yes, better to have it separate if anything for documentation of the rationale.
On Fri, Aug 30, 2024 at 10:02:40AM -0300, Fabiano Rosas wrote: > >>> @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data) > >>> > >>> p = &multifd_send_state->params[i]; > >>> /* > >>> - * Lockless read to p->pending_job is safe, because only multifd > >>> - * sender thread can clear it. > >>> + * Lockless RMW on p->pending_job_preparing is safe, because only multifd > >>> + * sender thread can clear it after it had seen p->pending_job being set. > >>> + * > >>> + * Pairs with qatomic_store_release() in multifd_send_thread(). > >>> */ > >>> - if (qatomic_read(&p->pending_job) == false) { > >>> + if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) { > >> > >> What's the motivation for this change? It would be better to have it in > >> a separate patch with a proper justification. > > > > The original RFC patch set used dedicated device state multifd channels. > > > > Peter and other people wanted this functionality removed, however this caused > > a performance (downtime) regression. > > > > One of the things that seemed to help mitigate this regression was making > > the multifd channel selection more fair via this change. > > > > But I can split out it to a separate commit in the next patch set version and > > then see what performance improvement it currently brings. > > Yes, better to have it separate if anything for documentation of the > rationale. And when drafting that patch, please add a comment explaining the field. Currently it's missing: /* * The sender thread has work to do if either of below boolean is set. * * @pending_job: a job is pending * @pending_sync: a sync request is pending * * For both of these fields, they're only set by the requesters, and * cleared by the multifd sender threads. */ bool pending_job; bool pending_job_preparing; bool pending_sync;
On Tue, Aug 27, 2024 at 07:54:31PM +0200, Maciej S. Szmigiero wrote: > +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > + char *data, size_t len) > +{ > + /* Device state submissions can come from multiple threads */ > + QEMU_LOCK_GUARD(&queue_job_mutex); Ah, just noticed there's the mutex. So please consider the reply in the other thread, IIUC we can make it for multifd_send() to be a generic mutex to simplify the other patch too, then drop here. I assume the ram code should be fine taking one more mutex even without vfio, if it only takes once for each ~128 pages to enqueue, and is only taken in the main thread, then each update should also not be in the hot path (e.g. no cache bouncing). > + MultiFDDeviceState_t *device_state; > + > + assert(multifd_payload_empty(device_state_send)); > + > + multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); > + device_state = &device_state_send->u.device_state; > + device_state->idstr = g_strdup(idstr); > + device_state->instance_id = instance_id; > + device_state->buf = g_memdup2(data, len); > + device_state->buf_len = len; > + > + if (!multifd_send(&device_state_send)) { > + multifd_send_data_clear(device_state_send); > + return false; > + } > + > + return true; > +}
On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: > > +size_t multifd_device_state_payload_size(void) > > +{ > > + return sizeof(MultiFDDeviceState_t); > > +} > > This will not be necessary because the payload size is the same as the > data type. We only need it for the special case where the MultiFDPages_t > is smaller than the total ram payload size. Today I was thinking maybe we should really clean this up, as the current multifd_send_data_alloc() is indeed too tricky (blame me.. who requested that more or less). Knowing that VFIO can use dynamic buffers with ->idstr and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made that feeling stronger. I think we should change it now perhaps, otherwise we'll need to introduce other helpers to e.g. reset the device buffers, and that's not only slow but also not good looking, IMO. So I went ahead with the idea in previous discussion, that I managed to change the SendData union into struct; the memory consumption is not super important yet, IMHO, but we should still stick with the object model where multifd enqueue thread switch buffer with multifd, as it still sounds a sane way to do. Then when that patch is ready, I further tried to make VFIO reuse multifd buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we don't allocate it every time we enqueue. I hope it'll also work for VFIO. VFIO has a specialty on being able to dump the config space so it's more complex (and I noticed Maciej's current design requires the final chunk of VFIO config data be migrated in one packet.. that is also part of the complexity there). So I allowed that part to allocate a buffer but only that. IOW, I made some API (see below) that can either reuse preallocated buffer, or use a separate one only for the final bulk. In short, could both of you have a look at what I came up with below? I did that in patches because I think it's too much to comment, so patches may work better. 
No concern if any of below could be good changes to you, then either Maciej can squash whatever into existing patches (and I feel like some existing patches in this series can go away with below design), or I can post pre-requisite patch but only if any of you prefer that. Anyway, let me know, the patches apply on top of this whole series applied first. I also wonder whether there can be any perf difference already (I tested all multifd qtest with below, but no VFIO I can run), perhaps not that much, but just to mention below should avoid both buffer allocations and one round of copy (so VFIO read() directly writes to the multifd buffers now). Thanks, ==========8<========== From a6cbcf692b2376e72cc053219d67bb32eabfb7a6 Mon Sep 17 00:00:00 2001 From: Peter Xu <peterx@redhat.com> Date: Tue, 10 Sep 2024 12:10:59 -0400 Subject: [PATCH 1/3] migration/multifd: Make MultiFDSendData a struct The newly introduced device state buffer can be used for either storing VFIO's read() raw data, but already also possible to store generic device states. After noticing that device states may not easily provide a max buffer size (also the fact that RAM MultiFDPages_t after all also want to have flexibility on managing offset[] array), it may not be a good idea to stick with union on MultiFDSendData.. as it won't play well with such flexibility. Switch MultiFDSendData to a struct. It won't consume a lot more space in reality, after all the real buffers were already dynamically allocated, so it's so far only about the two structs (pages, device_state) that will be duplicated, but they're small. With this, we can remove the pretty hard to understand alloc size logic. Because now we can allocate offset[] together with the SendData, and properly free it when the SendData is freed. 
Signed-off-by: Peter Xu <peterx@redhat.com> --- migration/multifd.h | 16 +++++++++++----- migration/multifd-device-state.c | 8 ++++++-- migration/multifd-nocomp.c | 13 ++++++------- migration/multifd.c | 25 ++++++------------------- 4 files changed, 29 insertions(+), 33 deletions(-) diff --git a/migration/multifd.h b/migration/multifd.h index c15c83104c..47203334b9 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -98,9 +98,13 @@ typedef struct { uint32_t num; /* number of normal pages */ uint32_t normal_num; + /* + * Pointer to the ramblock. NOTE: it's caller's responsibility to make + * sure the pointer is always valid! + */ RAMBlock *block; - /* offset of each page */ - ram_addr_t offset[]; + /* offset array of each page, managed by multifd */ + ram_addr_t *offset; } MultiFDPages_t; struct MultiFDRecvData { @@ -123,7 +127,7 @@ typedef enum { MULTIFD_PAYLOAD_DEVICE_STATE, } MultiFDPayloadType; -typedef union MultiFDPayload { +typedef struct MultiFDPayload { MultiFDPages_t ram; MultiFDDeviceState_t device_state; } MultiFDPayload; @@ -323,11 +327,13 @@ static inline uint32_t multifd_ram_page_count(void) void multifd_ram_save_setup(void); void multifd_ram_save_cleanup(void); int multifd_ram_flush_and_sync(void); -size_t multifd_ram_payload_size(void); +void multifd_ram_payload_alloc(MultiFDPages_t *pages); +void multifd_ram_payload_free(MultiFDPages_t *pages); void multifd_ram_fill_packet(MultiFDSendParams *p); int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); -size_t multifd_device_state_payload_size(void); +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state); +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state); void multifd_device_state_save_setup(void); void multifd_device_state_clear(MultiFDDeviceState_t *device_state); void multifd_device_state_save_cleanup(void); diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c index 9b364e8ef3..72b72b6e62 100644 --- 
a/migration/multifd-device-state.c +++ b/migration/multifd-device-state.c @@ -22,9 +22,13 @@ bool send_threads_abort; static MultiFDSendData *device_state_send; -size_t multifd_device_state_payload_size(void) +/* TODO: use static buffers for idstr and buf */ +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state) +{ +} + +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state) { - return sizeof(MultiFDDeviceState_t); } void multifd_device_state_save_setup(void) diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index 0b7b543f44..c1b95fee0d 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -22,15 +22,14 @@ static MultiFDSendData *multifd_ram_send; -size_t multifd_ram_payload_size(void) +void multifd_ram_payload_alloc(MultiFDPages_t *pages) { - uint32_t n = multifd_ram_page_count(); + pages->offset = g_new0(ram_addr_t, multifd_ram_page_count()); +} - /* - * We keep an array of page offsets at the end of MultiFDPages_t, - * add space for it in the allocation. - */ - return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t); +void multifd_ram_payload_free(MultiFDPages_t *pages) +{ + g_clear_pointer(&pages->offset, g_free); } void multifd_ram_save_setup(void) diff --git a/migration/multifd.c b/migration/multifd.c index bebe5b5a9b..5a20b831cf 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -101,26 +101,12 @@ struct { MultiFDSendData *multifd_send_data_alloc(void) { - size_t max_payload_size, size_minus_payload; + MultiFDSendData *new = g_new0(MultiFDSendData, 1); - /* - * MultiFDPages_t has a flexible array at the end, account for it - * when allocating MultiFDSendData. Use max() in case other types - * added to the union in the future are larger than - * (MultiFDPages_t + flex array). 
- */ - max_payload_size = MAX(multifd_ram_payload_size(), - multifd_device_state_payload_size()); - max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); - - /* - * Account for any holes the compiler might insert. We can't pack - * the structure because that misaligns the members and triggers - * Waddress-of-packed-member. - */ - size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload); + multifd_ram_payload_alloc(&new->u.ram); + multifd_device_state_payload_alloc(&new->u.device_state); - return g_malloc0(size_minus_payload + max_payload_size); + return new; } void multifd_send_data_clear(MultiFDSendData *data) @@ -147,7 +133,8 @@ void multifd_send_data_free(MultiFDSendData *data) return; } - multifd_send_data_clear(data); + multifd_ram_payload_free(&data->u.ram); + multifd_device_state_payload_free(&data->u.device_state); g_free(data); }
Peter Xu <peterx@redhat.com> writes: Hi Peter, sorry if I'm not very enthusiastic about this, I'm sure you understand the rework is a little frustrating. > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: >> > +size_t multifd_device_state_payload_size(void) >> > +{ >> > + return sizeof(MultiFDDeviceState_t); >> > +} >> >> This will not be necessary because the payload size is the same as the >> data type. We only need it for the special case where the MultiFDPages_t >> is smaller than the total ram payload size. > > Today I was thinking maybe we should really clean this up, as the current > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made > that feeling stronger. If we're going to commit bad code and then rewrite it a week later, we could have just let the original series from Maciej merge without any of this. I already suggested it a couple of times, we shouldn't be doing core refactorings underneath contributors' patches, this is too fragile. Just let people contribute their code and we can change it later. This is also why I've been trying hard to separate core multifd functionality from migration code that uses multifd to transmit their data. My original RFC plus the suggestion to extend multifd_ops for device state would have (almost) made it so that no client code would be left in multifd. We could have been turning this thing upside down and it wouldn't affect anyone in terms of code conflicts. The ship has already sailed, so your patches below are fine, I have just some small comments. > > I think we should change it now perhaps, otherwise we'll need to introduce > other helpers to e.g. reset the device buffers, and that's not only slow > but also not good looking, IMO. I agree that part is kind of a sore thumb.
> > So I went ahead with the idea in previous discussion, that I managed to > change the SendData union into struct; the memory consumption is not super > important yet, IMHO, but we should still stick with the object model where > multifd enqueue thread switch buffer with multifd, as it still sounds a > sane way to do. > > Then when that patch is ready, I further tried to make VFIO reuse multifd > buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we > don't allocate it every time we enqueue. > > I hope it'll also work for VFIO. VFIO has a specialty on being able to > dump the config space so it's more complex (and I noticed Maciej's current > design requires the final chunk of VFIO config data be migrated in one > packet.. that is also part of the complexity there). So I allowed that > part to allocate a buffer but only that. IOW, I made some API (see below) > that can either reuse preallocated buffer, or use a separate one only for > the final bulk. > > In short, could both of you have a look at what I came up with below? I > did that in patches because I think it's too much to comment, so patches > may work better. No concern if any of below could be good changes to you, > then either Maciej can squash whatever into existing patches (and I feel > like some existing patches in this series can go away with below design), > or I can post pre-requisite patch but only if any of you prefer that. > > Anyway, let me know, the patches apply on top of this whole series applied > first. > > I also wonder whether there can be any perf difference already (I tested > all multifd qtest with below, but no VFIO I can run), perhaps not that > much, but just to mention below should avoid both buffer allocations and > one round of copy (so VFIO read() directly writes to the multifd buffers > now). 
> > Thanks, > > ==========8<========== > From a6cbcf692b2376e72cc053219d67bb32eabfb7a6 Mon Sep 17 00:00:00 2001 > From: Peter Xu <peterx@redhat.com> > Date: Tue, 10 Sep 2024 12:10:59 -0400 > Subject: [PATCH 1/3] migration/multifd: Make MultiFDSendData a struct > > The newly introduced device state buffer can be used for either storing > VFIO's read() raw data, but already also possible to store generic device > states. After noticing that device states may not easily provide a max > buffer size (also the fact that RAM MultiFDPages_t after all also want to > have flexibility on managing offset[] array), it may not be a good idea to > stick with union on MultiFDSendData.. as it won't play well with such > flexibility. > > Switch MultiFDSendData to a struct. > > It won't consume a lot more space in reality, after all the real buffers > were already dynamically allocated, so it's so far only about the two > structs (pages, device_state) that will be duplicated, but they're small. > > With this, we can remove the pretty hard to understand alloc size logic. > Because now we can allocate offset[] together with the SendData, and > properly free it when the SendData is freed. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > migration/multifd.h | 16 +++++++++++----- > migration/multifd-device-state.c | 8 ++++++-- > migration/multifd-nocomp.c | 13 ++++++------- > migration/multifd.c | 25 ++++++------------------- > 4 files changed, 29 insertions(+), 33 deletions(-) > > diff --git a/migration/multifd.h b/migration/multifd.h > index c15c83104c..47203334b9 100644 > --- a/migration/multifd.h > +++ b/migration/multifd.h > @@ -98,9 +98,13 @@ typedef struct { > uint32_t num; > /* number of normal pages */ > uint32_t normal_num; > + /* > + * Pointer to the ramblock. NOTE: it's caller's responsibility to make > + * sure the pointer is always valid! > + */ This could use some rewording, it's not clear what "caller" means here. 
> RAMBlock *block; > - /* offset of each page */ > - ram_addr_t offset[]; > + /* offset array of each page, managed by multifd */ I'd drop the part after the comma, it's not very accurate and also gives an impression that something sophisticated is being done to this. > + ram_addr_t *offset; > } MultiFDPages_t; > > struct MultiFDRecvData { > @@ -123,7 +127,7 @@ typedef enum { > MULTIFD_PAYLOAD_DEVICE_STATE, > } MultiFDPayloadType; > > -typedef union MultiFDPayload { > +typedef struct MultiFDPayload { > MultiFDPages_t ram; > MultiFDDeviceState_t device_state; > } MultiFDPayload; > @@ -323,11 +327,13 @@ static inline uint32_t multifd_ram_page_count(void) > void multifd_ram_save_setup(void); > void multifd_ram_save_cleanup(void); > int multifd_ram_flush_and_sync(void); > -size_t multifd_ram_payload_size(void); > +void multifd_ram_payload_alloc(MultiFDPages_t *pages); > +void multifd_ram_payload_free(MultiFDPages_t *pages); > void multifd_ram_fill_packet(MultiFDSendParams *p); > int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); > > -size_t multifd_device_state_payload_size(void); > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state); > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state); > void multifd_device_state_save_setup(void); > void multifd_device_state_clear(MultiFDDeviceState_t *device_state); > void multifd_device_state_save_cleanup(void); > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c > index 9b364e8ef3..72b72b6e62 100644 > --- a/migration/multifd-device-state.c > +++ b/migration/multifd-device-state.c > @@ -22,9 +22,13 @@ bool send_threads_abort; > > static MultiFDSendData *device_state_send; > > -size_t multifd_device_state_payload_size(void) > +/* TODO: use static buffers for idstr and buf */ > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state) > +{ > +} > + > +void multifd_device_state_payload_free(MultiFDDeviceState_t 
*device_state) > { > - return sizeof(MultiFDDeviceState_t); > } > > void multifd_device_state_save_setup(void) > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c > index 0b7b543f44..c1b95fee0d 100644 > --- a/migration/multifd-nocomp.c > +++ b/migration/multifd-nocomp.c > @@ -22,15 +22,14 @@ > > static MultiFDSendData *multifd_ram_send; > > -size_t multifd_ram_payload_size(void) > +void multifd_ram_payload_alloc(MultiFDPages_t *pages) > { > - uint32_t n = multifd_ram_page_count(); > + pages->offset = g_new0(ram_addr_t, multifd_ram_page_count()); > +} > > - /* > - * We keep an array of page offsets at the end of MultiFDPages_t, > - * add space for it in the allocation. > - */ > - return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t); > +void multifd_ram_payload_free(MultiFDPages_t *pages) > +{ > + g_clear_pointer(&pages->offset, g_free); > } > > void multifd_ram_save_setup(void) > diff --git a/migration/multifd.c b/migration/multifd.c > index bebe5b5a9b..5a20b831cf 100644 > --- a/migration/multifd.c > +++ b/migration/multifd.c > @@ -101,26 +101,12 @@ struct { > > MultiFDSendData *multifd_send_data_alloc(void) > { > - size_t max_payload_size, size_minus_payload; > + MultiFDSendData *new = g_new0(MultiFDSendData, 1); > > - /* > - * MultiFDPages_t has a flexible array at the end, account for it > - * when allocating MultiFDSendData. Use max() in case other types > - * added to the union in the future are larger than > - * (MultiFDPages_t + flex array). > - */ > - max_payload_size = MAX(multifd_ram_payload_size(), > - multifd_device_state_payload_size()); > - max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); > - > - /* > - * Account for any holes the compiler might insert. We can't pack > - * the structure because that misaligns the members and triggers > - * Waddress-of-packed-member. 
> - */ > - size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload); > + multifd_ram_payload_alloc(&new->u.ram); > + multifd_device_state_payload_alloc(&new->u.device_state); > > - return g_malloc0(size_minus_payload + max_payload_size); > + return new; > } > > void multifd_send_data_clear(MultiFDSendData *data) > @@ -147,7 +133,8 @@ void multifd_send_data_free(MultiFDSendData *data) > return; > } > > - multifd_send_data_clear(data); > + multifd_ram_payload_free(&data->u.ram); > + multifd_device_state_payload_free(&data->u.device_state); The "u" needs to be dropped now. Could just change to "p". Or can we now move the whole struct inside MultiFDSendData? > > g_free(data); > } > -- > 2.45.0 > > > > From 6695d134c0818f42183f5ea03c21e6d56e7b57ea Mon Sep 17 00:00:00 2001 > From: Peter Xu <peterx@redhat.com> > Date: Tue, 10 Sep 2024 12:24:14 -0400 > Subject: [PATCH 2/3] migration/multifd: Optimize device_state->idstr updates > > The duplication / allocation of idstr for each VFIO blob is an overkill, as > idstr isn't something that changes frequently. Also, the idstr always came > from the upper layer of se->idstr so it's always guaranteed to > exist (e.g. no device unplug allowed during migration). > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > migration/multifd.h | 4 ++++ > migration/multifd-device-state.c | 10 +++++++--- > 2 files changed, 11 insertions(+), 3 deletions(-) > > diff --git a/migration/multifd.h b/migration/multifd.h > index 47203334b9..1eaa5d4496 100644 > --- a/migration/multifd.h > +++ b/migration/multifd.h > @@ -115,6 +115,10 @@ struct MultiFDRecvData { > }; > > typedef struct { > + /* > + * Name of the owner device. NOTE: it's caller's responsibility to > + * make sure the pointer is always valid! > + */ Same comment as the other one here. 
> char *idstr; > uint32_t instance_id; > char *buf; > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c > index 72b72b6e62..cfd0465bac 100644 > --- a/migration/multifd-device-state.c > +++ b/migration/multifd-device-state.c > @@ -44,7 +44,7 @@ void multifd_device_state_save_setup(void) > > void multifd_device_state_clear(MultiFDDeviceState_t *device_state) > { > - g_clear_pointer(&device_state->idstr, g_free); > + device_state->idstr = NULL; > g_clear_pointer(&device_state->buf, g_free); > } > > @@ -100,7 +100,12 @@ bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > > multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); > device_state = &device_state_send->u.device_state; > - device_state->idstr = g_strdup(idstr); > + /* > + * NOTE: here we must use a static idstr (e.g. of a savevm state > + * entry) rather than any dynamically allocated buffer, because multifd > + * assumes this pointer is always valid! > + */ > + device_state->idstr = idstr; > device_state->instance_id = instance_id; > device_state->buf = g_memdup2(data, len); > device_state->buf_len = len; > @@ -137,7 +142,6 @@ static void multifd_device_state_save_thread_data_free(void *opaque) > { > struct MultiFDDSSaveThreadData *data = opaque; > > - g_clear_pointer(&data->idstr, g_free); > g_free(data); > } > > -- > 2.45.0 > > > From abfea9698ff46ad0e0175e1dcc6e005e0b2ece2a Mon Sep 17 00:00:00 2001 > From: Peter Xu <peterx@redhat.com> > Date: Tue, 10 Sep 2024 12:27:49 -0400 > Subject: [PATCH 3/3] migration/multifd: Optimize device_state buffer > allocations > > Provide a device_state->buf_prealloc so that the buffers can be reused if > possible. Provide a set of APIs to use it right. Please see the > documentation for the API in the code. > > The default buffer size came from VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE as of > now. 
> > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > include/hw/vfio/vfio-common.h | 9 ++++ > include/migration/misc.h | 12 ++++- > migration/multifd.h | 11 +++- > hw/vfio/migration.c | 43 ++++++++------- > migration/multifd-device-state.c | 93 +++++++++++++++++++++++++------- > migration/multifd.c | 9 ---- > 6 files changed, 126 insertions(+), 51 deletions(-) > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 4578a0ca6a..c1f2f4ae55 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -61,6 +61,13 @@ typedef struct VFIORegion { > uint8_t nr; /* cache the region number for debug */ > } VFIORegion; > > +typedef struct VFIODeviceStatePacket { > + uint32_t version; > + uint32_t idx; > + uint32_t flags; > + uint8_t data[0]; > +} QEMU_PACKED VFIODeviceStatePacket; > + > typedef struct VFIOMigration { > struct VFIODevice *vbasedev; > VMChangeStateEntry *vm_state; > @@ -168,6 +175,8 @@ typedef struct VFIODevice { > int devid; > IOMMUFDBackend *iommufd; > VFIOIOASHwpt *hwpt; > + /* Only used on sender side when multifd is enabled */ > + VFIODeviceStatePacket *multifd_packet; > QLIST_ENTRY(VFIODevice) hwpt_next; > } VFIODevice; > > diff --git a/include/migration/misc.h b/include/migration/misc.h > index 26f7f3140f..1a8676ed3d 100644 > --- a/include/migration/misc.h > +++ b/include/migration/misc.h > @@ -112,8 +112,16 @@ bool migration_in_bg_snapshot(void); > void dirty_bitmap_mig_init(void); > > /* migration/multifd-device-state.c */ > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > - char *data, size_t len); > +struct MultiFDDeviceState_t; > +typedef struct MultiFDDeviceState_t MultiFDDeviceState_t; > + > +MultiFDDeviceState_t * > +multifd_device_state_prepare(char *idstr, uint32_t instance_id); > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s, > + int64_t *buf_len); > +bool multifd_device_state_finish(MultiFDDeviceState_t *state, > + void *buf, int64_t buf_len); 
> + > bool migration_has_device_state_support(void); > > void > diff --git a/migration/multifd.h b/migration/multifd.h > index 1eaa5d4496..1ccdeeb8c5 100644 > --- a/migration/multifd.h > +++ b/migration/multifd.h > @@ -15,6 +15,7 @@ > > #include "exec/target_page.h" > #include "ram.h" > +#include "migration/misc.h" > > typedef struct MultiFDRecvData MultiFDRecvData; > typedef struct MultiFDSendData MultiFDSendData; > @@ -114,16 +115,22 @@ struct MultiFDRecvData { > off_t file_offset; > }; > > -typedef struct { > +struct MultiFDDeviceState_t { > /* > * Name of the owner device. NOTE: it's caller's responsibility to > * make sure the pointer is always valid! > */ > char *idstr; > uint32_t instance_id; > + /* > + * Points to the buffer to send via multifd. Normally it's the same as > + * buf_prealloc, otherwise the caller needs to make sure the buffer is > + * avaliable through multifd running. "throughout multifd runtime" maybe. > + */ > char *buf; > + char *buf_prealloc; > size_t buf_len; > -} MultiFDDeviceState_t; > +}; > > typedef enum { > MULTIFD_PAYLOAD_NONE, > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > index 67996aa2df..e36422b7c5 100644 > --- a/hw/vfio/migration.c > +++ b/hw/vfio/migration.c > @@ -59,13 +59,6 @@ > > #define VFIO_DEVICE_STATE_CONFIG_STATE (1) > > -typedef struct VFIODeviceStatePacket { > - uint32_t version; > - uint32_t idx; > - uint32_t flags; > - uint8_t data[0]; > -} QEMU_PACKED VFIODeviceStatePacket; > - > static int64_t bytes_transferred; > > static const char *mig_state_to_str(enum vfio_device_mig_state state) > @@ -741,6 +734,9 @@ static void vfio_save_cleanup(void *opaque) > migration->initial_data_sent = false; > vfio_migration_cleanup(vbasedev); > trace_vfio_save_cleanup(vbasedev->name); > + if (vbasedev->multifd_packet) { > + g_clear_pointer(&vbasedev->multifd_packet, g_free); > + } > } > > static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy, > @@ -892,7 +888,8 @@ static int 
vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas > g_autoptr(QIOChannelBuffer) bioc = NULL; > QEMUFile *f = NULL; > int ret; > - g_autofree VFIODeviceStatePacket *packet = NULL; > + VFIODeviceStatePacket *packet; > + MultiFDDeviceState_t *state; > size_t packet_len; > > bioc = qio_channel_buffer_new(0); > @@ -911,13 +908,19 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas > } > > packet_len = sizeof(*packet) + bioc->usage; > - packet = g_malloc0(packet_len); > + > + state = multifd_device_state_prepare(idstr, instance_id); > + /* > + * Do not reuse multifd buffer, but use our own due to random size. > + * The buffer will be freed only when save cleanup. > + */ > + vbasedev->multifd_packet = g_malloc0(packet_len); > + packet = vbasedev->multifd_packet; > packet->idx = idx; > packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; > memcpy(&packet->data, bioc->data, bioc->usage); > > - if (!multifd_queue_device_state(idstr, instance_id, > - (char *)packet, packet_len)) { > + if (!multifd_device_state_finish(state, packet, packet_len)) { > ret = -1; > } > > @@ -936,7 +939,6 @@ static int vfio_save_complete_precopy_thread(char *idstr, > VFIODevice *vbasedev = opaque; > VFIOMigration *migration = vbasedev->migration; > int ret; > - g_autofree VFIODeviceStatePacket *packet = NULL; > uint32_t idx; > > if (!migration->multifd_transfer) { > @@ -954,21 +956,25 @@ static int vfio_save_complete_precopy_thread(char *idstr, > goto ret_finish; > } > > - packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); > - > for (idx = 0; ; idx++) { > + VFIODeviceStatePacket *packet; > + MultiFDDeviceState_t *state; > ssize_t data_size; > size_t packet_size; > + int64_t buf_size; > > if (qatomic_read(abort_flag)) { > ret = -ECANCELED; > goto ret_finish; > } > > + state = multifd_device_state_prepare(idstr, instance_id); > + packet = multifd_device_state_get_buffer(state, &buf_size); > data_size = read(migration->data_fd, 
&packet->data, > - migration->data_buffer_size); > + buf_size - sizeof(*packet)); > if (data_size < 0) { > if (errno != ENOMSG) { > + multifd_device_state_finish(state, NULL, 0); > ret = -errno; > goto ret_finish; > } > @@ -980,14 +986,15 @@ static int vfio_save_complete_precopy_thread(char *idstr, > data_size = 0; > } > > - if (data_size == 0) > + if (data_size == 0) { > + multifd_device_state_finish(state, NULL, 0); > break; > + } > > packet->idx = idx; > packet_size = sizeof(*packet) + data_size; > > - if (!multifd_queue_device_state(idstr, instance_id, > - (char *)packet, packet_size)) { > + if (!multifd_device_state_finish(state, packet, packet_size)) { > ret = -1; > goto ret_finish; > } > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c > index cfd0465bac..6f0259426d 100644 > --- a/migration/multifd-device-state.c > +++ b/migration/multifd-device-state.c > @@ -20,15 +20,18 @@ ThreadPool *send_threads; > int send_threads_ret; > bool send_threads_abort; > > +#define MULTIFD_DEVICE_STATE_BUFLEN (1UL << 20) > + > static MultiFDSendData *device_state_send; > > -/* TODO: use static buffers for idstr and buf */ > void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state) > { > + device_state->buf_prealloc = g_malloc0(MULTIFD_DEVICE_STATE_BUFLEN); > } > > void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state) > { > + g_clear_pointer(&device_state->buf_prealloc, g_free); > } > > void multifd_device_state_save_setup(void) > @@ -42,12 +45,6 @@ void multifd_device_state_save_setup(void) > send_threads_abort = false; > } > > -void multifd_device_state_clear(MultiFDDeviceState_t *device_state) > -{ > - device_state->idstr = NULL; > - g_clear_pointer(&device_state->buf, g_free); > -} > - > void multifd_device_state_save_cleanup(void) > { > g_clear_pointer(&send_threads, thread_pool_free); > @@ -89,33 +86,89 @@ void multifd_device_state_send_prepare(MultiFDSendParams *p) > 
multifd_device_state_fill_packet(p); > } > > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > - char *data, size_t len) > +/* > + * Prepare to send some device state via multifd. Returns the current idle > + * MultiFDDeviceState_t*. > + * > + * As a follow up, the caller must call multifd_device_state_finish() to > + * release the resources. > + * > + * One example usage of the API: > + * > + * // Fetch a free multifd device state object > + * state = multifd_device_state_prepare(idstr, instance_id); > + * > + * // Optional: fetch the buffer to reuse > + * buf = multifd_device_state_get_buffer(state, &buf_size); > + * > + * // Here len>0 means success, otherwise failure > + * len = buffer_fill(buf, buf_size); > + * > + * // Finish the transaction, either enqueue or cancel the request. Here > + * // len>0 will enqueue, <=0 will cancel. > + * multifd_device_state_finish(state, buf, len); > + */ > +MultiFDDeviceState_t * > +multifd_device_state_prepare(char *idstr, uint32_t instance_id) > { > - /* Device state submissions can come from multiple threads */ > - QEMU_LOCK_GUARD(&queue_job_mutex); > MultiFDDeviceState_t *device_state; > > assert(multifd_payload_empty(device_state_send)); > > - multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); > + /* > + * TODO: The lock name may need change, but I'm reusing just for > + * simplicity. > + */ > + qemu_mutex_lock(&queue_job_mutex); > + > device_state = &device_state_send->u.device_state; > /* > - * NOTE: here we must use a static idstr (e.g. of a savevm state > - * entry) rather than any dynamically allocated buffer, because multifd > + * NOTE: here we must use a static idstr (e.g. of a savevm state entry) > + * rather than any dynamically allocated buffer, because multifd > * assumes this pointer is always valid! 
> */ > device_state->idstr = idstr; > device_state->instance_id = instance_id; > - device_state->buf = g_memdup2(data, len); > - device_state->buf_len = len; > > - if (!multifd_send(&device_state_send)) { > - multifd_send_data_clear(device_state_send); > - return false; > + return &device_state_send->u.device_state; > +} > + > +/* > + * Need to be used after a previous call to multifd_device_state_prepare(), > + * the buffer must not be used after invoke multifd_device_state_finish(). > + */ > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s, > + int64_t *buf_len) > +{ > + *buf_len = MULTIFD_DEVICE_STATE_BUFLEN; > + return s->buf_prealloc; > +} > + > +/* > + * Need to be used only in pair with a previous call to > + * multifd_device_state_prepare(). Returns true if enqueue successful, > + * false otherwise. > + */ > +bool multifd_device_state_finish(MultiFDDeviceState_t *state, > + void *buf, int64_t buf_len) > +{ > + bool result = false; > + > + /* Currently we only have one global free buffer */ > + assert(state == &device_state_send->u.device_state); > + > + if (buf_len < 0) { > + goto out; > } > > - return true; > + multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); > + /* This normally will be the state->buf_prealloc, but not required */ > + state->buf = buf; > + state->buf_len = buf_len; > + result = multifd_send(&device_state_send); > +out: > + qemu_mutex_unlock(&queue_job_mutex); > + return result; > } > > bool migration_has_device_state_support(void) > diff --git a/migration/multifd.c b/migration/multifd.c > index 5a20b831cf..2b5185e298 100644 > --- a/migration/multifd.c > +++ b/migration/multifd.c > @@ -115,15 +115,6 @@ void multifd_send_data_clear(MultiFDSendData *data) > return; > } > > - switch (data->type) { > - case MULTIFD_PAYLOAD_DEVICE_STATE: > - multifd_device_state_clear(&data->u.device_state); > - break; > - default: > - /* Nothing to do */ > - break; > - } > - > data->type = MULTIFD_PAYLOAD_NONE; > } > > 
-- > 2.45.0
On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote: > Peter Xu <peterx@redhat.com> writes: > > Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you > understand the rework is a little frustrating. That's OK. [For some reason my email sync program decided to give up working for hours. I got more time looking at a tsc bug, which is good, but then I miss a lot of emails..] > > > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: > >> > +size_t multifd_device_state_payload_size(void) > >> > +{ > >> > + return sizeof(MultiFDDeviceState_t); > >> > +} > >> > >> This will not be necessary because the payload size is the same as the > >> data type. We only need it for the special case where the MultiFDPages_t > >> is smaller than the total ram payload size. > > > > Today I was thinking maybe we should really clean this up, as the current > > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested > > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr > > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made > > that feeling stronger. > > If we're going to commit bad code and then rewrite it a week later, we > could have just let the original series from Maciej merge without any of > this. Why it's "bad code"? It runs pretty well, I don't think it's bad code. You wrote it, and I don't think it's bad at all. But now we're rethinking after reading Maciej's new series. Personally I don't think it's a major problem. Note that we're not changing the design back: what was initially proposed was the client submitting an array of multifd objects. I still don't think that's right. What the change goes so far is we make the union a struct, however that's still N+2 objects not 2*N, where 2 came from RAM+VFIO. I think the important bits are still there (from your previous refactor series). 
> I already suggested it a couple of times, we shouldn't be doing > core refactorings underneath contributors' patches, this is too > fragile. Just let people contribute their code and we can change it > later. I sincerely don't think a lot needs changing... only patch 1. Basically patch 1 on top of your previous rework series will be at least what I want, but I'm open to comments from you guys. Note that patch 2-3 will be on top of Maciej's changes and they're totally not relevant to what we merged so far. Hence, nothing relevant there to what you worked on. And this is the diff of patch 1: migration/multifd.h | 16 +++++++++++----- migration/multifd-device-state.c | 8 ++++++-- migration/multifd-nocomp.c | 13 ++++++------- migration/multifd.c | 25 ++++++------------------- 4 files changed, 29 insertions(+), 33 deletions(-) It's only 33 lines removed (many of which are comments..), it's not a huge lot. I don't know why you feel so bad about this... It's probably because we maintain migration together, or we can keep our own way of work. I don't think we did anything wrong yet so far. We can definitely talk about this in next 1:1. > > This is also why I've been trying hard to separate core multifd > functionality from migration code that uses multifd to transmit their > data. > > My original RFC plus the suggestion to extend multifd_ops for device > state would have (almost) made it so that no client code would be left > in multifd. We could have been turning this thing upside down and it > wouldn't affect anyone in terms of code conflicts. Do you mean you preferred the 2*N approach? > > The ship has already sailed, so your patches below are fine, I have just > some small comments. I'm not sure what you meant about "ship sailed", but we should merge code whenever we think is the most correct. I hope you meant after below all things look the best, or please shoot. That's exactly what I'm requesting as comments. 
> > > > > I think we should change it now perhaps, otherwise we'll need to introduce > > other helpers to e.g. reset the device buffers, and that's not only slow > > but also not good looking, IMO. > > I agree that part is kind of a sore thumb. > > > > > So I went ahead with the idea in previous discussion, that I managed to > > change the SendData union into struct; the memory consumption is not super > > important yet, IMHO, but we should still stick with the object model where > > multifd enqueue thread switch buffer with multifd, as it still sounds a > > sane way to do. > > > > Then when that patch is ready, I further tried to make VFIO reuse multifd > > buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we > > don't allocate it every time we enqueue. > > > > I hope it'll also work for VFIO. VFIO has a specialty on being able to > > dump the config space so it's more complex (and I noticed Maciej's current > > design requires the final chunk of VFIO config data be migrated in one > > packet.. that is also part of the complexity there). So I allowed that > > part to allocate a buffer but only that. IOW, I made some API (see below) > > that can either reuse preallocated buffer, or use a separate one only for > > the final bulk. > > > > In short, could both of you have a look at what I came up with below? I > > did that in patches because I think it's too much to comment, so patches > > may work better. No concern if any of below could be good changes to you, > > then either Maciej can squash whatever into existing patches (and I feel > > like some existing patches in this series can go away with below design), > > or I can post pre-requisite patch but only if any of you prefer that. > > > > Anyway, let me know, the patches apply on top of this whole series applied > > first. 
> > > > I also wonder whether there can be any perf difference already (I tested > > all multifd qtest with below, but no VFIO I can run), perhaps not that > > much, but just to mention below should avoid both buffer allocations and > > one round of copy (so VFIO read() directly writes to the multifd buffers > > now). > > > > Thanks, > > > > ==========8<========== > > From a6cbcf692b2376e72cc053219d67bb32eabfb7a6 Mon Sep 17 00:00:00 2001 > > From: Peter Xu <peterx@redhat.com> > > Date: Tue, 10 Sep 2024 12:10:59 -0400 > > Subject: [PATCH 1/3] migration/multifd: Make MultiFDSendData a struct > > > > The newly introduced device state buffer can be used for either storing > > VFIO's read() raw data, but already also possible to store generic device > > states. After noticing that device states may not easily provide a max > > buffer size (also the fact that RAM MultiFDPages_t after all also want to > > have flexibility on managing offset[] array), it may not be a good idea to > > stick with union on MultiFDSendData.. as it won't play well with such > > flexibility. > > > > Switch MultiFDSendData to a struct. > > > > It won't consume a lot more space in reality, after all the real buffers > > were already dynamically allocated, so it's so far only about the two > > structs (pages, device_state) that will be duplicated, but they're small. > > > > With this, we can remove the pretty hard to understand alloc size logic. > > Because now we can allocate offset[] together with the SendData, and > > properly free it when the SendData is freed. 
> > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > --- > > migration/multifd.h | 16 +++++++++++----- > > migration/multifd-device-state.c | 8 ++++++-- > > migration/multifd-nocomp.c | 13 ++++++------- > > migration/multifd.c | 25 ++++++------------------- > > 4 files changed, 29 insertions(+), 33 deletions(-) > > > > diff --git a/migration/multifd.h b/migration/multifd.h > > index c15c83104c..47203334b9 100644 > > --- a/migration/multifd.h > > +++ b/migration/multifd.h > > @@ -98,9 +98,13 @@ typedef struct { > > uint32_t num; > > /* number of normal pages */ > > uint32_t normal_num; > > + /* > > + * Pointer to the ramblock. NOTE: it's caller's responsibility to make > > + * sure the pointer is always valid! > > + */ > > This could use some rewording, it's not clear what "caller" means here. > > > RAMBlock *block; > > - /* offset of each page */ > > - ram_addr_t offset[]; > > + /* offset array of each page, managed by multifd */ > > I'd drop the part after the comma, it's not very accurate and also gives > an impression that something sophisticated is being done to this. 
> > > + ram_addr_t *offset; > > } MultiFDPages_t; > > > > struct MultiFDRecvData { > > @@ -123,7 +127,7 @@ typedef enum { > > MULTIFD_PAYLOAD_DEVICE_STATE, > > } MultiFDPayloadType; > > > > -typedef union MultiFDPayload { > > +typedef struct MultiFDPayload { > > MultiFDPages_t ram; > > MultiFDDeviceState_t device_state; > > } MultiFDPayload; > > @@ -323,11 +327,13 @@ static inline uint32_t multifd_ram_page_count(void) > > void multifd_ram_save_setup(void); > > void multifd_ram_save_cleanup(void); > > int multifd_ram_flush_and_sync(void); > > -size_t multifd_ram_payload_size(void); > > +void multifd_ram_payload_alloc(MultiFDPages_t *pages); > > +void multifd_ram_payload_free(MultiFDPages_t *pages); > > void multifd_ram_fill_packet(MultiFDSendParams *p); > > int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); > > > > -size_t multifd_device_state_payload_size(void); > > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state); > > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state); > > void multifd_device_state_save_setup(void); > > void multifd_device_state_clear(MultiFDDeviceState_t *device_state); > > void multifd_device_state_save_cleanup(void); > > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c > > index 9b364e8ef3..72b72b6e62 100644 > > --- a/migration/multifd-device-state.c > > +++ b/migration/multifd-device-state.c > > @@ -22,9 +22,13 @@ bool send_threads_abort; > > > > static MultiFDSendData *device_state_send; > > > > -size_t multifd_device_state_payload_size(void) > > +/* TODO: use static buffers for idstr and buf */ > > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state) > > +{ > > +} > > + > > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state) > > { > > - return sizeof(MultiFDDeviceState_t); > > } > > > > void multifd_device_state_save_setup(void) > > diff --git a/migration/multifd-nocomp.c 
b/migration/multifd-nocomp.c > > index 0b7b543f44..c1b95fee0d 100644 > > --- a/migration/multifd-nocomp.c > > +++ b/migration/multifd-nocomp.c > > @@ -22,15 +22,14 @@ > > > > static MultiFDSendData *multifd_ram_send; > > > > -size_t multifd_ram_payload_size(void) > > +void multifd_ram_payload_alloc(MultiFDPages_t *pages) > > { > > - uint32_t n = multifd_ram_page_count(); > > + pages->offset = g_new0(ram_addr_t, multifd_ram_page_count()); > > +} > > > > - /* > > - * We keep an array of page offsets at the end of MultiFDPages_t, > > - * add space for it in the allocation. > > - */ > > - return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t); > > +void multifd_ram_payload_free(MultiFDPages_t *pages) > > +{ > > + g_clear_pointer(&pages->offset, g_free); > > } > > > > void multifd_ram_save_setup(void) > > diff --git a/migration/multifd.c b/migration/multifd.c > > index bebe5b5a9b..5a20b831cf 100644 > > --- a/migration/multifd.c > > +++ b/migration/multifd.c > > @@ -101,26 +101,12 @@ struct { > > > > MultiFDSendData *multifd_send_data_alloc(void) > > { > > - size_t max_payload_size, size_minus_payload; > > + MultiFDSendData *new = g_new0(MultiFDSendData, 1); > > > > - /* > > - * MultiFDPages_t has a flexible array at the end, account for it > > - * when allocating MultiFDSendData. Use max() in case other types > > - * added to the union in the future are larger than > > - * (MultiFDPages_t + flex array). > > - */ > > - max_payload_size = MAX(multifd_ram_payload_size(), > > - multifd_device_state_payload_size()); > > - max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); > > - > > - /* > > - * Account for any holes the compiler might insert. We can't pack > > - * the structure because that misaligns the members and triggers > > - * Waddress-of-packed-member. 
> > - */ > > - size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload); > > + multifd_ram_payload_alloc(&new->u.ram); > > + multifd_device_state_payload_alloc(&new->u.device_state); > > > > - return g_malloc0(size_minus_payload + max_payload_size); > > + return new; > > } > > > > void multifd_send_data_clear(MultiFDSendData *data) > > @@ -147,7 +133,8 @@ void multifd_send_data_free(MultiFDSendData *data) > > return; > > } > > > > - multifd_send_data_clear(data); > > + multifd_ram_payload_free(&data->u.ram); > > + multifd_device_state_payload_free(&data->u.device_state); > > The "u" needs to be dropped now. Could just change to "p". Or can we now > move the whole struct inside MultiFDSendData? Yep, all your comments look good to me. A note here: I intentionally didn't touch "u." as that requires more changes which doesn't help as me leaving "patch-styled comment". As I said, feel free to see the patches as comments not patches for merging yet. I / Maciej can prepare patch but only if the idea in general can be accepted. For me as I mentioned patches 2-3 are not much relevant to the current master branch, afaiu, so if you guys like, I can repost patch 1 with a formal one, but only if Maciej thinks it's easier for him. > > > > > g_free(data); > > } > > -- > > 2.45.0 > > > > > > > > From 6695d134c0818f42183f5ea03c21e6d56e7b57ea Mon Sep 17 00:00:00 2001 > > From: Peter Xu <peterx@redhat.com> > > Date: Tue, 10 Sep 2024 12:24:14 -0400 > > Subject: [PATCH 2/3] migration/multifd: Optimize device_state->idstr updates > > > > The duplication / allocation of idstr for each VFIO blob is an overkill, as > > idstr isn't something that changes frequently. Also, the idstr always came > > from the upper layer of se->idstr so it's always guaranteed to > > exist (e.g. no device unplug allowed during migration). 
> > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > --- > > migration/multifd.h | 4 ++++ > > migration/multifd-device-state.c | 10 +++++++--- > > 2 files changed, 11 insertions(+), 3 deletions(-) > > > > diff --git a/migration/multifd.h b/migration/multifd.h > > index 47203334b9..1eaa5d4496 100644 > > --- a/migration/multifd.h > > +++ b/migration/multifd.h > > @@ -115,6 +115,10 @@ struct MultiFDRecvData { > > }; > > > > typedef struct { > > + /* > > + * Name of the owner device. NOTE: it's caller's responsibility to > > + * make sure the pointer is always valid! > > + */ > > Same comment as the other one here. > > > char *idstr; > > uint32_t instance_id; > > char *buf; > > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c > > index 72b72b6e62..cfd0465bac 100644 > > --- a/migration/multifd-device-state.c > > +++ b/migration/multifd-device-state.c > > @@ -44,7 +44,7 @@ void multifd_device_state_save_setup(void) > > > > void multifd_device_state_clear(MultiFDDeviceState_t *device_state) > > { > > - g_clear_pointer(&device_state->idstr, g_free); > > + device_state->idstr = NULL; > > g_clear_pointer(&device_state->buf, g_free); > > } > > > > @@ -100,7 +100,12 @@ bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > > > > multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); > > device_state = &device_state_send->u.device_state; > > - device_state->idstr = g_strdup(idstr); > > + /* > > + * NOTE: here we must use a static idstr (e.g. of a savevm state > > + * entry) rather than any dynamically allocated buffer, because multifd > > + * assumes this pointer is always valid! 
> > + */ > > + device_state->idstr = idstr; > > device_state->instance_id = instance_id; > > device_state->buf = g_memdup2(data, len); > > device_state->buf_len = len; > > @@ -137,7 +142,6 @@ static void multifd_device_state_save_thread_data_free(void *opaque) > > { > > struct MultiFDDSSaveThreadData *data = opaque; > > > > - g_clear_pointer(&data->idstr, g_free); > > g_free(data); > > } > > > > -- > > 2.45.0 > > > > > > From abfea9698ff46ad0e0175e1dcc6e005e0b2ece2a Mon Sep 17 00:00:00 2001 > > From: Peter Xu <peterx@redhat.com> > > Date: Tue, 10 Sep 2024 12:27:49 -0400 > > Subject: [PATCH 3/3] migration/multifd: Optimize device_state buffer > > allocations > > > > Provide a device_state->buf_prealloc so that the buffers can be reused if > > possible. Provide a set of APIs to use it right. Please see the > > documentation for the API in the code. > > > > The default buffer size came from VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE as of > > now. > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > --- > > include/hw/vfio/vfio-common.h | 9 ++++ > > include/migration/misc.h | 12 ++++- > > migration/multifd.h | 11 +++- > > hw/vfio/migration.c | 43 ++++++++------- > > migration/multifd-device-state.c | 93 +++++++++++++++++++++++++------- > > migration/multifd.c | 9 ---- > > 6 files changed, 126 insertions(+), 51 deletions(-) > > > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > > index 4578a0ca6a..c1f2f4ae55 100644 > > --- a/include/hw/vfio/vfio-common.h > > +++ b/include/hw/vfio/vfio-common.h > > @@ -61,6 +61,13 @@ typedef struct VFIORegion { > > uint8_t nr; /* cache the region number for debug */ > > } VFIORegion; > > > > +typedef struct VFIODeviceStatePacket { > > + uint32_t version; > > + uint32_t idx; > > + uint32_t flags; > > + uint8_t data[0]; > > +} QEMU_PACKED VFIODeviceStatePacket; > > + > > typedef struct VFIOMigration { > > struct VFIODevice *vbasedev; > > VMChangeStateEntry *vm_state; > > @@ -168,6 +175,8 @@ typedef struct 
VFIODevice { > > int devid; > > IOMMUFDBackend *iommufd; > > VFIOIOASHwpt *hwpt; > > + /* Only used on sender side when multifd is enabled */ > > + VFIODeviceStatePacket *multifd_packet; > > QLIST_ENTRY(VFIODevice) hwpt_next; > > } VFIODevice; > > > > diff --git a/include/migration/misc.h b/include/migration/misc.h > > index 26f7f3140f..1a8676ed3d 100644 > > --- a/include/migration/misc.h > > +++ b/include/migration/misc.h > > @@ -112,8 +112,16 @@ bool migration_in_bg_snapshot(void); > > void dirty_bitmap_mig_init(void); > > > > /* migration/multifd-device-state.c */ > > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > > - char *data, size_t len); > > +struct MultiFDDeviceState_t; > > +typedef struct MultiFDDeviceState_t MultiFDDeviceState_t; > > + > > +MultiFDDeviceState_t * > > +multifd_device_state_prepare(char *idstr, uint32_t instance_id); > > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s, > > + int64_t *buf_len); > > +bool multifd_device_state_finish(MultiFDDeviceState_t *state, > > + void *buf, int64_t buf_len); > > + > > bool migration_has_device_state_support(void); > > > > void > > diff --git a/migration/multifd.h b/migration/multifd.h > > index 1eaa5d4496..1ccdeeb8c5 100644 > > --- a/migration/multifd.h > > +++ b/migration/multifd.h > > @@ -15,6 +15,7 @@ > > > > #include "exec/target_page.h" > > #include "ram.h" > > +#include "migration/misc.h" > > > > typedef struct MultiFDRecvData MultiFDRecvData; > > typedef struct MultiFDSendData MultiFDSendData; > > @@ -114,16 +115,22 @@ struct MultiFDRecvData { > > off_t file_offset; > > }; > > > > -typedef struct { > > +struct MultiFDDeviceState_t { > > /* > > * Name of the owner device. NOTE: it's caller's responsibility to > > * make sure the pointer is always valid! > > */ > > char *idstr; > > uint32_t instance_id; > > + /* > > + * Points to the buffer to send via multifd. 
Normally it's the same as > > + * buf_prealloc, otherwise the caller needs to make sure the buffer is > > + * avaliable through multifd running. > > "throughout multifd runtime" maybe. > > > + */ > > char *buf; > > + char *buf_prealloc; > > size_t buf_len; > > -} MultiFDDeviceState_t; > > +}; > > > > typedef enum { > > MULTIFD_PAYLOAD_NONE, > > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > > index 67996aa2df..e36422b7c5 100644 > > --- a/hw/vfio/migration.c > > +++ b/hw/vfio/migration.c > > @@ -59,13 +59,6 @@ > > > > #define VFIO_DEVICE_STATE_CONFIG_STATE (1) > > > > -typedef struct VFIODeviceStatePacket { > > - uint32_t version; > > - uint32_t idx; > > - uint32_t flags; > > - uint8_t data[0]; > > -} QEMU_PACKED VFIODeviceStatePacket; > > - > > static int64_t bytes_transferred; > > > > static const char *mig_state_to_str(enum vfio_device_mig_state state) > > @@ -741,6 +734,9 @@ static void vfio_save_cleanup(void *opaque) > > migration->initial_data_sent = false; > > vfio_migration_cleanup(vbasedev); > > trace_vfio_save_cleanup(vbasedev->name); > > + if (vbasedev->multifd_packet) { > > + g_clear_pointer(&vbasedev->multifd_packet, g_free); > > + } > > } > > > > static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy, > > @@ -892,7 +888,8 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas > > g_autoptr(QIOChannelBuffer) bioc = NULL; > > QEMUFile *f = NULL; > > int ret; > > - g_autofree VFIODeviceStatePacket *packet = NULL; > > + VFIODeviceStatePacket *packet; > > + MultiFDDeviceState_t *state; > > size_t packet_len; > > > > bioc = qio_channel_buffer_new(0); > > @@ -911,13 +908,19 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas > > } > > > > packet_len = sizeof(*packet) + bioc->usage; > > - packet = g_malloc0(packet_len); > > + > > + state = multifd_device_state_prepare(idstr, instance_id); > > + /* > > + * Do not reuse multifd buffer, but use our own due to random size. 
> > + * The buffer will be freed only when save cleanup. > > + */ > > + vbasedev->multifd_packet = g_malloc0(packet_len); > > + packet = vbasedev->multifd_packet; > > packet->idx = idx; > > packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; > > memcpy(&packet->data, bioc->data, bioc->usage); > > > > - if (!multifd_queue_device_state(idstr, instance_id, > > - (char *)packet, packet_len)) { > > + if (!multifd_device_state_finish(state, packet, packet_len)) { > > ret = -1; > > } > > > > @@ -936,7 +939,6 @@ static int vfio_save_complete_precopy_thread(char *idstr, > > VFIODevice *vbasedev = opaque; > > VFIOMigration *migration = vbasedev->migration; > > int ret; > > - g_autofree VFIODeviceStatePacket *packet = NULL; > > uint32_t idx; > > > > if (!migration->multifd_transfer) { > > @@ -954,21 +956,25 @@ static int vfio_save_complete_precopy_thread(char *idstr, > > goto ret_finish; > > } > > > > - packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); > > - > > for (idx = 0; ; idx++) { > > + VFIODeviceStatePacket *packet; > > + MultiFDDeviceState_t *state; > > ssize_t data_size; > > size_t packet_size; > > + int64_t buf_size; > > > > if (qatomic_read(abort_flag)) { > > ret = -ECANCELED; > > goto ret_finish; > > } > > > > + state = multifd_device_state_prepare(idstr, instance_id); > > + packet = multifd_device_state_get_buffer(state, &buf_size); > > data_size = read(migration->data_fd, &packet->data, > > - migration->data_buffer_size); > > + buf_size - sizeof(*packet)); > > if (data_size < 0) { > > if (errno != ENOMSG) { > > + multifd_device_state_finish(state, NULL, 0); > > ret = -errno; > > goto ret_finish; > > } > > @@ -980,14 +986,15 @@ static int vfio_save_complete_precopy_thread(char *idstr, > > data_size = 0; > > } > > > > - if (data_size == 0) > > + if (data_size == 0) { > > + multifd_device_state_finish(state, NULL, 0); > > break; > > + } > > > > packet->idx = idx; > > packet_size = sizeof(*packet) + data_size; > > > > - if 
(!multifd_queue_device_state(idstr, instance_id, > > - (char *)packet, packet_size)) { > > + if (!multifd_device_state_finish(state, packet, packet_size)) { > > ret = -1; > > goto ret_finish; > > } > > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c > > index cfd0465bac..6f0259426d 100644 > > --- a/migration/multifd-device-state.c > > +++ b/migration/multifd-device-state.c > > @@ -20,15 +20,18 @@ ThreadPool *send_threads; > > int send_threads_ret; > > bool send_threads_abort; > > > > +#define MULTIFD_DEVICE_STATE_BUFLEN (1UL << 20) > > + > > static MultiFDSendData *device_state_send; > > > > -/* TODO: use static buffers for idstr and buf */ > > void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state) > > { > > + device_state->buf_prealloc = g_malloc0(MULTIFD_DEVICE_STATE_BUFLEN); > > } > > > > void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state) > > { > > + g_clear_pointer(&device_state->buf_prealloc, g_free); > > } > > > > void multifd_device_state_save_setup(void) > > @@ -42,12 +45,6 @@ void multifd_device_state_save_setup(void) > > send_threads_abort = false; > > } > > > > -void multifd_device_state_clear(MultiFDDeviceState_t *device_state) > > -{ > > - device_state->idstr = NULL; > > - g_clear_pointer(&device_state->buf, g_free); > > -} > > - > > void multifd_device_state_save_cleanup(void) > > { > > g_clear_pointer(&send_threads, thread_pool_free); > > @@ -89,33 +86,89 @@ void multifd_device_state_send_prepare(MultiFDSendParams *p) > > multifd_device_state_fill_packet(p); > > } > > > > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > > - char *data, size_t len) > > +/* > > + * Prepare to send some device state via multifd. Returns the current idle > > + * MultiFDDeviceState_t*. > > + * > > + * As a follow up, the caller must call multifd_device_state_finish() to > > + * release the resources. 
> > + * > > + * One example usage of the API: > > + * > > + * // Fetch a free multifd device state object > > + * state = multifd_device_state_prepare(idstr, instance_id); > > + * > > + * // Optional: fetch the buffer to reuse > > + * buf = multifd_device_state_get_buffer(state, &buf_size); > > + * > > + * // Here len>0 means success, otherwise failure > > + * len = buffer_fill(buf, buf_size); > > + * > > + * // Finish the transaction, either enqueue or cancel the request. Here > > + * // len>0 will enqueue, <=0 will cancel. > > + * multifd_device_state_finish(state, buf, len); > > + */ > > +MultiFDDeviceState_t * > > +multifd_device_state_prepare(char *idstr, uint32_t instance_id) > > { > > - /* Device state submissions can come from multiple threads */ > > - QEMU_LOCK_GUARD(&queue_job_mutex); > > MultiFDDeviceState_t *device_state; > > > > assert(multifd_payload_empty(device_state_send)); > > > > - multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); > > + /* > > + * TODO: The lock name may need change, but I'm reusing just for > > + * simplicity. > > + */ > > + qemu_mutex_lock(&queue_job_mutex); > > + > > device_state = &device_state_send->u.device_state; > > /* > > - * NOTE: here we must use a static idstr (e.g. of a savevm state > > - * entry) rather than any dynamically allocated buffer, because multifd > > + * NOTE: here we must use a static idstr (e.g. of a savevm state entry) > > + * rather than any dynamically allocated buffer, because multifd > > * assumes this pointer is always valid! 
> > */ > > device_state->idstr = idstr; > > device_state->instance_id = instance_id; > > - device_state->buf = g_memdup2(data, len); > > - device_state->buf_len = len; > > > > - if (!multifd_send(&device_state_send)) { > > - multifd_send_data_clear(device_state_send); > > - return false; > > + return &device_state_send->u.device_state; > > +} > > + > > +/* > > + * Need to be used after a previous call to multifd_device_state_prepare(), > > + * the buffer must not be used after invoke multifd_device_state_finish(). > > + */ > > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s, > > + int64_t *buf_len) > > +{ > > + *buf_len = MULTIFD_DEVICE_STATE_BUFLEN; > > + return s->buf_prealloc; > > +} > > + > > +/* > > + * Need to be used only in pair with a previous call to > > + * multifd_device_state_prepare(). Returns true if enqueue successful, > > + * false otherwise. > > + */ > > +bool multifd_device_state_finish(MultiFDDeviceState_t *state, > > + void *buf, int64_t buf_len) > > +{ > > + bool result = false; > > + > > + /* Currently we only have one global free buffer */ > > + assert(state == &device_state_send->u.device_state); > > + > > + if (buf_len < 0) { > > + goto out; > > } > > > > - return true; > > + multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); > > + /* This normally will be the state->buf_prealloc, but not required */ > > + state->buf = buf; > > + state->buf_len = buf_len; > > + result = multifd_send(&device_state_send); > > +out: > > + qemu_mutex_unlock(&queue_job_mutex); > > + return result; > > } > > > > bool migration_has_device_state_support(void) > > diff --git a/migration/multifd.c b/migration/multifd.c > > index 5a20b831cf..2b5185e298 100644 > > --- a/migration/multifd.c > > +++ b/migration/multifd.c > > @@ -115,15 +115,6 @@ void multifd_send_data_clear(MultiFDSendData *data) > > return; > > } > > > > - switch (data->type) { > > - case MULTIFD_PAYLOAD_DEVICE_STATE: > > - 
multifd_device_state_clear(&data->u.device_state); > > - break; > > - default: > > - /* Nothing to do */ > > - break; > > - } > > - > > data->type = MULTIFD_PAYLOAD_NONE; > > } > > > > -- > > 2.45.0 >
Peter Xu <peterx@redhat.com> writes: > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote: >> Peter Xu <peterx@redhat.com> writes: >> >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you >> understand the rework is a little frustrating. > > That's OK. > > [For some reason my email sync program decided to give up working for > hours. I got more time looking at a tsc bug, which is good, but then I > miss a lot of emails..] > >> >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: >> >> > +size_t multifd_device_state_payload_size(void) >> >> > +{ >> >> > + return sizeof(MultiFDDeviceState_t); >> >> > +} >> >> >> >> This will not be necessary because the payload size is the same as the >> >> data type. We only need it for the special case where the MultiFDPages_t >> >> is smaller than the total ram payload size. >> > >> > Today I was thinking maybe we should really clean this up, as the current >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested >> > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made >> > that feeling stronger. >> >> If we're going to commit bad code and then rewrite it a week later, we >> could have just let the original series from Maciej merge without any of >> this. > > Why it's "bad code"? > > It runs pretty well, I don't think it's bad code. You wrote it, and I > don't think it's bad at all. Code that forces us to do arithmetic in order to properly allocate memory and comes with a big comment explaining how we're dodging compiler warnings is bad code in my book. > > But now we're rethinking after reading Maciej's new series. >Personally I don't think it's a major problem. > > Note that we're not changing the design back: what was initially proposed > was the client submitting an array of multifd objects. I still don't think > that's right. 
> > What the change goes so far is we make the union a struct, however that's > > still N+2 objects not 2*N, where 2 came from RAM+VFIO. I think the > > important bits are still there (from your previous refactor series). > > You fail to appreciate that before the RFC series, multifd already allocated N for the pages. The device state adds another client, so that would be another N anyway. The problem the RFC tried to solve was that multifd channels owned that 2N, so the array was added to move the memory into the client's ownership. IOW, it wasn't even the RFC series that made it 2N, that was the multifd design all along. Now in hindsight I don't think we should have gone with the memory saving discussion. >> I already suggested it a couple of times, we shouldn't be doing >> core refactorings underneath contributors' patches, this is too >> fragile. Just let people contribute their code and we can change it >> later. > I sincerely don't think a lot needs changing... only patch 1. Basically > patch 1 on top of your previous rework series will be at least what I want, > but I'm open to comments from you guys. Don't get me wrong, I'm very much in favor of what you're doing here. However, I don't think it's ok to be backtracking on our design while other people have series in flight that depend on it. You certainly know the feeling of trying to merge a feature and having maintainers ask you to rewrite a bunch of code just to be able to start working. That's not ideal. I tried to quickly insert the RFC series before the device state series progressed too much, but it's 3 months later and we're still discussing it, maybe we don't need to do it this way. And ok, let's consider the current situation a special case. But I would like to avoid in the future this kind of uncertainty. > > Note that patch 2-3 will be on top of Maciej's changes and they're totally > not relevant to what we merged so far. Hence, nothing relevant there to > what you worked.
And this is the diff of patch 1: > > migration/multifd.h | 16 +++++++++++----- > migration/multifd-device-state.c | 8 ++++++-- > migration/multifd-nocomp.c | 13 ++++++------- > migration/multifd.c | 25 ++++++------------------- > 4 files changed, 29 insertions(+), 33 deletions(-) > > It's only 33 lines removed (many of which are comments..), it's not a huge > lot. I don't know why you feel so bad at this... > > It's probably because we maintain migration together, or we can keep our > own way of work. I don't think we did anything wrong yet so far. > > We can definitely talk about this in next 1:1. > >> >> This is also why I've been trying hard to separate core multifd >> functionality from migration code that uses multifd to transmit their >> data. >> >> My original RFC plus the suggestion to extend multifd_ops for device >> state would have (almost) made it so that no client code would be left >> in multifd. We could have been turning this thing upside down and it >> wouldn't affect anyone in terms of code conflicts. > > Do you mean you preferred the 2*N approach? > 2*N, where N is usually not larger than 32 and the payload size is 1k. Yes, I'd trade that off no problem. >> >> The ship has already sailed, so your patches below are fine, I have just >> some small comments. > > I'm not sure what you meant about "ship sailed", but we should merge code > whenever we think is the most correct. As you put above, I agree that the important bits of the original series have been preserved, but other secondary goals were lost, such as the more abstract separation between multifd & client code and that is the ship that has sailed. That series was not: "introduce this array for no reason", we also lost the ability to abstract the payload from the multifd threads when we dropped the .alloc_fn callback for instance. 
The last patch you posted here now adds multifd_device_state_prepare, somewhat ignoring that the ram code also has the same pattern and it could be made to use the same API. I did accept your premise that ram+compression is one thing while device_state is another, so I'm not asking it to be changed, just pointing out that the RFC series also addressed those issues. I might not have made that clear back then. > I hope you meant after below all things look the best, or please shoot. > That's exactly what I'm requesting for as comments. What you have here is certainly an improvement from the current state. I'm just ranting about the path we took here. >> >> > >> > I think we should change it now perhaps, otherwise we'll need to introduce >> > other helpers to e.g. reset the device buffers, and that's not only slow >> > but also not good looking, IMO. >> >> I agree that part is kind of a sore thumb. >> >> > >> > So I went ahead with the idea in previous discussion, that I managed to >> > change the SendData union into struct; the memory consumption is not super >> > important yet, IMHO, but we should still stick with the object model where >> > multifd enqueue thread switch buffer with multifd, as it still sounds a >> > sane way to do. >> > >> > Then when that patch is ready, I further tried to make VFIO reuse multifd >> > buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we >> > don't allocate it every time we enqueue. >> > >> > I hope it'll also work for VFIO. VFIO has a specialty on being able to >> > dump the config space so it's more complex (and I noticed Maciej's current >> > design requires the final chunk of VFIO config data be migrated in one >> > packet.. that is also part of the complexity there). So I allowed that >> > part to allocate a buffer but only that. IOW, I made some API (see below) >> > that can either reuse preallocated buffer, or use a separate one only for >> > the final bulk. 
>> > >> > In short, could both of you have a look at what I came up with below? I >> > did that in patches because I think it's too much to comment, so patches >> > may work better. No concern if any of below could be good changes to you, >> > then either Maciej can squash whatever into existing patches (and I feel >> > like some existing patches in this series can go away with below design), >> > or I can post pre-requisite patch but only if any of you prefer that. >> > >> > Anyway, let me know, the patches apply on top of this whole series applied >> > first. >> > >> > I also wonder whether there can be any perf difference already (I tested >> > all multifd qtest with below, but no VFIO I can run), perhaps not that >> > much, but just to mention below should avoid both buffer allocations and >> > one round of copy (so VFIO read() directly writes to the multifd buffers >> > now). >> > >> > Thanks, >> > >> > ==========8<========== >> > From a6cbcf692b2376e72cc053219d67bb32eabfb7a6 Mon Sep 17 00:00:00 2001 >> > From: Peter Xu <peterx@redhat.com> >> > Date: Tue, 10 Sep 2024 12:10:59 -0400 >> > Subject: [PATCH 1/3] migration/multifd: Make MultiFDSendData a struct >> > >> > The newly introduced device state buffer can be used for either storing >> > VFIO's read() raw data, but already also possible to store generic device >> > states. After noticing that device states may not easily provide a max >> > buffer size (also the fact that RAM MultiFDPages_t after all also want to >> > have flexibility on managing offset[] array), it may not be a good idea to >> > stick with union on MultiFDSendData.. as it won't play well with such >> > flexibility. >> > >> > Switch MultiFDSendData to a struct. >> > >> > It won't consume a lot more space in reality, after all the real buffers >> > were already dynamically allocated, so it's so far only about the two >> > structs (pages, device_state) that will be duplicated, but they're small. 
>> > >> > With this, we can remove the pretty hard to understand alloc size logic. >> > Because now we can allocate offset[] together with the SendData, and >> > properly free it when the SendData is freed. >> > >> > Signed-off-by: Peter Xu <peterx@redhat.com> >> > --- >> > migration/multifd.h | 16 +++++++++++----- >> > migration/multifd-device-state.c | 8 ++++++-- >> > migration/multifd-nocomp.c | 13 ++++++------- >> > migration/multifd.c | 25 ++++++------------------- >> > 4 files changed, 29 insertions(+), 33 deletions(-) >> > >> > diff --git a/migration/multifd.h b/migration/multifd.h >> > index c15c83104c..47203334b9 100644 >> > --- a/migration/multifd.h >> > +++ b/migration/multifd.h >> > @@ -98,9 +98,13 @@ typedef struct { >> > uint32_t num; >> > /* number of normal pages */ >> > uint32_t normal_num; >> > + /* >> > + * Pointer to the ramblock. NOTE: it's caller's responsibility to make >> > + * sure the pointer is always valid! >> > + */ >> >> This could use some rewording, it's not clear what "caller" means here. >> >> > RAMBlock *block; >> > - /* offset of each page */ >> > - ram_addr_t offset[]; >> > + /* offset array of each page, managed by multifd */ >> >> I'd drop the part after the comma, it's not very accurate and also gives >> an impression that something sophisticated is being done to this. 
>> >> > + ram_addr_t *offset; >> > } MultiFDPages_t; >> > >> > struct MultiFDRecvData { >> > @@ -123,7 +127,7 @@ typedef enum { >> > MULTIFD_PAYLOAD_DEVICE_STATE, >> > } MultiFDPayloadType; >> > >> > -typedef union MultiFDPayload { >> > +typedef struct MultiFDPayload { >> > MultiFDPages_t ram; >> > MultiFDDeviceState_t device_state; >> > } MultiFDPayload; >> > @@ -323,11 +327,13 @@ static inline uint32_t multifd_ram_page_count(void) >> > void multifd_ram_save_setup(void); >> > void multifd_ram_save_cleanup(void); >> > int multifd_ram_flush_and_sync(void); >> > -size_t multifd_ram_payload_size(void); >> > +void multifd_ram_payload_alloc(MultiFDPages_t *pages); >> > +void multifd_ram_payload_free(MultiFDPages_t *pages); >> > void multifd_ram_fill_packet(MultiFDSendParams *p); >> > int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); >> > >> > -size_t multifd_device_state_payload_size(void); >> > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state); >> > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state); >> > void multifd_device_state_save_setup(void); >> > void multifd_device_state_clear(MultiFDDeviceState_t *device_state); >> > void multifd_device_state_save_cleanup(void); >> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c >> > index 9b364e8ef3..72b72b6e62 100644 >> > --- a/migration/multifd-device-state.c >> > +++ b/migration/multifd-device-state.c >> > @@ -22,9 +22,13 @@ bool send_threads_abort; >> > >> > static MultiFDSendData *device_state_send; >> > >> > -size_t multifd_device_state_payload_size(void) >> > +/* TODO: use static buffers for idstr and buf */ >> > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state) >> > +{ >> > +} >> > + >> > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state) >> > { >> > - return sizeof(MultiFDDeviceState_t); >> > } >> > >> > void multifd_device_state_save_setup(void) >> > diff --git 
a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c >> > index 0b7b543f44..c1b95fee0d 100644 >> > --- a/migration/multifd-nocomp.c >> > +++ b/migration/multifd-nocomp.c >> > @@ -22,15 +22,14 @@ >> > >> > static MultiFDSendData *multifd_ram_send; >> > >> > -size_t multifd_ram_payload_size(void) >> > +void multifd_ram_payload_alloc(MultiFDPages_t *pages) >> > { >> > - uint32_t n = multifd_ram_page_count(); >> > + pages->offset = g_new0(ram_addr_t, multifd_ram_page_count()); >> > +} >> > >> > - /* >> > - * We keep an array of page offsets at the end of MultiFDPages_t, >> > - * add space for it in the allocation. >> > - */ >> > - return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t); >> > +void multifd_ram_payload_free(MultiFDPages_t *pages) >> > +{ >> > + g_clear_pointer(&pages->offset, g_free); >> > } >> > >> > void multifd_ram_save_setup(void) >> > diff --git a/migration/multifd.c b/migration/multifd.c >> > index bebe5b5a9b..5a20b831cf 100644 >> > --- a/migration/multifd.c >> > +++ b/migration/multifd.c >> > @@ -101,26 +101,12 @@ struct { >> > >> > MultiFDSendData *multifd_send_data_alloc(void) >> > { >> > - size_t max_payload_size, size_minus_payload; >> > + MultiFDSendData *new = g_new0(MultiFDSendData, 1); >> > >> > - /* >> > - * MultiFDPages_t has a flexible array at the end, account for it >> > - * when allocating MultiFDSendData. Use max() in case other types >> > - * added to the union in the future are larger than >> > - * (MultiFDPages_t + flex array). >> > - */ >> > - max_payload_size = MAX(multifd_ram_payload_size(), >> > - multifd_device_state_payload_size()); >> > - max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); >> > - >> > - /* >> > - * Account for any holes the compiler might insert. We can't pack >> > - * the structure because that misaligns the members and triggers >> > - * Waddress-of-packed-member. 
>> > - */ >> > - size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload); >> > + multifd_ram_payload_alloc(&new->u.ram); >> > + multifd_device_state_payload_alloc(&new->u.device_state); >> > >> > - return g_malloc0(size_minus_payload + max_payload_size); >> > + return new; >> > } >> > >> > void multifd_send_data_clear(MultiFDSendData *data) >> > @@ -147,7 +133,8 @@ void multifd_send_data_free(MultiFDSendData *data) >> > return; >> > } >> > >> > - multifd_send_data_clear(data); >> > + multifd_ram_payload_free(&data->u.ram); >> > + multifd_device_state_payload_free(&data->u.device_state); >> >> The "u" needs to be dropped now. Could just change to "p". Or can we now >> move the whole struct inside MultiFDSendData? > > Yep, all your comments look good to me. > > A note here: I intentionally didn't touch "u." as that requires more > changes which doesn't help as me leaving "patch-styled comment". As I > said, feel free to see the patches as comments not patches for merging yet. > I / Maciej can prepare patch but only if the idea in general can be > accepted. > > For me as I mentioned patch 2-3 do not relevant much to current master > branch, afaiu, so if you guys like I can repost patch 1 with a formal one, > but only if Maciej thinks it's easier for him. > I don't mind either way. If it were a proper series, it could be fetched with b4, maybe that helps. >> >> > >> > g_free(data); >> > } >> > -- >> > 2.45.0 >> > >> > >> > >> > From 6695d134c0818f42183f5ea03c21e6d56e7b57ea Mon Sep 17 00:00:00 2001 >> > From: Peter Xu <peterx@redhat.com> >> > Date: Tue, 10 Sep 2024 12:24:14 -0400 >> > Subject: [PATCH 2/3] migration/multifd: Optimize device_state->idstr updates >> > >> > The duplication / allocation of idstr for each VFIO blob is an overkill, as >> > idstr isn't something that changes frequently. Also, the idstr always came >> > from the upper layer of se->idstr so it's always guaranteed to >> > exist (e.g. no device unplug allowed during migration). 
>> > >> > Signed-off-by: Peter Xu <peterx@redhat.com> >> > --- >> > migration/multifd.h | 4 ++++ >> > migration/multifd-device-state.c | 10 +++++++--- >> > 2 files changed, 11 insertions(+), 3 deletions(-) >> > >> > diff --git a/migration/multifd.h b/migration/multifd.h >> > index 47203334b9..1eaa5d4496 100644 >> > --- a/migration/multifd.h >> > +++ b/migration/multifd.h >> > @@ -115,6 +115,10 @@ struct MultiFDRecvData { >> > }; >> > >> > typedef struct { >> > + /* >> > + * Name of the owner device. NOTE: it's caller's responsibility to >> > + * make sure the pointer is always valid! >> > + */ >> >> Same comment as the other one here. >> >> > char *idstr; >> > uint32_t instance_id; >> > char *buf; >> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c >> > index 72b72b6e62..cfd0465bac 100644 >> > --- a/migration/multifd-device-state.c >> > +++ b/migration/multifd-device-state.c >> > @@ -44,7 +44,7 @@ void multifd_device_state_save_setup(void) >> > >> > void multifd_device_state_clear(MultiFDDeviceState_t *device_state) >> > { >> > - g_clear_pointer(&device_state->idstr, g_free); >> > + device_state->idstr = NULL; >> > g_clear_pointer(&device_state->buf, g_free); >> > } >> > >> > @@ -100,7 +100,12 @@ bool multifd_queue_device_state(char *idstr, uint32_t instance_id, >> > >> > multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); >> > device_state = &device_state_send->u.device_state; >> > - device_state->idstr = g_strdup(idstr); >> > + /* >> > + * NOTE: here we must use a static idstr (e.g. of a savevm state >> > + * entry) rather than any dynamically allocated buffer, because multifd >> > + * assumes this pointer is always valid! 
>> > + */ >> > + device_state->idstr = idstr; >> > device_state->instance_id = instance_id; >> > device_state->buf = g_memdup2(data, len); >> > device_state->buf_len = len; >> > @@ -137,7 +142,6 @@ static void multifd_device_state_save_thread_data_free(void *opaque) >> > { >> > struct MultiFDDSSaveThreadData *data = opaque; >> > >> > - g_clear_pointer(&data->idstr, g_free); >> > g_free(data); >> > } >> > >> > -- >> > 2.45.0 >> > >> > >> > From abfea9698ff46ad0e0175e1dcc6e005e0b2ece2a Mon Sep 17 00:00:00 2001 >> > From: Peter Xu <peterx@redhat.com> >> > Date: Tue, 10 Sep 2024 12:27:49 -0400 >> > Subject: [PATCH 3/3] migration/multifd: Optimize device_state buffer >> > allocations >> > >> > Provide a device_state->buf_prealloc so that the buffers can be reused if >> > possible. Provide a set of APIs to use it right. Please see the >> > documentation for the API in the code. >> > >> > The default buffer size came from VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE as of >> > now. >> > >> > Signed-off-by: Peter Xu <peterx@redhat.com> >> > --- >> > include/hw/vfio/vfio-common.h | 9 ++++ >> > include/migration/misc.h | 12 ++++- >> > migration/multifd.h | 11 +++- >> > hw/vfio/migration.c | 43 ++++++++------- >> > migration/multifd-device-state.c | 93 +++++++++++++++++++++++++------- >> > migration/multifd.c | 9 ---- >> > 6 files changed, 126 insertions(+), 51 deletions(-) >> > >> > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> > index 4578a0ca6a..c1f2f4ae55 100644 >> > --- a/include/hw/vfio/vfio-common.h >> > +++ b/include/hw/vfio/vfio-common.h >> > @@ -61,6 +61,13 @@ typedef struct VFIORegion { >> > uint8_t nr; /* cache the region number for debug */ >> > } VFIORegion; >> > >> > +typedef struct VFIODeviceStatePacket { >> > + uint32_t version; >> > + uint32_t idx; >> > + uint32_t flags; >> > + uint8_t data[0]; >> > +} QEMU_PACKED VFIODeviceStatePacket; >> > + >> > typedef struct VFIOMigration { >> > struct VFIODevice *vbasedev; >> > VMChangeStateEntry 
*vm_state; >> > @@ -168,6 +175,8 @@ typedef struct VFIODevice { >> > int devid; >> > IOMMUFDBackend *iommufd; >> > VFIOIOASHwpt *hwpt; >> > + /* Only used on sender side when multifd is enabled */ >> > + VFIODeviceStatePacket *multifd_packet; >> > QLIST_ENTRY(VFIODevice) hwpt_next; >> > } VFIODevice; >> > >> > diff --git a/include/migration/misc.h b/include/migration/misc.h >> > index 26f7f3140f..1a8676ed3d 100644 >> > --- a/include/migration/misc.h >> > +++ b/include/migration/misc.h >> > @@ -112,8 +112,16 @@ bool migration_in_bg_snapshot(void); >> > void dirty_bitmap_mig_init(void); >> > >> > /* migration/multifd-device-state.c */ >> > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id, >> > - char *data, size_t len); >> > +struct MultiFDDeviceState_t; >> > +typedef struct MultiFDDeviceState_t MultiFDDeviceState_t; >> > + >> > +MultiFDDeviceState_t * >> > +multifd_device_state_prepare(char *idstr, uint32_t instance_id); >> > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s, >> > + int64_t *buf_len); >> > +bool multifd_device_state_finish(MultiFDDeviceState_t *state, >> > + void *buf, int64_t buf_len); >> > + >> > bool migration_has_device_state_support(void); >> > >> > void >> > diff --git a/migration/multifd.h b/migration/multifd.h >> > index 1eaa5d4496..1ccdeeb8c5 100644 >> > --- a/migration/multifd.h >> > +++ b/migration/multifd.h >> > @@ -15,6 +15,7 @@ >> > >> > #include "exec/target_page.h" >> > #include "ram.h" >> > +#include "migration/misc.h" >> > >> > typedef struct MultiFDRecvData MultiFDRecvData; >> > typedef struct MultiFDSendData MultiFDSendData; >> > @@ -114,16 +115,22 @@ struct MultiFDRecvData { >> > off_t file_offset; >> > }; >> > >> > -typedef struct { >> > +struct MultiFDDeviceState_t { >> > /* >> > * Name of the owner device. NOTE: it's caller's responsibility to >> > * make sure the pointer is always valid! 
>> > */ >> > char *idstr; >> > uint32_t instance_id; >> > + /* >> > + * Points to the buffer to send via multifd. Normally it's the same as >> > + * buf_prealloc, otherwise the caller needs to make sure the buffer is >> > + * avaliable through multifd running. >> >> "throughout multifd runtime" maybe. >> >> > + */ >> > char *buf; >> > + char *buf_prealloc; >> > size_t buf_len; >> > -} MultiFDDeviceState_t; >> > +}; >> > >> > typedef enum { >> > MULTIFD_PAYLOAD_NONE, >> > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c >> > index 67996aa2df..e36422b7c5 100644 >> > --- a/hw/vfio/migration.c >> > +++ b/hw/vfio/migration.c >> > @@ -59,13 +59,6 @@ >> > >> > #define VFIO_DEVICE_STATE_CONFIG_STATE (1) >> > >> > -typedef struct VFIODeviceStatePacket { >> > - uint32_t version; >> > - uint32_t idx; >> > - uint32_t flags; >> > - uint8_t data[0]; >> > -} QEMU_PACKED VFIODeviceStatePacket; >> > - >> > static int64_t bytes_transferred; >> > >> > static const char *mig_state_to_str(enum vfio_device_mig_state state) >> > @@ -741,6 +734,9 @@ static void vfio_save_cleanup(void *opaque) >> > migration->initial_data_sent = false; >> > vfio_migration_cleanup(vbasedev); >> > trace_vfio_save_cleanup(vbasedev->name); >> > + if (vbasedev->multifd_packet) { >> > + g_clear_pointer(&vbasedev->multifd_packet, g_free); >> > + } >> > } >> > >> > static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy, >> > @@ -892,7 +888,8 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas >> > g_autoptr(QIOChannelBuffer) bioc = NULL; >> > QEMUFile *f = NULL; >> > int ret; >> > - g_autofree VFIODeviceStatePacket *packet = NULL; >> > + VFIODeviceStatePacket *packet; >> > + MultiFDDeviceState_t *state; >> > size_t packet_len; >> > >> > bioc = qio_channel_buffer_new(0); >> > @@ -911,13 +908,19 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas >> > } >> > >> > packet_len = sizeof(*packet) + bioc->usage; >> > - packet = 
g_malloc0(packet_len); >> > + >> > + state = multifd_device_state_prepare(idstr, instance_id); >> > + /* >> > + * Do not reuse multifd buffer, but use our own due to random size. >> > + * The buffer will be freed only when save cleanup. >> > + */ >> > + vbasedev->multifd_packet = g_malloc0(packet_len); >> > + packet = vbasedev->multifd_packet; >> > packet->idx = idx; >> > packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; >> > memcpy(&packet->data, bioc->data, bioc->usage); >> > >> > - if (!multifd_queue_device_state(idstr, instance_id, >> > - (char *)packet, packet_len)) { >> > + if (!multifd_device_state_finish(state, packet, packet_len)) { >> > ret = -1; >> > } >> > >> > @@ -936,7 +939,6 @@ static int vfio_save_complete_precopy_thread(char *idstr, >> > VFIODevice *vbasedev = opaque; >> > VFIOMigration *migration = vbasedev->migration; >> > int ret; >> > - g_autofree VFIODeviceStatePacket *packet = NULL; >> > uint32_t idx; >> > >> > if (!migration->multifd_transfer) { >> > @@ -954,21 +956,25 @@ static int vfio_save_complete_precopy_thread(char *idstr, >> > goto ret_finish; >> > } >> > >> > - packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); >> > - >> > for (idx = 0; ; idx++) { >> > + VFIODeviceStatePacket *packet; >> > + MultiFDDeviceState_t *state; >> > ssize_t data_size; >> > size_t packet_size; >> > + int64_t buf_size; >> > >> > if (qatomic_read(abort_flag)) { >> > ret = -ECANCELED; >> > goto ret_finish; >> > } >> > >> > + state = multifd_device_state_prepare(idstr, instance_id); >> > + packet = multifd_device_state_get_buffer(state, &buf_size); >> > data_size = read(migration->data_fd, &packet->data, >> > - migration->data_buffer_size); >> > + buf_size - sizeof(*packet)); >> > if (data_size < 0) { >> > if (errno != ENOMSG) { >> > + multifd_device_state_finish(state, NULL, 0); >> > ret = -errno; >> > goto ret_finish; >> > } >> > @@ -980,14 +986,15 @@ static int vfio_save_complete_precopy_thread(char *idstr, >> > data_size = 0; >> > } >> > >> > - 
if (data_size == 0) >> > + if (data_size == 0) { >> > + multifd_device_state_finish(state, NULL, 0); >> > break; >> > + } >> > >> > packet->idx = idx; >> > packet_size = sizeof(*packet) + data_size; >> > >> > - if (!multifd_queue_device_state(idstr, instance_id, >> > - (char *)packet, packet_size)) { >> > + if (!multifd_device_state_finish(state, packet, packet_size)) { >> > ret = -1; >> > goto ret_finish; >> > } >> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c >> > index cfd0465bac..6f0259426d 100644 >> > --- a/migration/multifd-device-state.c >> > +++ b/migration/multifd-device-state.c >> > @@ -20,15 +20,18 @@ ThreadPool *send_threads; >> > int send_threads_ret; >> > bool send_threads_abort; >> > >> > +#define MULTIFD_DEVICE_STATE_BUFLEN (1UL << 20) >> > + >> > static MultiFDSendData *device_state_send; >> > >> > -/* TODO: use static buffers for idstr and buf */ >> > void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state) >> > { >> > + device_state->buf_prealloc = g_malloc0(MULTIFD_DEVICE_STATE_BUFLEN); >> > } >> > >> > void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state) >> > { >> > + g_clear_pointer(&device_state->buf_prealloc, g_free); >> > } >> > >> > void multifd_device_state_save_setup(void) >> > @@ -42,12 +45,6 @@ void multifd_device_state_save_setup(void) >> > send_threads_abort = false; >> > } >> > >> > -void multifd_device_state_clear(MultiFDDeviceState_t *device_state) >> > -{ >> > - device_state->idstr = NULL; >> > - g_clear_pointer(&device_state->buf, g_free); >> > -} >> > - >> > void multifd_device_state_save_cleanup(void) >> > { >> > g_clear_pointer(&send_threads, thread_pool_free); >> > @@ -89,33 +86,89 @@ void multifd_device_state_send_prepare(MultiFDSendParams *p) >> > multifd_device_state_fill_packet(p); >> > } >> > >> > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id, >> > - char *data, size_t len) >> > +/* >> > + * Prepare to send some device 
state via multifd. Returns the current idle >> > + * MultiFDDeviceState_t*. >> > + * >> > + * As a follow up, the caller must call multifd_device_state_finish() to >> > + * release the resources. >> > + * >> > + * One example usage of the API: >> > + * >> > + * // Fetch a free multifd device state object >> > + * state = multifd_device_state_prepare(idstr, instance_id); >> > + * >> > + * // Optional: fetch the buffer to reuse >> > + * buf = multifd_device_state_get_buffer(state, &buf_size); >> > + * >> > + * // Here len>0 means success, otherwise failure >> > + * len = buffer_fill(buf, buf_size); >> > + * >> > + * // Finish the transaction, either enqueue or cancel the request. Here >> > + * // len>0 will enqueue, <=0 will cancel. >> > + * multifd_device_state_finish(state, buf, len); >> > + */ >> > +MultiFDDeviceState_t * >> > +multifd_device_state_prepare(char *idstr, uint32_t instance_id) >> > { >> > - /* Device state submissions can come from multiple threads */ >> > - QEMU_LOCK_GUARD(&queue_job_mutex); >> > MultiFDDeviceState_t *device_state; >> > >> > assert(multifd_payload_empty(device_state_send)); >> > >> > - multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); >> > + /* >> > + * TODO: The lock name may need change, but I'm reusing just for >> > + * simplicity. >> > + */ >> > + qemu_mutex_lock(&queue_job_mutex); >> > + >> > device_state = &device_state_send->u.device_state; >> > /* >> > - * NOTE: here we must use a static idstr (e.g. of a savevm state >> > - * entry) rather than any dynamically allocated buffer, because multifd >> > + * NOTE: here we must use a static idstr (e.g. of a savevm state entry) >> > + * rather than any dynamically allocated buffer, because multifd >> > * assumes this pointer is always valid! 
>> > */ >> > device_state->idstr = idstr; >> > device_state->instance_id = instance_id; >> > - device_state->buf = g_memdup2(data, len); >> > - device_state->buf_len = len; >> > >> > - if (!multifd_send(&device_state_send)) { >> > - multifd_send_data_clear(device_state_send); >> > - return false; >> > + return &device_state_send->u.device_state; >> > +} >> > + >> > +/* >> > + * Need to be used after a previous call to multifd_device_state_prepare(), >> > + * the buffer must not be used after invoke multifd_device_state_finish(). >> > + */ >> > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s, >> > + int64_t *buf_len) >> > +{ >> > + *buf_len = MULTIFD_DEVICE_STATE_BUFLEN; >> > + return s->buf_prealloc; >> > +} >> > + >> > +/* >> > + * Need to be used only in pair with a previous call to >> > + * multifd_device_state_prepare(). Returns true if enqueue successful, >> > + * false otherwise. >> > + */ >> > +bool multifd_device_state_finish(MultiFDDeviceState_t *state, >> > + void *buf, int64_t buf_len) >> > +{ >> > + bool result = false; >> > + >> > + /* Currently we only have one global free buffer */ >> > + assert(state == &device_state_send->u.device_state); >> > + >> > + if (buf_len < 0) { >> > + goto out; >> > } >> > >> > - return true; >> > + multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); >> > + /* This normally will be the state->buf_prealloc, but not required */ >> > + state->buf = buf; >> > + state->buf_len = buf_len; >> > + result = multifd_send(&device_state_send); >> > +out: >> > + qemu_mutex_unlock(&queue_job_mutex); >> > + return result; >> > } >> > >> > bool migration_has_device_state_support(void) >> > diff --git a/migration/multifd.c b/migration/multifd.c >> > index 5a20b831cf..2b5185e298 100644 >> > --- a/migration/multifd.c >> > +++ b/migration/multifd.c >> > @@ -115,15 +115,6 @@ void multifd_send_data_clear(MultiFDSendData *data) >> > return; >> > } >> > >> > - switch (data->type) { >> > - case 
MULTIFD_PAYLOAD_DEVICE_STATE: >> > - multifd_device_state_clear(&data->u.device_state); >> > - break; >> > - default: >> > - /* Nothing to do */ >> > - break; >> > - } >> > - >> > data->type = MULTIFD_PAYLOAD_NONE; >> > } >> > >> > -- >> > 2.45.0 >>
On Fri, Sep 13, 2024 at 10:21:39AM -0300, Fabiano Rosas wrote: > Peter Xu <peterx@redhat.com> writes: > > > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote: > >> Peter Xu <peterx@redhat.com> writes: > >> > >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you > >> understand the rework is a little frustrating. > > > > That's OK. > > > > [For some reason my email sync program decided to give up working for > > hours. I got more time looking at a tsc bug, which is good, but then I > > miss a lot of emails..] > > > >> > >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: > >> >> > +size_t multifd_device_state_payload_size(void) > >> >> > +{ > >> >> > + return sizeof(MultiFDDeviceState_t); > >> >> > +} > >> >> > >> >> This will not be necessary because the payload size is the same as the > >> >> data type. We only need it for the special case where the MultiFDPages_t > >> >> is smaller than the total ram payload size. > >> > > >> > Today I was thinking maybe we should really clean this up, as the current > >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested > >> > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr > >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made > >> > that feeling stronger. > >> > >> If we're going to commit bad code and then rewrite it a week later, we > >> could have just let the original series from Maciej merge without any of > >> this. > > > > Why it's "bad code"? > > > > It runs pretty well, I don't think it's bad code. You wrote it, and I > > don't think it's bad at all. > > Code that forces us to do arithmetic in order to properly allocate > memory and comes with a big comment explaining how we're dodging > compiler warnings is bad code in my book. > > > > > But now we're rethinking after reading Maciej's new series. > >Personally I don't think it's a major problem. 
> > > > Note that we're not changing the design back: what was initially proposed > > was the client submitting an array of multifd objects. I still don't think > > that's right. > > > > What the change goes so far is we make the union a struct, however that's > > still N+2 objects not 2*N, where 2 came from RAM+VFIO. I think the > > important bits are still there (from your previous refactor series). > > > > You fail to appreciate that before the RFC series, multifd already > allocated N for the pages. It depends on how you see it, IMHO. I think it allocates N not for the "pages" but for the "threads", because the threads can be busy with those buffers, no matter if it's "page" or "device data". > The device state adds another client, so that > would be another N anyway. The problem the RFC tried to solve was that > multifd channels owned that 2N, so the array was added to move the > memory into the client's ownership. IOW, it wasn't even the RFC series > that made it 2N, that was the multifd design all along. Now in hindsight > I don't think we should have went with the memory saving discussion. I think I could have made that feeling that I only wanted to save memory, if so, I'm sorry. But do you still remember I mentioned "we can make it a struct, too" before your series landed? Then you think it's ok to keep using union, and I'm ok too! At least at that time. I don't think that's a huge deal. I don't think each route we go must be perfect, but we try the best to make it as good. I don't think any discussion must not happen. I agree memory consumption is not the 1st thing to worry, but I don't see why it can't be discussed. Note that I never said we can't save those memory either - I plan to have follow up patches (for this, after Maciej's series land.. that's why I even didn't yet mention) to allow modules report device state buffer size. I just didn't say, yet, and don't plan to worry before vfio series lands. 
When with that, we'll save 1M*N when no vfio device attached (but we'll need to handle hotplug). So I think we don't need to lose any finally. However I think we need to get straight with the base design on how vfio should use multifd, because it's important bit and I don't want to rework important bits after a huge feature, if we know a better directions. I don't even think what I proposed patch 1-3 here is a must to me, I should be clear again here just in case we have similar discussions afterwards.. that I'm ok with below be done after Maciej's: - Avoid memory allocations per-packet (done in patch 2-3) - Avoid unnecessary data copy (done in patch 2-3) - Avoid allocate device buffers when no device will use (not proposed) But I'm not ok building everything on top of the idea of not using multifd buffers in the current way, because that can involve a lot of changes: that's where buffer passes from up to down or backwards, and the interface needs change a lot too. We already have that in master so it's not a problem now. > > >> I already suggested it a couple of times, we shouldn't be doing > >> core refactorings underneath contributors' patches, this is too > >> fragile. Just let people contribute their code and we can change it > >> later. > > > > I sincerely don't think a lot needs changing... only patch 1. Basically > > patch 1 on top of your previous rework series will be at least what I want, > > but I'm open to comments from you guys. > > Don't get me wrong, I'm very much in favor of what you're doing > here. However, I don't think it's ok to be backtracking on our design > while other people have series in flight that depend on it. You > certainly know the feeling of trying to merge a feature and having > maintainers ask you to rewrite a bunch code just to be able to start > working. That's not ideal. I as a patch writer always like to do that when it's essential. 
Normally the case is I don't have enough reviewer resources to help me get a better design, or discuss about it. When vfio is the new user of multifd vfio needs to do the heavy lifting to draft the api. > > I tried to quickly insert the RFC series before the device state series > progressed too much, but it's 3 months later and we're still discussing > it, maybe we don't need to do it this way. Can I get that of your feeling from when you were working on mapped-ram? That series does take long enough, I agree. Not so bad yet with the VFIO series - it's good to have you around because you provide great reviews. I'm also trying the best to not let a series dangle for more than a year. I don't think 3 months is long with this feature: this is the 1st multifd extrenal user (and file mapping is also in another angle), it can take some time. Sorry if it's so, but sorry again I don't think I get convinced: I think we need to go this way to build blocks one by one, and we need to make sure lower blocks are hopefully solid enough to take the upper ones. Again I'm ok with small things that go against it, but not major designs. We shouldn't go rewrite major designs if we seem to know a better one. > > And ok, let's consider the current situation a special case. But I would > like to avoid in the future this kind of uncertainty. > > > > > Note that patch 2-3 will be on top of Maciej's changes and they're totally > > not relevant to what we merged so far. Hence, nothing relevant there to > > what you worked. And this is the diff of patch 1: > > > > migration/multifd.h | 16 +++++++++++----- > > migration/multifd-device-state.c | 8 ++++++-- > > migration/multifd-nocomp.c | 13 ++++++------- > > migration/multifd.c | 25 ++++++------------------- > > 4 files changed, 29 insertions(+), 33 deletions(-) > > > > It's only 33 lines removed (many of which are comments..), it's not a huge > > lot. I don't know why you feel so bad at this... 
> > > > It's probably because we maintain migration together, or we can keep our > > own way of work. I don't think we did anything wrong yet so far. > > > > We can definitely talk about this in next 1:1. > > > >> > >> This is also why I've been trying hard to separate core multifd > >> functionality from migration code that uses multifd to transmit their > >> data. > >> > >> My original RFC plus the suggestion to extend multifd_ops for device > >> state would have (almost) made it so that no client code would be left > >> in multifd. We could have been turning this thing upside down and it > >> wouldn't affect anyone in terms of code conflicts. > > > > Do you mean you preferred the 2*N approach? > > > > 2*N, where N is usually not larger than 32 and the payload size is > 1k. Yes, I'd trade that off no problem. I think it's a problem. Vdpa when involved with exactly the same pattern of how vfio uses it (as they're really alike underneath) then vdpa will need its own array of buffers, or it'll need to take the same vfio lock which doesn't make sense to me. N+2, or, N+M (M is the user) is the minimum buffers we need. N because multifd can be worst case 100% busy on all threads occupying the buffers. M because M users can be worst case 100% pre-filling. It's either about memory consumption, or about logical sensibility. > > >> > >> The ship has already sailed, so your patches below are fine, I have just > >> some small comments. > > > > I'm not sure what you meant about "ship sailed", but we should merge code > > whenever we think is the most correct. > > As you put above, I agree that the important bits of the original series > have been preserved, but other secondary goals were lost, such as the > more abstract separation between multifd & client code and that is the > ship that has sailed. 
> > That series was not: "introduce this array for no reason", we also lost > the ability to abstract the payload from the multifd threads when we > dropped the .alloc_fn callback for instance. The last patch you posted I don't remember the details there, but my memory was that it was too flexible while we seem to reach the consensus that we only process either RAM or device, nothing else. > here now adds multifd_device_state_prepare, somewhat ignoring that the > ram code also has the same pattern and it could be made to use the same > API. I need some further elaborations to understand. multifd_device_state_prepare currently does a few things: taking ownership of the temp device state object, fill in idstr / instance_id, taking the lock (so far is needed because we only have one device state object). None of them seems to be needed for RAM yet. Feel free to send a rfc patch if that helps. > > I did accept your premise that ram+compression is one thing while > device_state is another, so I'm not asking it to be changed, just > pointing out that the RFC series also addressed those issues. I might > not have made that clear back then. > > > I hope you meant after below all things look the best, or please shoot. > > That's exactly what I'm requesting for as comments. > > What you have here is certainly an improvement from the current > state. I'm just ranting about the path we took here.
Peter Xu <peterx@redhat.com> writes: > On Fri, Sep 13, 2024 at 10:21:39AM -0300, Fabiano Rosas wrote: >> Peter Xu <peterx@redhat.com> writes: >> >> > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote: >> >> Peter Xu <peterx@redhat.com> writes: >> >> >> >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you >> >> understand the rework is a little frustrating. >> > >> > That's OK. >> > >> > [For some reason my email sync program decided to give up working for >> > hours. I got more time looking at a tsc bug, which is good, but then I >> > miss a lot of emails..] >> > >> >> >> >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: >> >> >> > +size_t multifd_device_state_payload_size(void) >> >> >> > +{ >> >> >> > + return sizeof(MultiFDDeviceState_t); >> >> >> > +} >> >> >> >> >> >> This will not be necessary because the payload size is the same as the >> >> >> data type. We only need it for the special case where the MultiFDPages_t >> >> >> is smaller than the total ram payload size. >> >> > >> >> > Today I was thinking maybe we should really clean this up, as the current >> >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested >> >> > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr >> >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made >> >> > that feeling stronger. >> >> >> >> If we're going to commit bad code and then rewrite it a week later, we >> >> could have just let the original series from Maciej merge without any of >> >> this. >> > >> > Why it's "bad code"? >> > >> > It runs pretty well, I don't think it's bad code. You wrote it, and I >> > don't think it's bad at all. >> >> Code that forces us to do arithmetic in order to properly allocate >> memory and comes with a big comment explaining how we're dodging >> compiler warnings is bad code in my book. >> >> > >> > But now we're rethinking after reading Maciej's new series. 
>> >Personally I don't think it's a major problem. >> > >> > Note that we're not changing the design back: what was initially proposed >> > was the client submitting an array of multifd objects. I still don't think >> > that's right. >> > >> > What the change goes so far is we make the union a struct, however that's >> > still N+2 objects not 2*N, where 2 came from RAM+VFIO. I think the >> > important bits are still there (from your previous refactor series). >> > >> >> You fail to appreciate that before the RFC series, multifd already >> allocated N for the pages. > > It depends on how you see it, IMHO. I think it allocates N not for the > "pages" but for the "threads", because the threads can be busy with those > buffers, no matter if it's "page" or "device data". Each MultiFD*Params had a p->pages, so N channels, N p->pages. The device state series would add p->device_state, one per channel. So 2N + 2 (for the extra slot). > >> The device state adds another client, so that >> would be another N anyway. The problem the RFC tried to solve was that >> multifd channels owned that 2N, so the array was added to move the >> memory into the client's ownership. IOW, it wasn't even the RFC series >> that made it 2N, that was the multifd design all along. Now in hindsight >> I don't think we should have went with the memory saving discussion. > > I think I could have made that feeling that I only wanted to save memory, > if so, I'm sorry. But do you still remember I mentioned "we can make it a > struct, too" before your series landed? Then you think it's ok to keep > using union, and I'm ok too! At least at that time. I don't think that's a > huge deal. I don't think each route we go must be perfect, but we try the > best to make it as good. Yep, I did agree with all of this. I'm just saying I now think I shouldn't have. > > I don't think any discussion must not happen. I agree memory consumption > is not the 1st thing to worry, but I don't see why it can't be discussed. 
It can be discussed, sure, but then 3 months pass and we're still talking about it. Saved ~64k and spent 3 months. We could have just as well said: "let's do a pass to optimize memory consumption after the device state series is in". > > Note that I never said we can't save those memory either - I plan to have > follow up patches (for this, after Maciej's series land.. that's why I even > didn't yet mention) to allow modules report device state buffer size. I > just didn't say, yet, and don't plan to worry before vfio series lands. > When with that, we'll save 1M*N when no vfio device attached (but we'll > need to handle hotplug). So I think we don't need to lose any finally. > > However I think we need to get straight with the base design on how vfio > should use multifd, because it's important bit and I don't want to rework > important bits after a huge feature, if we know a better directions. > > I don't even think what I proposed patch 1-3 here is a must to me, I should > be clear again here just in case we have similar discussions > afterwards.. that I'm ok with below be done after Maciej's: > > - Avoid memory allocations per-packet (done in patch 2-3) > - Avoid unnecessary data copy (done in patch 2-3) > - Avoid allocate device buffers when no device will use (not proposed) > > But I'm not ok building everything on top of the idea of not using multifd > buffers in the current way, because that can involve a lot of changes: > that's where buffer passes from up to down or backwards, and the interface > needs change a lot too. We already have that in master so it's not a > problem now. > >> >> >> I already suggested it a couple of times, we shouldn't be doing >> >> core refactorings underneath contributors' patches, this is too >> >> fragile. Just let people contribute their code and we can change it >> >> later. >> > >> > I sincerely don't think a lot needs changing... only patch 1. 
Basically >> > patch 1 on top of your previous rework series will be at least what I want, >> > but I'm open to comments from you guys. >> >> Don't get me wrong, I'm very much in favor of what you're doing >> here. However, I don't think it's ok to be backtracking on our design >> while other people have series in flight that depend on it. You >> certainly know the feeling of trying to merge a feature and having >> maintainers ask you to rewrite a bunch code just to be able to start >> working. That's not ideal. > > I as a patch writer always like to do that when it's essential. Normally > the case is I don't have enough reviewer resources to help me get a better > design, or discuss about it. Right, but we can't keep providing a moving target. See the thread pool discussion for an example. It's hard to work that way. The discussion here is similar, we introduced the union, now we're moving to the struct. And you're right that the changes here are small, so let's not get caught in that. > > When vfio is the new user of multifd vfio needs to do the heavy lifting to > draft the api. Well, multifd could have provided a flexible API to begin with. That's entirely on us. I have been toying with allowing more clients since at least 1 year ago. We just couldn't get there in time. > >> >> I tried to quickly insert the RFC series before the device state series >> progressed too much, but it's 3 months later and we're still discussing >> it, maybe we don't need to do it this way. > > Can I get that of your feeling from when you were working on > mapped-ram? At that time I had already committed to helping maintain the code, so the time spent there already went into the maintainer bucket anyway. If I were instead just trying to drive-by, then that would have been a pain. > That series does take long enough, I agree. Not so bad yet with the VFIO > series - it's good to have you around because you provide great reviews. 
> I'm also trying the best to not let a series dangle for more than a year. > I don't think 3 months is long with this feature: this is the 1st multifd > external user (and file mapping is also in another angle), it can take some > time. Oh, I don't mean the VFIO series is taking long. That's a complex feature indeed. I just mean going from p->pages to p->data could have taken less time. I'm suggesting we might have overdone there a bit. > > Sorry if it's so, but sorry again I don't think I get convinced: I think we > need to go this way to build blocks one by one, and we need to make sure > lower blocks are hopefully solid enough to take the upper ones. Again I'm > ok with small things that go against it, but not major designs. We > shouldn't go rewrite major designs if we seem to know a better one. > >> >> And ok, let's consider the current situation a special case. But I would >> like to avoid in the future this kind of uncertainty. >> >> > >> > Note that patch 2-3 will be on top of Maciej's changes and they're totally >> > not relevant to what we merged so far. Hence, nothing relevant there to >> > what you worked. And this is the diff of patch 1: >> > >> > migration/multifd.h | 16 +++++++++++----- >> > migration/multifd-device-state.c | 8 ++++++-- >> > migration/multifd-nocomp.c | 13 ++++++------- >> > migration/multifd.c | 25 ++++++------------------- >> > 4 files changed, 29 insertions(+), 33 deletions(-) >> > >> > It's only 33 lines removed (many of which are comments..), it's not a huge >> > lot. I don't know why you feel so bad at this... >> > >> > It's probably because we maintain migration together, or we can keep our >> > own way of work. I don't think we did anything wrong yet so far. >> > >> > We can definitely talk about this in next 1:1. >> > >> >> >> >> This is also why I've been trying hard to separate core multifd >> >> functionality from migration code that uses multifd to transmit their >> >> data. 
>> >> >> >> My original RFC plus the suggestion to extend multifd_ops for device >> >> state would have (almost) made it so that no client code would be left >> >> in multifd. We could have been turning this thing upside down and it >> >> wouldn't affect anyone in terms of code conflicts. >> > >> > Do you mean you preferred the 2*N approach? >> > >> >> 2*N, where N is usually not larger than 32 and the payload size is >> 1k. Yes, I'd trade that off no problem. > > I think it's a problem. > > Vdpa when involved with exactly the same pattern of how vfio uses it (as > they're really alike underneath) then vdpa will need its own array of > buffers, or it'll need to take the same vfio lock which doesn't make sense > to me. > > N+2, or, N+M (M is the user) is the minimum buffers we need. N because > multifd can be worst case 100% busy on all threads occupying the buffers. > M because M users can be worst case 100% pre-filling. It's either about > memory consumption, or about logical sensibility. I'm aware of the memory consumption. Still, we're not forced to use the minimum amount of space we can. If using more memory can lead to a better design in the medium term, we're allowed to make that choice. Hey, I'm not even saying we *should* have gone with 2N. I think it's good that we're now N+M. But I think we also lost some design flexibility due to that. > >> >> >> >> >> The ship has already sailed, so your patches below are fine, I have just >> >> some small comments. >> > >> > I'm not sure what you meant about "ship sailed", but we should merge code >> > whenever we think is the most correct. >> >> As you put above, I agree that the important bits of the original series >> have been preserved, but other secondary goals were lost, such as the >> more abstract separation between multifd & client code and that is the >> ship that has sailed. 
>> >> That series was not: "introduce this array for no reason", we also lost >> the ability to abstract the payload from the multifd threads when we >> dropped the .alloc_fn callback for instance. The last patch you posted > > I don't remember the details there, but my memory was that it was too > flexible while we seem to reach the consensus that we only process either > RAM or device, nothing else. Indeed. I'm being unfair here, sorry. > >> here now adds multifd_device_state_prepare, somewhat ignoring that the >> ram code also has the same pattern and it could be made to use the same >> API. > > I need some further elaborations to understand. > > multifd_device_state_prepare currently does a few things: taking ownership > of the temp device state object, fill in idstr / instance_id, taking the > lock (so far is needed because we only have one device state object). None > of them seems to be needed for RAM yet. > > Feel free to send a rfc patch if that helps. What if I don't send a patch, wait for it to get merged and then send a refactoring on top so we don't add yet another detour to this conversation? =) > >> >> I did accept your premise that ram+compression is one thing while >> device_state is another, so I'm not asking it to be changed, just >> pointing out that the RFC series also addressed those issues. I might >> not have made that clear back then. >> >> > I hope you meant after below all things look the best, or please shoot. >> > That's exactly what I'm requesting for as comments. >> >> What you have here is certainly an improvement from the current >> state. I'm just ranting about the path we took here.
On Fri, Sep 13, 2024 at 12:04:00PM -0300, Fabiano Rosas wrote: > Peter Xu <peterx@redhat.com> writes: > > > On Fri, Sep 13, 2024 at 10:21:39AM -0300, Fabiano Rosas wrote: > >> Peter Xu <peterx@redhat.com> writes: > >> > >> > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote: > >> >> Peter Xu <peterx@redhat.com> writes: > >> >> > >> >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you > >> >> understand the rework is a little frustrating. > >> > > >> > That's OK. > >> > > >> > [For some reason my email sync program decided to give up working for > >> > hours. I got more time looking at a tsc bug, which is good, but then I > >> > miss a lot of emails..] > >> > > >> >> > >> >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: > >> >> >> > +size_t multifd_device_state_payload_size(void) > >> >> >> > +{ > >> >> >> > + return sizeof(MultiFDDeviceState_t); > >> >> >> > +} > >> >> >> > >> >> >> This will not be necessary because the payload size is the same as the > >> >> >> data type. We only need it for the special case where the MultiFDPages_t > >> >> >> is smaller than the total ram payload size. > >> >> > > >> >> > Today I was thinking maybe we should really clean this up, as the current > >> >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested > >> >> > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr > >> >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made > >> >> > that feeling stronger. > >> >> > >> >> If we're going to commit bad code and then rewrite it a week later, we > >> >> could have just let the original series from Maciej merge without any of > >> >> this. > >> > > >> > Why it's "bad code"? > >> > > >> > It runs pretty well, I don't think it's bad code. You wrote it, and I > >> > don't think it's bad at all. 
> >> > >> Code that forces us to do arithmetic in order to properly allocate > >> memory and comes with a big comment explaining how we're dodging > >> compiler warnings is bad code in my book. > >> > >> > > >> > But now we're rethinking after reading Maciej's new series. > >> >Personally I don't think it's a major problem. > >> > > >> > Note that we're not changing the design back: what was initially proposed > >> > was the client submitting an array of multifd objects. I still don't think > >> > that's right. > >> > > >> > What the change goes so far is we make the union a struct, however that's > >> > still N+2 objects not 2*N, where 2 came from RAM+VFIO. I think the > >> > important bits are still there (from your previous refactor series). > >> > > >> > >> You fail to appreciate that before the RFC series, multifd already > >> allocated N for the pages. > > > > It depends on how you see it, IMHO. I think it allocates N not for the > > "pages" but for the "threads", because the threads can be busy with those > > buffers, no matter if it's "page" or "device data". > > Each MultiFD*Params had a p->pages, so N channels, N p->pages. The > device state series would add p->device_state, one per channel. So 2N + > 2 (for the extra slot). Then it makes sense to have SendData covering pages+device_state. I think it's what we have now, but maybe I missed the point. > > > > >> The device state adds another client, so that > >> would be another N anyway. The problem the RFC tried to solve was that > >> multifd channels owned that 2N, so the array was added to move the > >> memory into the client's ownership. IOW, it wasn't even the RFC series > >> that made it 2N, that was the multifd design all along. Now in hindsight > >> I don't think we should have went with the memory saving discussion. > > > > I think I could have made that feeling that I only wanted to save memory, > > if so, I'm sorry. 
But do you still remember I mentioned "we can make it a > > struct, too" before your series landed? Then you think it's ok to keep > > using union, and I'm ok too! At least at that time. I don't think that's a > > huge deal. I don't think each route we go must be perfect, but we try the > > best to make it as good. > > Yep, I did agree with all of this. I'm just saying I now think I > shouldn't have. > > > > > I don't think any discussion must not happen. I agree memory consumption > > is not the 1st thing to worry, but I don't see why it can't be discussed. > > It can be discussed, sure, but then 3 months pass and we're still > talking about it. Saved ~64k and spent 3 months. We could have just as > well said: "let's do a pass to optimize memory consumption after the > device state series is in". We didn't discuss 3 months on discussing memory consumption only! It's unfair to think it like that. > > > > > Note that I never said we can't save those memory either - I plan to have > > follow up patches (for this, after Maciej's series land.. that's why I even > > didn't yet mention) to allow modules report device state buffer size. I > > just didn't say, yet, and don't plan to worry before vfio series lands. > > When with that, we'll save 1M*N when no vfio device attached (but we'll > > need to handle hotplug). So I think we don't need to lose any finally. > > > > However I think we need to get straight with the base design on how vfio > > should use multifd, because it's important bit and I don't want to rework > > important bits after a huge feature, if we know a better directions. > > > > I don't even think what I proposed patch 1-3 here is a must to me, I should > > be clear again here just in case we have similar discussions > > afterwards.. 
that I'm ok with below be done after Maciej's: > > > > - Avoid memory allocations per-packet (done in patch 2-3) > > - Avoid unnecessary data copy (done in patch 2-3) > > - Avoid allocate device buffers when no device will use (not proposed) > > > > But I'm not ok building everything on top of the idea of not using multifd > > buffers in the current way, because that can involve a lot of changes: > > that's where buffer passes from up to down or backwards, and the interface > > needs change a lot too. We already have that in master so it's not a > > problem now. > > > >> > >> >> I already suggested it a couple of times, we shouldn't be doing > >> >> core refactorings underneath contributors' patches, this is too > >> >> fragile. Just let people contribute their code and we can change it > >> >> later. > >> > > >> > I sincerely don't think a lot needs changing... only patch 1. Basically > >> > patch 1 on top of your previous rework series will be at least what I want, > >> > but I'm open to comments from you guys. > >> > >> Don't get me wrong, I'm very much in favor of what you're doing > >> here. However, I don't think it's ok to be backtracking on our design > >> while other people have series in flight that depend on it. You > >> certainly know the feeling of trying to merge a feature and having > >> maintainers ask you to rewrite a bunch code just to be able to start > >> working. That's not ideal. > > > > I as a patch writer always like to do that when it's essential. Normally > > the case is I don't have enough reviewer resources to help me get a better > > design, or discuss about it. > > Right, but we can't keep providing a moving target. See the thread pool > discussion for an example. It's hard to work that way. The discussion > here is similar, we introduced the union, now we're moving to the > struct. And you're right that the changes here are small, so let's not > get caught in that. What's your suggestion on the thread pool? 
Should we merge the change where vfio creates the threads on its own (assuming vfio maintainers are ok with it)? I would say no, that's what I suggested. I'd start with reusing ThreadPool, then we found issue when Stefan reported worry on abusing the API. All these discussions seem sensible to me so far. We can't rush on these until we figure things out step by step. I don't see a way. I saw Cedric suggesting to not even create a thread on recv side. I am not sure whether that's easy, but I'd agree with Cedric if possible. I think Maciej could have a point where it can block multifd threads, aka, IO threads, which might be unwanted. That said, I still think device (even if threads needed) should not randomly create threads during migration. It'll be a nightmare. > > > > When vfio is the new user of multifd vfio needs to do the heavy lifting to > > draft the api. > > Well, multifd could have provided a flexible API to begin with. That's > entirely on us. I have been toying with allowing more clients since at > least 1 year ago. We just couldn't get there in time. > > > > >> > >> I tried to quickly insert the RFC series before the device state series > >> progressed too much, but it's 3 months later and we're still discussing > >> it, maybe we don't need to do it this way. > > > > Can I get that of your feeling from when you were working on > > mapped-ram? > > At that time I had already committed to helping maintain the code, so > the time spent there already went into the maintainer bucket anyway. If > I were instead just trying to drive-by, then that would have been a > pain. I don't think you becoming a maintainer changed how I would review mapped-ram series. OTOH, "I became a maintainer" could, because I know I am more responsible to a chunk of code until I leave (and please let me know any time when you think you're ready to take migration on your own). That's a real difference to me. > > > That series does take long enough, I agree.
Not so bad yet with the VFIO > > series - it's good to have you around because you provide great reviews. > > I'm also trying the best to not let a series dangle for more than a year. > > I don't think 3 months is long with this feature: this is the 1st multifd > > external user (and file mapping is also in another angle), it can take some > > time. > > Oh, I don't mean the VFIO series is taking long. That's a complex > feature indeed. I just mean going from p->pages to p->data could have > taken less time. I'm suggesting we might have overdone there a bit. > > > > Sorry if it's so, but sorry again I don't think I get convinced: I think we > > need to go this way to build blocks one by one, and we need to make sure > > lower blocks are hopefully solid enough to take the upper ones. Again I'm > > ok with small things that go against it, but not major designs. We > > shouldn't go rewrite major designs if we seem to know a better one. > > > >> > >> And ok, let's consider the current situation a special case. But I would > >> like to avoid in the future this kind of uncertainty. > >> > >> > > >> > Note that patch 2-3 will be on top of Maciej's changes and they're totally > >> > not relevant to what we merged so far. Hence, nothing relevant there to > >> > what you worked.
> >> > > >> >> > >> >> This is also why I've been trying hard to separate core multifd > >> >> functionality from migration code that uses multifd to transmit their > >> >> data. > >> >> > >> >> My original RFC plus the suggestion to extend multifd_ops for device > >> >> state would have (almost) made it so that no client code would be left > >> >> in multifd. We could have been turning this thing upside down and it > >> >> wouldn't affect anyone in terms of code conflicts. > >> > > >> > Do you mean you preferred the 2*N approach? > >> > > >> > >> 2*N, where N is usually not larger than 32 and the payload size is > >> 1k. Yes, I'd trade that off no problem. > > > > I think it's a problem. > > > > Vdpa when involved with exactly the same pattern of how vfio uses it (as > > they're really alike underneath) then vdpa will need its own array of > > buffers, or it'll need to take the same vfio lock which doesn't make sense > > to me. > > > > N+2, or, N+M (M is the user) is the minimum buffers we need. N because > > multifd can be worst case 100% busy on all threads occupying the buffers. > > M because M users can be worst case 100% pre-filling. It's either about > > memory consumption, or about logical sensibility. > > I'm aware of the memory consumption. Still, we're not forced to use the > minimum amount of space we can. If using more memory can lead to a > better design in the medium term, we're allowed to make that choice. > > Hey, I'm not even saying we *should* have gone with 2N. I think it's > good that we're now N+M. But I think we also lost some design > flexibility due to that. > > > > >> > >> >> > >> >> The ship has already sailed, so your patches below are fine, I have just > >> >> some small comments. > >> > > >> > I'm not sure what you meant about "ship sailed", but we should merge code > >> > whenever we think is the most correct. 
> >> > >> As you put above, I agree that the important bits of the original series > >> have been preserved, but other secondary goals were lost, such as the > >> more abstract separation between multifd & client code and that is the > >> ship that has sailed. > >> > >> That series was not: "introduce this array for no reason", we also lost > >> the ability to abstract the payload from the multifd threads when we > >> dropped the .alloc_fn callback for instance. The last patch you posted > > > > I don't remember the details there, but my memory was that it was too > > flexible while we seem to reach the consensus that we only process either > > RAM or device, nothing else. > > Indeed. I'm being unfair here, sorry. > > > > >> here now adds multifd_device_state_prepare, somewhat ignoring that the > >> ram code also has the same pattern and it could be made to use the same > >> API. > > > > I need some further elaborations to understand. > > > > multifd_device_state_prepare currently does a few things: taking ownership > > of the temp device state object, fill in idstr / instance_id, taking the > > lock (so far is needed because we only have one device state object). None > > of them seems to be needed for RAM yet. > > > > Feel free to send a rfc patch if that helps. > > What if I don't send a patch, wait for it to get merged and then send a > refactoring on top so we don't add yet another detour to this > conversation? =) I thought it shouldn't conflict much if with ram only, and I used to mean that can be a "comment in the form of patch". But yeah, sure thing.
Peter Xu <peterx@redhat.com> writes: > On Fri, Sep 13, 2024 at 12:04:00PM -0300, Fabiano Rosas wrote: >> Peter Xu <peterx@redhat.com> writes: >> >> > On Fri, Sep 13, 2024 at 10:21:39AM -0300, Fabiano Rosas wrote: >> >> Peter Xu <peterx@redhat.com> writes: >> >> >> >> > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote: >> >> >> Peter Xu <peterx@redhat.com> writes: >> >> >> >> >> >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you >> >> >> understand the rework is a little frustrating. >> >> > >> >> > That's OK. >> >> > >> >> > [For some reason my email sync program decided to give up working for >> >> > hours. I got more time looking at a tsc bug, which is good, but then I >> >> > miss a lot of emails..] >> >> > >> >> >> >> >> >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: >> >> >> >> > +size_t multifd_device_state_payload_size(void) >> >> >> >> > +{ >> >> >> >> > + return sizeof(MultiFDDeviceState_t); >> >> >> >> > +} >> >> >> >> >> >> >> >> This will not be necessary because the payload size is the same as the >> >> >> >> data type. We only need it for the special case where the MultiFDPages_t >> >> >> >> is smaller than the total ram payload size. >> >> >> > >> >> >> > Today I was thinking maybe we should really clean this up, as the current >> >> >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested >> >> >> > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr >> >> >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made >> >> >> > that feeling stronger. >> >> >> >> >> >> If we're going to commit bad code and then rewrite it a week later, we >> >> >> could have just let the original series from Maciej merge without any of >> >> >> this. >> >> > >> >> > Why it's "bad code"? >> >> > >> >> > It runs pretty well, I don't think it's bad code. You wrote it, and I >> >> > don't think it's bad at all. 
>> >> >> >> Code that forces us to do arithmetic in order to properly allocate >> >> memory and comes with a big comment explaining how we're dodging >> >> compiler warnings is bad code in my book. >> >> >> >> > >> >> > But now we're rethinking after reading Maciej's new series. >> >> >Personally I don't think it's a major problem. >> >> > >> >> > Note that we're not changing the design back: what was initially proposed >> >> > was the client submitting an array of multifd objects. I still don't think >> >> > that's right. >> >> > >> >> > What the change goes so far is we make the union a struct, however that's >> >> > still N+2 objects not 2*N, where 2 came from RAM+VFIO. I think the >> >> > important bits are still there (from your previous refactor series). >> >> > >> >> >> >> You fail to appreciate that before the RFC series, multifd already >> >> allocated N for the pages. >> > >> > It depends on how you see it, IMHO. I think it allocates N not for the >> > "pages" but for the "threads", because the threads can be busy with those >> > buffers, no matter if it's "page" or "device data". >> >> Each MultiFD*Params had a p->pages, so N channels, N p->pages. The >> device state series would add p->device_state, one per channel. So 2N + >> 2 (for the extra slot). > > Then it makes sense to have SendData covering pages+device_state. I think > it's what we have now, but maybe I missed the point. I misunderstood you. You're saying that you see the N as per-thread instead of per-client-per-thread. That's one perspective indeed. It was not what the device state series had going for it, so I still think that this could have been a separate discussion independent of the p->pages -> p->data change. But let's not drag this argument on, it's been discussed and the code has been merged. > >> >> > >> >> The device state adds another client, so that >> >> would be another N anyway. 
The problem the RFC tried to solve was that >> >> multifd channels owned that 2N, so the array was added to move the >> >> memory into the client's ownership. IOW, it wasn't even the RFC series >> >> that made it 2N, that was the multifd design all along. Now in hindsight >> >> I don't think we should have went with the memory saving discussion. >> > >> > I think I could have made that feeling that I only wanted to save memory, >> > if so, I'm sorry. But do you still remember I mentioned "we can make it a >> > struct, too" before your series landed? Then you think it's ok to keep >> > using union, and I'm ok too! At least at that time. I don't think that's a >> > huge deal. I don't think each route we go must be perfect, but we try the >> > best to make it as good. >> >> Yep, I did agree with all of this. I'm just saying I now think I >> shouldn't have. >> >> > >> > I don't think any discussion must not happen. I agree memory consumption >> > is not the 1st thing to worry, but I don't see why it can't be discussed. >> >> It can be discussed, sure, but then 3 months pass and we're still >> talking about it. Saved ~64k and spent 3 months. We could have just as >> well said: "let's do a pass to optimize memory consumption after the >> device state series is in". > > We didn't discuss 3 months on discussing memory consumption only! It's > unfair to think it like that. Ok, you're right. > >> >> > >> > Note that I never said we can't save those memory either - I plan to have >> > follow up patches (for this, after Maciej's series land.. that's why I even >> > didn't yet mention) to allow modules report device state buffer size. I >> > just didn't say, yet, and don't plan to worry before vfio series lands. >> > When with that, we'll save 1M*N when no vfio device attached (but we'll >> > need to handle hotplug). So I think we don't need to lose any finally. 
>> > >> > However I think we need to get straight with the base design on how vfio >> > should use multifd, because it's important bit and I don't want to rework >> > important bits after a huge feature, if we know a better directions. >> > >> > I don't even think what I proposed patch 1-3 here is a must to me, I should >> > be clear again here just in case we have similar discussions >> > afterwards.. that I'm ok with below be done after Maciej's: >> > >> > - Avoid memory allocations per-packet (done in patch 2-3) >> > - Avoid unnecessary data copy (done in patch 2-3) >> > - Avoid allocate device buffers when no device will use (not proposed) >> > >> > But I'm not ok building everything on top of the idea of not using multifd >> > buffers in the current way, because that can involve a lot of changes: >> > that's where buffer passes from up to down or backwards, and the interface >> > needs change a lot too. We already have that in master so it's not a >> > problem now. >> > >> >> >> >> >> I already suggested it a couple of times, we shouldn't be doing >> >> >> core refactorings underneath contributors' patches, this is too >> >> >> fragile. Just let people contribute their code and we can change it >> >> >> later. >> >> > >> >> > I sincerely don't think a lot needs changing... only patch 1. Basically >> >> > patch 1 on top of your previous rework series will be at least what I want, >> >> > but I'm open to comments from you guys. >> >> >> >> Don't get me wrong, I'm very much in favor of what you're doing >> >> here. However, I don't think it's ok to be backtracking on our design >> >> while other people have series in flight that depend on it. You >> >> certainly know the feeling of trying to merge a feature and having >> >> maintainers ask you to rewrite a bunch code just to be able to start >> >> working. That's not ideal. >> > >> > I as a patch writer always like to do that when it's essential. 
Normally >> > the case is I don't have enough reviewer resources to help me get a better >> > design, or discuss about it. >> >> Right, but we can't keep providing a moving target. See the thread pool >> discussion for an example. It's hard to work that way. The discussion >> here is similar, we introduced the union, now we're moving to the >> struct. And you're right that the changes here are small, so let's not >> get caught in that. > > What's your suggestion on the thread pool? Should we merge the change > where vfio creates the threads on its own (assuming vfio maintainers are ok > with it)? This is not a simple answer and I'm not exactly sure where to draw the line, but in this case I'm inclined to say: yes. > > I would say no, that's what I suggested. I'd start with reusing > ThreadPool, then we found issue when Stefan reported worry on abusing the > API. All these discussions seem sensible to me so far. We can't rush on > these until we figure things out step by step. I don't see a way. The problem is that using a thread pool is something that we've agreed on for a while and is even in the migration TODO list. It's not something that came up as a result of the device state series. I know this is not anyone's intent, but it starts to feel like gatekeeping. The fact that migration lacks a thread pool, that multifd threads have historically caused issues and that what's in util/thread-pool.c is only useful to the block layer are all preexisting problems of this code base. We could be (and are) working to improve those regardless of what new features are being contributed. I'm not sure it's productive to pick problems we have had for a while and present those as prerequisites for merging new code. But as I said, there's a line to be drawn somewhere and I don't know exactly where it lies. I understand the points that were brought up in favor of first figuring out the thread pool situation, those are not at all unreasonable. 
> > I saw Cedric suggesting to not even create a thread on recv side. I am not > sure whether that's easy, but I'd agree with Cedric if possible. I think > Maciej could have a point where it can block multifd threads, aka, IO > threads, which might be unwanted. > > That said, I still think device (even if threads needed) should not > randomly create threads during migration. It'll be a nightmare. The thread-pool approach is being looked at to solve this particular problem, but we have also discussed in the past that multifd threads themselves should be managed by a thread pool. Will we add this requirement to what is being done now? Otherwise, don't we risk having an implementation that doesn't serve the rest of multifd? Do we even know what the requirements are? Keep in mind that we're already not modifying the existing ThreadPool, but planning to write a new one. > >> >> > >> > When vfio is the new user of multifd vfio needs to do the heavy lifting to >> > draft the api. >> >> Well, multifd could have provided a flexible API to begin with. That's >> entirely on us. I have been toying with allowing more clients since at >> least 1 year ago. We just couldn't get there in time. >> >> > >> >> >> >> I tried to quickly insert the RFC series before the device state series >> >> progressed too much, but it's 3 months later and we're still discussing >> >> it, maybe we don't need to do it this way. >> > >> > Can I get that of your feeling from when you were working on >> > mapped-ram? >> >> At that time I had already committed to helping maintain the code, so >> the time spent there already went into the maintainer bucket anyway. If >> I were instead just trying to drive-by, then that would have been a >> pain. > > I don't think you becoming a maintainer changed how I would review mapped-ram > series.
> > OTOH, "I became a maintainer" could, because I know I am more responsible > to a chunk of code until I leave (and please let me know any time when you > think you're ready to take migration on your own). That's a real > difference to me. Right, that's my point. I don't mind that what I'm doing takes time. I'm not going anywhere. I do mind that because we're not going anywhere, we start to drag people into a constant state of improving the next little thing. Again, our definition of what constitutes "a little thing" is of course different. > >> >> > That series does take long enough, I agree. Not so bad yet with the VFIO >> > series - it's good to have you around because you provide great reviews. >> > I'm also trying the best to not let a series dangle for more than a year. >> > I don't think 3 months is long with this feature: this is the 1st multifd >> > extrenal user (and file mapping is also in another angle), it can take some >> > time. >> >> Oh, I don't mean the VFIO series is taking long. That's a complex >> feature indeed. I just mean going from p->pages to p->data could have >> taken less time. I'm suggesting we might have overdone there a bit. >> >> > >> > Sorry if it's so, but sorry again I don't think I get convinced: I think we >> > need to go this way to build blocks one by one, and we need to make sure >> > lower blocks are hopefully solid enough to take the upper ones. Again I'm >> > ok with small things that go against it, but not major designs. We >> > shouldn't go rewrite major designs if we seem to know a better one. >> > >> >> >> >> And ok, let's consider the current situation a special case. But I would >> >> like to avoid in the future this kind of uncertainty. >> >> >> >> > >> >> > Note that patch 2-3 will be on top of Maciej's changes and they're totally >> >> > not relevant to what we merged so far. Hence, nothing relevant there to >> >> > what you worked. 
And this is the diff of patch 1: >> >> > >> >> > migration/multifd.h | 16 +++++++++++----- >> >> > migration/multifd-device-state.c | 8 ++++++-- >> >> > migration/multifd-nocomp.c | 13 ++++++------- >> >> > migration/multifd.c | 25 ++++++------------------- >> >> > 4 files changed, 29 insertions(+), 33 deletions(-) >> >> > >> >> > It's only 33 lines removed (many of which are comments..), it's not a huge >> >> > lot. I don't know why you feel so bad at this... >> >> > >> >> > It's probably because we maintain migration together, or we can keep our >> >> > own way of work. I don't think we did anything wrong yet so far. >> >> > >> >> > We can definitely talk about this in next 1:1. >> >> > >> >> >> >> >> >> This is also why I've been trying hard to separate core multifd >> >> >> functionality from migration code that uses multifd to transmit their >> >> >> data. >> >> >> >> >> >> My original RFC plus the suggestion to extend multifd_ops for device >> >> >> state would have (almost) made it so that no client code would be left >> >> >> in multifd. We could have been turning this thing upside down and it >> >> >> wouldn't affect anyone in terms of code conflicts. >> >> > >> >> > Do you mean you preferred the 2*N approach? >> >> > >> >> >> >> 2*N, where N is usually not larger than 32 and the payload size is >> >> 1k. Yes, I'd trade that off no problem. >> > >> > I think it's a problem. >> > >> > Vdpa when involved with exactly the same pattern of how vfio uses it (as >> > they're really alike underneath) then vdpa will need its own array of >> > buffers, or it'll need to take the same vfio lock which doesn't make sense >> > to me. >> > >> > N+2, or, N+M (M is the user) is the minimum buffers we need. N because >> > multifd can be worst case 100% busy on all threads occupying the buffers. >> > M because M users can be worst case 100% pre-filling. It's either about >> > memory consumption, or about logical sensibility. >> >> I'm aware of the memory consumption. 
Still, we're not forced to use the >> minimum amount of space we can. If using more memory can lead to a >> better design in the medium term, we're allowed to make that choice. >> >> Hey, I'm not even saying we *should* have gone with 2N. I think it's >> good that we're now N+M. But I think we also lost some design >> flexibility due to that. >> >> > >> >> >> >> >> >> >> >> The ship has already sailed, so your patches below are fine, I have just >> >> >> some small comments. >> >> > >> >> > I'm not sure what you meant about "ship sailed", but we should merge code >> >> > whenever we think is the most correct. >> >> >> >> As you put above, I agree that the important bits of the original series >> >> have been preserved, but other secondary goals were lost, such as the >> >> more abstract separation between multifd & client code and that is the >> >> ship that has sailed. >> >> >> >> That series was not: "introduce this array for no reason", we also lost >> >> the ability to abstract the payload from the multifd threads when we >> >> dropped the .alloc_fn callback for instance. The last patch you posted >> > >> > I don't remember the details there, but my memory was that it was too >> > flexible while we seem to reach the consensus that we only process either >> > RAM or device, nothing else. >> >> Indeed. I'm being unfair here, sorry. >> >> > >> >> here now adds multifd_device_state_prepare, somewhat ignoring that the >> >> ram code also has the same pattern and it could be made to use the same >> >> API. >> > >> > I need some further elaborations to understand. >> > >> > multifd_device_state_prepare currently does a few things: taking ownership >> > of the temp device state object, fill in idstr / instance_id, taking the >> > lock (so far is needed because we only have one device state object). None >> > of them seems to be needed for RAM yet. >> > >> > Feel free to send a rfc patch if that helps. 
>> >> What if I don't send a patch, wait for it to get merged and then send a >> refactoring on top so we don't add yet another detour to this >> conversation? =) > > I thought it shouldn't conflict much if with ram only, and I used to mean > that can be a "comment in the form of patch". But yeah, sure thing.
On Fri, Sep 13, 2024 at 03:26:43PM -0300, Fabiano Rosas wrote: > The thread-pool approach is being looked at to solve this particular > problem, but we have also discussed in the past that multifd threads > themselves should be managed by a thread pool. Will we add this > requirement to what is being done now? Otherwise, don't we risk having > an implementation that doesn't serve the rest of multifd? Do we even > know what the requirements are? Keep in mind that we're already not > modifying the existing ThreadPool, but planning to write a new one. Multifd currently has the following specialties: - Multifd thread has 1:1 mapping with iochannels - Multifd thread num should be relevant to target bandwidth (e.g., the NIC performance) While for a generic thread pool: - Thread has no correlation to any iochannel, but some async cpu intensive workloads during migration (either during switchover, or maybe even before that?) - Thread number should have no correlation to NIC/bandwidth, a sane start could be $(nproc), but maybe not.. I don't remember what I was thinking previously, but now it sounds ok to keep multifd separate as of now to me, because multifd serves a slightly different purpose (maximum network throughput) vs. where we want a pool of threads doing async tasks (which can be anything).
[ ... ] >>> I as a patch writer always like to do that when it's essential. Normally >>> the case is I don't have enough reviewer resources to help me get a better >>> design, or discuss about it. >> >> Right, but we can't keep providing a moving target. See the thread pool >> discussion for an example. It's hard to work that way. The discussion >> here is similar, we introduced the union, now we're moving to the >> struct. And you're right that the changes here are small, so let's not >> get caught in that. > > What's your suggestion on the thread pool? Should we merge the change > where vfio creates the threads on its own (assuming vfio maintainers are ok > with it)? > > I would say no, that's what I suggested. I'd start with reusing > ThreadPool, then we found issue when Stefan reported worry on abusing the > API. All these discussions seem sensible to me so far. We can't rush on > these until we figure things out step by step. I don't see a way. > > I saw Cedric suggesting to not even create a thread on recv side. I am not > sure whether that's easy, but I'd agree with Cedric if possible. I think > Maciej could have a point where it can block mutlifd threads, aka, IO > threads, which might be unwanted. Sorry, If I am adding noise on this topic. I made this suggestion because I spotted some asymmetry in the proposal. The send and recv implementation in VFIO relies on different interfaces with different level of complexity. The send part is using a set of multifd callbacks called from multifd threads, if I am correct. Whereas the recv part is directly implemented in VFIO with local thread(s?) doing their own state receive cookery. I was expecting a common interface to minimize assumptions on both ends. It doesn't have to be callback based. It could be a set of services a subsystem could use to transfer state in parallel. <side note> VFIO migration is driven by numerous callbacks and it is difficult to understand the context in which these are called. 
Adding more callbacks might not be the best approach. </side note> The other comment was on optimisation. If this is an optimisation then I would expect, first, a non-optimized version not using threads (on the recv side). VFIO Migration is a "new" feature which needs some more run-in. That said, it is stable, MLX5 VFs devices have good support, you can rely on me to evaluate the future respins. Thanks, C.
On Tue, Sep 17, 2024 at 07:07:10PM +0200, Cédric Le Goater wrote: > [ ... ] > > > > > I as a patch writer always like to do that when it's essential. Normally > > > > the case is I don't have enough reviewer resources to help me get a better > > > > design, or discuss about it. > > > > > > Right, but we can't keep providing a moving target. See the thread pool > > > discussion for an example. It's hard to work that way. The discussion > > > here is similar, we introduced the union, now we're moving to the > > > struct. And you're right that the changes here are small, so let's not > > > get caught in that. > > > > What's your suggestion on the thread pool? Should we merge the change > > where vfio creates the threads on its own (assuming vfio maintainers are ok > > with it)? > > > > I would say no, that's what I suggested. I'd start with reusing > > ThreadPool, then we found issue when Stefan reported worry on abusing the > > API. All these discussions seem sensible to me so far. We can't rush on > > these until we figure things out step by step. I don't see a way. > > > > I saw Cedric suggesting to not even create a thread on recv side. I am not > > sure whether that's easy, but I'd agree with Cedric if possible. I think > > Maciej could have a point where it can block mutlifd threads, aka, IO > > threads, which might be unwanted. > > Sorry, If I am adding noise on this topic. I made this suggestion > because I spotted some asymmetry in the proposal. > > The send and recv implementation in VFIO relies on different > interfaces with different level of complexity. The send part is > using a set of multifd callbacks called from multifd threads, > if I am correct. Whereas the recv part is directly implemented > in VFIO with local thread(s?) doing their own state receive cookery. 
Yeh, the send/recv sides are indeed not fully symmetrical in the case of multifd - the recv side is more IO-driven, e.g., QEMU reacts based on what it receives (which was encoded in the headers of the received packets). The src is more of a generic consumer / producer model where threads can enqueue tasks / data to different iochannels. > > I was expecting a common interface to minimize assumptions on both > ends. It doesn't have to be callback based. It could be a set of > services a subsystem could use to transfer state in parallel. > <side note> > VFIO migration is driven by numerous callbacks and it is > difficult to understand the context in which these are called. > Adding more callbacks might not be the best approach. > </side note> > > The other comment was on optimisation. If this is an optimisation > then I would expect, first, a non-optimized version not using threads > (on the recv side). As commented in a previous email, I had a feeling that Maciej wanted to avoid blocking multifd threads when applying VFIO data chunks to the kernel driver, but Maciej please correct me.. I could be wrong. To me I think I'm fine even if it blocks multifd threads, as it'll only happen when with VFIO (we may want to consider n_multifd_threads to be based on num of vfio devices then, so we still always have some idle threads taking IOs out of the NIC buffers). So I agree with Cedric that if we can provide a functional working version first then we can at least go with the simpler approach first. > > VFIO Migration is a "new" feature which needs some more run-in. > That said, it is stable, MLX5 VFs devices have good support, you > can rely on me to evaluate the future respins. > > Thanks, > > C. >
On 10.09.2024 21:48, Peter Xu wrote: > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: >>> +size_t multifd_device_state_payload_size(void) >>> +{ >>> + return sizeof(MultiFDDeviceState_t); >>> +} >> >> This will not be necessary because the payload size is the same as the >> data type. We only need it for the special case where the MultiFDPages_t >> is smaller than the total ram payload size. > > Today I was thinking maybe we should really clean this up, as the current > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made > that feeling stronger. > > I think we should change it now perhaps, otherwise we'll need to introduce > other helpers to e.g. reset the device buffers, and that's not only slow > but also not good looking, IMO. > > So I went ahead with the idea in previous discussion, that I managed to > change the SendData union into struct; the memory consumption is not super > important yet, IMHO, but we should still stick with the object model where > multifd enqueue thread switch buffer with multifd, as it still sounds a > sane way to do. > > Then when that patch is ready, I further tried to make VFIO reuse multifd > buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we > don't allocate it every time we enqueue. > > I hope it'll also work for VFIO. VFIO has a specialty on being able to > dump the config space so it's more complex (and I noticed Maciej's current > design requires the final chunk of VFIO config data be migrated in one > packet.. that is also part of the complexity there). So I allowed that > part to allocate a buffer but only that. IOW, I made some API (see below) > that can either reuse preallocated buffer, or use a separate one only for > the final bulk. > > In short, could both of you have a look at what I came up with below? 
I > did that in patches because I think it's too much to comment, so patches > may work better. No concern if any of below could be good changes to you, > then either Maciej can squash whatever into existing patches (and I feel > like some existing patches in this series can go away with below design), > or I can post pre-requisite patch but only if any of you prefer that. > > Anyway, let me know, the patches apply on top of this whole series applied > first. > > I also wonder whether there can be any perf difference already (I tested > all multifd qtest with below, but no VFIO I can run), perhaps not that > much, but just to mention below should avoid both buffer allocations and > one round of copy (so VFIO read() directly writes to the multifd buffers > now). I am not against making MultiFDSendData a struct and maybe introducing some pre-allocated buffer. But to be honest, that manual memory management with having to remember to call multifd_device_state_finish() on error paths as in your proposed patch 3 really invites memory leaks. Will think about some other way to have a reusable buffer. In terms of not making idstr copy (your proposed patch 2) I am not 100% sure that avoiding such tiny allocation really justifies the risk of possible use-after-free of a dangling pointer. Not 100% against it either if you are confident that it will never happen. By the way, I guess it makes sense to carry these changes in the main patch set rather than as a separate changes? > Thanks, Thanks, Maciej
On 10.09.2024 18:06, Peter Xu wrote: > On Tue, Aug 27, 2024 at 07:54:31PM +0200, Maciej S. Szmigiero wrote: >> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, >> + char *data, size_t len) >> +{ >> + /* Device state submissions can come from multiple threads */ >> + QEMU_LOCK_GUARD(&queue_job_mutex); > > Ah, just notice there's the mutex. > > So please consider the reply in the other thread, IIUC we can make it for > multifd_send() to be a generic mutex to simplify the other patch too, then > drop here. > > I assume the ram code should be fine taking one more mutex even without > vfio, if it only takes once for each ~128 pages to enqueue, and only take > in the main thread, then each update should be also in the hot path > (e.g. no cache bouncing). > Will check whether it is possible to use a common mutex here for both RAM and device state submission without drop in performance. Thanks, Maciej
On 9.09.2024 21:40, Peter Xu wrote: > On Fri, Aug 30, 2024 at 10:02:40AM -0300, Fabiano Rosas wrote: >>>>> @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data) >>>>> >>>>> p = &multifd_send_state->params[i]; >>>>> /* >>>>> - * Lockless read to p->pending_job is safe, because only multifd >>>>> - * sender thread can clear it. >>>>> + * Lockless RMW on p->pending_job_preparing is safe, because only multifd >>>>> + * sender thread can clear it after it had seen p->pending_job being set. >>>>> + * >>>>> + * Pairs with qatomic_store_release() in multifd_send_thread(). >>>>> */ >>>>> - if (qatomic_read(&p->pending_job) == false) { >>>>> + if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) { >>>> >>>> What's the motivation for this change? It would be better to have it in >>>> a separate patch with a proper justification. >>> >>> The original RFC patch set used dedicated device state multifd channels. >>> >>> Peter and other people wanted this functionality removed, however this caused >>> a performance (downtime) regression. >>> >>> One of the things that seemed to help mitigate this regression was making >>> the multifd channel selection more fair via this change. >>> >>> But I can split out it to a separate commit in the next patch set version and >>> then see what performance improvement it currently brings. >> >> Yes, better to have it separate if anything for documentation of the >> rationale. > > And when drafting that patch, please add a comment explaining the field. > Currently it's missing: > > /* > * The sender thread has work to do if either of below boolean is set. > * > * @pending_job: a job is pending > * @pending_sync: a sync request is pending > * > * For both of these fields, they're only set by the requesters, and > * cleared by the multifd sender threads. 
> */ > bool pending_job; > bool pending_job_preparing; > bool pending_sync; > Will do if these variables end up staying in the patch (instead of being replaced by the common send mutex, for example). Thanks, Maciej
On 17.09.2024 19:50, Peter Xu wrote: > On Tue, Sep 17, 2024 at 07:07:10PM +0200, Cédric Le Goater wrote: >> [ ... ] >> >>>>> I as a patch writer always like to do that when it's essential. Normally >>>>> the case is I don't have enough reviewer resources to help me get a better >>>>> design, or discuss about it. >>>> >>>> Right, but we can't keep providing a moving target. See the thread pool >>>> discussion for an example. It's hard to work that way. The discussion >>>> here is similar, we introduced the union, now we're moving to the >>>> struct. And you're right that the changes here are small, so let's not >>>> get caught in that. >>> >>> What's your suggestion on the thread pool? Should we merge the change >>> where vfio creates the threads on its own (assuming vfio maintainers are ok >>> with it)? >>> >>> I would say no, that's what I suggested. I'd start with reusing >>> ThreadPool, then we found issue when Stefan reported worry on abusing the >>> API. All these discussions seem sensible to me so far. We can't rush on >>> these until we figure things out step by step. I don't see a way. >>> >>> I saw Cedric suggesting to not even create a thread on recv side. I am not >>> sure whether that's easy, but I'd agree with Cedric if possible. I think >>> Maciej could have a point where it can block mutlifd threads, aka, IO >>> threads, which might be unwanted. >> >> Sorry, If I am adding noise on this topic. I made this suggestion >> because I spotted some asymmetry in the proposal. >> >> The send and recv implementation in VFIO relies on different >> interfaces with different level of complexity. The send part is >> using a set of multifd callbacks called from multifd threads, >> if I am correct. Whereas the recv part is directly implemented >> in VFIO with local thread(s?) doing their own state receive cookery. 
> > Yeh, the send/recv sides are indeed not fully symmetrical in the case of > multifd - the recv side is more IO-driven, e.g., QEMU reacts based on what > it receives (which was encoded in the headers of the received packets). > > The src is more of a generic consumer / producer model where threads can > enqueue tasks / data to different iochannels. Currently, the best case happens if both sides are I/O bound with respect to the VFIO devices - reading device state from the source device as fast as it can produce it and loading device state into the target device as fast as it can consume it. These devices aren't normally seriously network bandwidth constrained here. >> >> I was expecting a common interface to minimize assumptions on both >> ends. It doesn't have to be callback based. It could be a set of >> services a subsystem could use to transfer state in parallel. >> <side note> >> VFIO migration is driven by numerous callbacks and it is >> difficult to understand the context in which these are called. >> Adding more callbacks might not be the best approach. >> </side note> >> >> The other comment was on optimisation. If this is an optimisation >> then I would expect, first, a non-optimized version not using threads >> (on the recv side). > > As commented in a previous email, I had a feeling that Maciej wanted to > avoid blocking multifd threads when applying VFIO data chunks to the kernel > driver, but Maciej please correct me.. I could be wrong. Yes, we don't want the case that loading device state into one VFIO device blocks loading such state into another VFIO device and so the second VFIO device ends up being partially idle during that time. > To me I think I'm fine even if it blocks multifd threads, as it'll only > happen when with VFIO (we may want to consider n_multifd_threads to be > based on num of vfio devices then, so we still always have some idle > threads taking IOs out of the NIC buffers). 
The current design uses exactly one loading thread per VFIO device. > So I agree with Cedric that if we can provide a functional working version > first then we can at least go with the simpler approach first. > >> >> VFIO Migration is a "new" feature which needs some more run-in. >> That said, it is stable, MLX5 VFs devices have good support, you >> can rely on me to evaluate the future respins. >> >> Thanks, >> >> C. >> > Thanks, Maciej
On Thu, Sep 19, 2024 at 09:49:43PM +0200, Maciej S. Szmigiero wrote: > On 10.09.2024 21:48, Peter Xu wrote: > > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: > > > > +size_t multifd_device_state_payload_size(void) > > > > +{ > > > > + return sizeof(MultiFDDeviceState_t); > > > > +} > > > > > > This will not be necessary because the payload size is the same as the > > > data type. We only need it for the special case where the MultiFDPages_t > > > is smaller than the total ram payload size. > > > > Today I was thinking maybe we should really clean this up, as the current > > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested > > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr > > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made > > that feeling stronger. > > > > I think we should change it now perhaps, otherwise we'll need to introduce > > other helpers to e.g. reset the device buffers, and that's not only slow > > but also not good looking, IMO. > > > > So I went ahead with the idea in previous discussion, that I managed to > > change the SendData union into struct; the memory consumption is not super > > important yet, IMHO, but we should still stick with the object model where > > multifd enqueue thread switch buffer with multifd, as it still sounds a > > sane way to do. > > > > Then when that patch is ready, I further tried to make VFIO reuse multifd > > buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we > > don't allocate it every time we enqueue. > > > > I hope it'll also work for VFIO. VFIO has a specialty on being able to > > dump the config space so it's more complex (and I noticed Maciej's current > > design requires the final chunk of VFIO config data be migrated in one > > packet.. that is also part of the complexity there). So I allowed that > > part to allocate a buffer but only that. 
IOW, I made some API (see below) > > that can either reuse preallocated buffer, or use a separate one only for > > the final bulk. > > > > In short, could both of you have a look at what I came up with below? I > > did that in patches because I think it's too much to comment, so patches > > may work better. No concern if any of below could be good changes to you, > > then either Maciej can squash whatever into existing patches (and I feel > > like some existing patches in this series can go away with below design), > > or I can post pre-requisite patch but only if any of you prefer that. > > > > Anyway, let me know, the patches apply on top of this whole series applied > > first. > > > > I also wonder whether there can be any perf difference already (I tested > > all multifd qtest with below, but no VFIO I can run), perhaps not that > > much, but just to mention below should avoid both buffer allocations and > > one round of copy (so VFIO read() directly writes to the multifd buffers > > now). > > I am not against making MultiFDSendData a struct and maybe introducing > some pre-allocated buffer. > > But to be honest, that manual memory management with having to remember > to call multifd_device_state_finish() on error paths as in your > proposed patch 3 really invites memory leaks. > > Will think about some other way to have a reusable buffer. Sure. That's patch 3, and I suppose then it looks like patch 1 is still OK in one way or another. > > In terms of not making idstr copy (your proposed patch 2) I am not > 100% sure that avoiding such tiny allocation really justifies the risk > of possible use-after-free of a dangling pointer. Why there's risk? Someone strdup() on the stack? That only goes via VFIO itself, so I thought it wasn't that complicated. But yeah as I said this part (patch 2) is optional. > Not 100% against it either if you are confident that it will never happen. 
> > By the way, I guess it makes sense to carry these changes in the main patch > set rather than as a separate changes? Whatever you prefer. I wrote those patches only because I thought maybe you'd like to run some perf test to see whether they would help at all, and when the patches are there it'll be much easier for you, then you can decide whether it's worth integrating already, or leave that for later. If not I'd say they're even lower priority, so feel free to stick with whatever is easier for you. I'm ok there. However it'll be always good we can still have patch 1 as I mentioned before (as part of your series, if you won't disagree), to make the SendData interface slightly cleaner and easier to follow.
On Thu, Sep 19, 2024 at 09:49:57PM +0200, Maciej S. Szmigiero wrote: > On 10.09.2024 18:06, Peter Xu wrote: > > On Tue, Aug 27, 2024 at 07:54:31PM +0200, Maciej S. Szmigiero wrote: > > > +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, > > > + char *data, size_t len) > > > +{ > > > + /* Device state submissions can come from multiple threads */ > > > + QEMU_LOCK_GUARD(&queue_job_mutex); > > > > Ah, just notice there's the mutex. > > > > So please consider the reply in the other thread, IIUC we can make it for > > multifd_send() to be a generic mutex to simplify the other patch too, then > > drop here. > > > > I assume the ram code should be fine taking one more mutex even without > > vfio, if it only takes once for each ~128 pages to enqueue, and only take > > in the main thread, then each update should be also in the hot path > > (e.g. no cache bouncing). > > > > Will check whether it is possible to use a common mutex here for both RAM > and device state submission without drop in performance. Thanks.
On 19.09.2024 23:17, Peter Xu wrote: > On Thu, Sep 19, 2024 at 09:49:43PM +0200, Maciej S. Szmigiero wrote: >> On 10.09.2024 21:48, Peter Xu wrote: >>> On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: >>>>> +size_t multifd_device_state_payload_size(void) >>>>> +{ >>>>> + return sizeof(MultiFDDeviceState_t); >>>>> +} >>>> >>>> This will not be necessary because the payload size is the same as the >>>> data type. We only need it for the special case where the MultiFDPages_t >>>> is smaller than the total ram payload size. >>> >>> Today I was thinking maybe we should really clean this up, as the current >>> multifd_send_data_alloc() is indeed too tricky (blame me.. who requested >>> that more or less). Knowing that VFIO can use dynamic buffers with ->idstr >>> and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made >>> that feeling stronger. >>> >>> I think we should change it now perhaps, otherwise we'll need to introduce >>> other helpers to e.g. reset the device buffers, and that's not only slow >>> but also not good looking, IMO. >>> >>> So I went ahead with the idea in previous discussion, that I managed to >>> change the SendData union into struct; the memory consumption is not super >>> important yet, IMHO, but we should still stick with the object model where >>> multifd enqueue thread switch buffer with multifd, as it still sounds a >>> sane way to do. >>> >>> Then when that patch is ready, I further tried to make VFIO reuse multifd >>> buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we >>> don't allocate it every time we enqueue. >>> >>> I hope it'll also work for VFIO. VFIO has a specialty on being able to >>> dump the config space so it's more complex (and I noticed Maciej's current >>> design requires the final chunk of VFIO config data be migrated in one >>> packet.. that is also part of the complexity there). So I allowed that >>> part to allocate a buffer but only that. 
IOW, I made some API (see below) >>> that can either reuse preallocated buffer, or use a separate one only for >>> the final bulk. >>> >>> In short, could both of you have a look at what I came up with below? I >>> did that in patches because I think it's too much to comment, so patches >>> may work better. No concern if any of below could be good changes to you, >>> then either Maciej can squash whatever into existing patches (and I feel >>> like some existing patches in this series can go away with below design), >>> or I can post pre-requisite patch but only if any of you prefer that. >>> >>> Anyway, let me know, the patches apply on top of this whole series applied >>> first. >>> >>> I also wonder whether there can be any perf difference already (I tested >>> all multifd qtest with below, but no VFIO I can run), perhaps not that >>> much, but just to mention below should avoid both buffer allocations and >>> one round of copy (so VFIO read() directly writes to the multifd buffers >>> now). >> >> I am not against making MultiFDSendData a struct and maybe introducing >> some pre-allocated buffer. >> >> But to be honest, that manual memory management with having to remember >> to call multifd_device_state_finish() on error paths as in your >> proposed patch 3 really invites memory leaks. >> >> Will think about some other way to have a reusable buffer. > > Sure. That's patch 3, and I suppose then it looks like patch 1 is still > OK in one way or another. > >> >> In terms of not making idstr copy (your proposed patch 2) I am not >> 100% sure that avoiding such tiny allocation really justifies the risk >> of possible use-after-free of a dangling pointer. > > Why there's risk? Someone strdup() on the stack? That only goes via VFIO > itself, so I thought it wasn't that complicated. But yeah as I said this > part (patch 2) is optional. I mean the risk here is somebody providing idstr that somehow gets free'd or overwritten before the device state buffer gets sent. 
With a static idstr that's obviously not an issue, but I see that, for example, vmstate_register_with_alias_id() generates idstr dynamically and this API is used by all qdevs that have a VMSD (in device_set_realized()). >> Not 100% against it either if you are confident that it will never happen. >> >> By the way, I guess it makes sense to carry these changes in the main patch >> set rather than as a separate changes? > > Whatever you prefer. > > I wrote those patches only because I thought maybe you'd like to run some > perf test to see whether they would help at all, and when the patches are > there it'll be much easier for you, then you can decide whether it's worth > intergrating already, or leave that for later. > > If not I'd say they're even lower priority, so feel free to stick with > whatever easier for you. I'm ok there. > > However it'll be always good we can still have patch 1 as I mentioned > before (as part of your series, if you won't disagree), to make the > SendData interface slightly cleaner and easier to follow. > Will try to include these patches in my patch set if they don't cause any downtime regressions. Thanks, Maciej
On Fri, Sep 20, 2024 at 05:23:20PM +0200, Maciej S. Szmigiero wrote: > On 19.09.2024 23:17, Peter Xu wrote: > > On Thu, Sep 19, 2024 at 09:49:43PM +0200, Maciej S. Szmigiero wrote: > > > On 10.09.2024 21:48, Peter Xu wrote: > > > > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote: > > > > > > +size_t multifd_device_state_payload_size(void) > > > > > > +{ > > > > > > + return sizeof(MultiFDDeviceState_t); > > > > > > +} > > > > > > > > > > This will not be necessary because the payload size is the same as the > > > > > data type. We only need it for the special case where the MultiFDPages_t > > > > > is smaller than the total ram payload size. > > > > > > > > Today I was thinking maybe we should really clean this up, as the current > > > > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested > > > > that more or less). Knowing that VFIO can use dynamic buffers with ->idstr > > > > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made > > > > that feeling stronger. > > > > > > > > I think we should change it now perhaps, otherwise we'll need to introduce > > > > other helpers to e.g. reset the device buffers, and that's not only slow > > > > but also not good looking, IMO. > > > > > > > > So I went ahead with the idea in previous discussion, that I managed to > > > > change the SendData union into struct; the memory consumption is not super > > > > important yet, IMHO, but we should still stick with the object model where > > > > multifd enqueue thread switch buffer with multifd, as it still sounds a > > > > sane way to do. > > > > > > > > Then when that patch is ready, I further tried to make VFIO reuse multifd > > > > buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we > > > > don't allocate it every time we enqueue. > > > > > > > > I hope it'll also work for VFIO. 
VFIO has a specialty on being able to > > > > dump the config space so it's more complex (and I noticed Maciej's current > > > > design requires the final chunk of VFIO config data be migrated in one > > > > packet.. that is also part of the complexity there). So I allowed that > > > > part to allocate a buffer but only that. IOW, I made some API (see below) > > > > that can either reuse preallocated buffer, or use a separate one only for > > > > the final bulk. > > > > > > > > In short, could both of you have a look at what I came up with below? I > > > > did that in patches because I think it's too much to comment, so patches > > > > may work better. No concern if any of below could be good changes to you, > > > > then either Maciej can squash whatever into existing patches (and I feel > > > > like some existing patches in this series can go away with below design), > > > > or I can post pre-requisite patch but only if any of you prefer that. > > > > > > > > Anyway, let me know, the patches apply on top of this whole series applied > > > > first. > > > > > > > > I also wonder whether there can be any perf difference already (I tested > > > > all multifd qtest with below, but no VFIO I can run), perhaps not that > > > > much, but just to mention below should avoid both buffer allocations and > > > > one round of copy (so VFIO read() directly writes to the multifd buffers > > > > now). > > > > > > I am not against making MultiFDSendData a struct and maybe introducing > > > some pre-allocated buffer. > > > > > > But to be honest, that manual memory management with having to remember > > > to call multifd_device_state_finish() on error paths as in your > > > proposed patch 3 really invites memory leaks. > > > > > > Will think about some other way to have a reusable buffer. > > > > Sure. That's patch 3, and I suppose then it looks like patch 1 is still > > OK in one way or another. 
> > > > > > > > In terms of not making idstr copy (your proposed patch 2) I am not > > > 100% sure that avoiding such tiny allocation really justifies the risk > > > of possible use-after-free of a dangling pointer. > > > > Why there's risk? Someone strdup() on the stack? That only goes via VFIO > > itself, so I thought it wasn't that complicated. But yeah as I said this > > part (patch 2) is optional. > > I mean the risk here is somebody providing idstr that somehow gets free'd > or overwritten before the device state buffer gets sent. > > With a static idstr that's obviously not an issue, but I see that, for example, > vmstate_register_with_alias_id() generates idstr dynamically and this API > is used by all qdevs that have a VMSD (in device_set_realized()). > > > > Not 100% against it either if you are confident that it will never happen. > > > > > > By the way, I guess it makes sense to carry these changes in the main patch > > > set rather than as a separate changes? > > > > Whatever you prefer. > > > > I wrote those patches only because I thought maybe you'd like to run some > > perf test to see whether they would help at all, and when the patches are > > there it'll be much easier for you, then you can decide whether it's worth > > intergrating already, or leave that for later. > > > > If not I'd say they're even lower priority, so feel free to stick with > > whatever easier for you. I'm ok there. > > > > However it'll be always good we can still have patch 1 as I mentioned > > before (as part of your series, if you won't disagree), to make the > > SendData interface slightly cleaner and easier to follow. > > > > Will try to include these patches in my patch set if they don't cause any > downtime regressions. Thanks, note that it's not my request to have patch 2-3. :) Please take your own judgement. 
Again, I'd run a round of perf (only if my patches can directly run through without heavy debugging on top of yours); if nothing shows a benefit I'd go with what you have right now, and we drop patch 2-3 - they're not justified if they provide zero perf benefit.
diff --git a/include/migration/misc.h b/include/migration/misc.h index bfadc5613bac..7266b1b77d1f 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -111,4 +111,8 @@ bool migration_in_bg_snapshot(void); /* migration/block-dirty-bitmap.c */ void dirty_bitmap_mig_init(void); +/* migration/multifd-device-state.c */ +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, + char *data, size_t len); + #endif diff --git a/migration/meson.build b/migration/meson.build index 77f3abf08eb1..00853595894f 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -21,6 +21,7 @@ system_ss.add(files( 'migration-hmp-cmds.c', 'migration.c', 'multifd.c', + 'multifd-device-state.c', 'multifd-nocomp.c', 'multifd-zlib.c', 'multifd-zero-page.c', diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c new file mode 100644 index 000000000000..c9b44f0b5ab9 --- /dev/null +++ b/migration/multifd-device-state.c @@ -0,0 +1,99 @@ +/* + * Multifd device state migration + * + * Copyright (C) 2024 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/lockable.h" +#include "migration/misc.h" +#include "multifd.h" + +static QemuMutex queue_job_mutex; + +static MultiFDSendData *device_state_send; + +size_t multifd_device_state_payload_size(void) +{ + return sizeof(MultiFDDeviceState_t); +} + +void multifd_device_state_save_setup(void) +{ + qemu_mutex_init(&queue_job_mutex); + + device_state_send = multifd_send_data_alloc(); +} + +void multifd_device_state_clear(MultiFDDeviceState_t *device_state) +{ + g_clear_pointer(&device_state->idstr, g_free); + g_clear_pointer(&device_state->buf, g_free); +} + +void multifd_device_state_save_cleanup(void) +{ + g_clear_pointer(&device_state_send, multifd_send_data_free); + + qemu_mutex_destroy(&queue_job_mutex); +} + +static void multifd_device_state_fill_packet(MultiFDSendParams *p) +{ + MultiFDDeviceState_t *device_state = &p->data->u.device_state; + MultiFDPacketDeviceState_t *packet = p->packet_device_state; + + packet->hdr.flags = cpu_to_be32(p->flags); + strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr)); + packet->instance_id = cpu_to_be32(device_state->instance_id); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); +} + +void multifd_device_state_send_prepare(MultiFDSendParams *p) +{ + MultiFDDeviceState_t *device_state = &p->data->u.device_state; + + assert(multifd_payload_device_state(p->data)); + + multifd_send_prepare_header_device_state(p); + + assert(!(p->flags & MULTIFD_FLAG_SYNC)); + + p->next_packet_size = device_state->buf_len; + if (p->next_packet_size > 0) { + p->iov[p->iovs_num].iov_base = device_state->buf; + p->iov[p->iovs_num].iov_len = p->next_packet_size; + p->iovs_num++; + } + + p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE; + + multifd_device_state_fill_packet(p); +} + +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, + char *data, size_t len) +{ + /* Device state submissions can come from multiple threads */ + 
QEMU_LOCK_GUARD(&queue_job_mutex); + MultiFDDeviceState_t *device_state; + + assert(multifd_payload_empty(device_state_send)); + + multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE); + device_state = &device_state_send->u.device_state; + device_state->idstr = g_strdup(idstr); + device_state->instance_id = instance_id; + device_state->buf = g_memdup2(data, len); + device_state->buf_len = len; + + if (!multifd_send(&device_state_send)) { + multifd_send_data_clear(device_state_send); + return false; + } + + return true; +} diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index 39eb77c9b3b7..0b7b543f44db 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -116,13 +116,13 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp) * Only !zerocopy needs the header in IOV; zerocopy will * send it separately. */ - multifd_send_prepare_header(p); + multifd_send_prepare_header_ram(p); } multifd_send_prepare_iovs(p); p->flags |= MULTIFD_FLAG_NOCOMP; - multifd_send_fill_packet(p); + multifd_send_fill_packet_ram(p); if (use_zero_copy_send) { /* Send header first, without zerocopy */ @@ -371,7 +371,7 @@ bool multifd_send_prepare_common(MultiFDSendParams *p) return false; } - multifd_send_prepare_header(p); + multifd_send_prepare_header_ram(p); return true; } diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 75041a4c4dfe..bd6b5b6a3868 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -490,7 +490,7 @@ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) out: p->flags |= MULTIFD_FLAG_QPL; - multifd_send_fill_packet(p); + multifd_send_fill_packet_ram(p); return 0; } diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index db2549f59bfe..6e2d26010742 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -198,7 +198,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) } out: p->flags |= 
MULTIFD_FLAG_UADK; - multifd_send_fill_packet(p); + multifd_send_fill_packet_ram(p); return 0; } diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 6787538762d2..62a1fe59ad3e 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -156,7 +156,7 @@ static int multifd_zlib_send_prepare(MultiFDSendParams *p, Error **errp) out: p->flags |= MULTIFD_FLAG_ZLIB; - multifd_send_fill_packet(p); + multifd_send_fill_packet_ram(p); return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 1576b1e2adc6..f98b07e7f9f5 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -143,7 +143,7 @@ static int multifd_zstd_send_prepare(MultiFDSendParams *p, Error **errp) out: p->flags |= MULTIFD_FLAG_ZSTD; - multifd_send_fill_packet(p); + multifd_send_fill_packet_ram(p); return 0; } diff --git a/migration/multifd.c b/migration/multifd.c index a74e8a5cc891..bebe5b5a9b9c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" +#include "qemu/iov.h" #include "qemu/rcu.h" #include "exec/target_page.h" #include "sysemu/sysemu.h" @@ -19,6 +20,7 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "file.h" +#include "migration/misc.h" #include "migration.h" #include "migration-stats.h" #include "savevm.h" @@ -107,7 +109,9 @@ MultiFDSendData *multifd_send_data_alloc(void) * added to the union in the future are larger than * (MultiFDPages_t + flex array). */ - max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload)); + max_payload_size = MAX(multifd_ram_payload_size(), + multifd_device_state_payload_size()); + max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); /* * Account for any holes the compiler might insert. 
We can't pack @@ -126,6 +130,9 @@ void multifd_send_data_clear(MultiFDSendData *data) } switch (data->type) { + case MULTIFD_PAYLOAD_DEVICE_STATE: + multifd_device_state_clear(&data->u.device_state); + break; default: /* Nothing to do */ break; @@ -228,7 +235,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) return msg.id; } -void multifd_send_fill_packet(MultiFDSendParams *p) +void multifd_send_fill_packet_ram(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; uint64_t packet_num; @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data) p = &multifd_send_state->params[i]; /* - * Lockless read to p->pending_job is safe, because only multifd - * sender thread can clear it. + * Lockless RMW on p->pending_job_preparing is safe, because only multifd + * sender thread can clear it after it had seen p->pending_job being set. + * + * Pairs with qatomic_store_release() in multifd_send_thread(). */ - if (qatomic_read(&p->pending_job) == false) { + if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) { break; } } - /* - * Make sure we read p->pending_job before all the rest. Pairs with - * qatomic_store_release() in multifd_send_thread(). 
- */ - smp_mb_acquire(); - assert(multifd_payload_empty(p->data)); /* @@ -534,6 +537,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) p->name = NULL; g_clear_pointer(&p->data, multifd_send_data_free); p->packet_len = 0; + g_clear_pointer(&p->packet_device_state, g_free); g_free(p->packet); p->packet = NULL; multifd_send_state->ops->send_cleanup(p, errp); @@ -545,6 +549,7 @@ static void multifd_send_cleanup_state(void) { file_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); + multifd_device_state_save_cleanup(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); g_free(multifd_send_state->params); @@ -670,19 +675,29 @@ static void *multifd_send_thread(void *opaque) * qatomic_store_release() in multifd_send(). */ if (qatomic_load_acquire(&p->pending_job)) { + bool is_device_state = multifd_payload_device_state(p->data); + size_t total_size; + p->flags = 0; p->iovs_num = 0; assert(!multifd_payload_empty(p->data)); - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - break; + if (is_device_state) { + multifd_device_state_send_prepare(p); + } else { + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + break; + } } if (migrate_mapped_ram()) { + assert(!is_device_state); + ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, &p->data->u.ram, &local_err); } else { + total_size = iov_size(p->iov, p->iovs_num); ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); @@ -692,18 +707,27 @@ static void *multifd_send_thread(void *opaque) break; } - stat64_add(&mig_stats.multifd_bytes, - p->next_packet_size + p->packet_len); + if (is_device_state) { + stat64_add(&mig_stats.multifd_bytes, total_size); + } else { + /* + * Can't just always add total_size since IOVs do not include + * packet header in the zerocopy RAM case. 
+ */ + stat64_add(&mig_stats.multifd_bytes, + p->next_packet_size + p->packet_len); + } p->next_packet_size = 0; multifd_send_data_clear(p->data); /* * Making sure p->data is published before saying "we're - * free". Pairs with the smp_mb_acquire() in + * free". Pairs with the qatomic_cmpxchg() in * multifd_send(). */ qatomic_store_release(&p->pending_job, false); + qatomic_store_release(&p->pending_job_preparing, false); } else { /* * If not a normal job, must be a sync request. Note that @@ -714,7 +738,7 @@ static void *multifd_send_thread(void *opaque) if (use_packets) { p->flags = MULTIFD_FLAG_SYNC; - multifd_send_fill_packet(p); + multifd_send_fill_packet_ram(p); ret = qio_channel_write_all(p->c, (void *)p->packet, p->packet_len, &local_err); if (ret != 0) { @@ -910,6 +934,9 @@ bool multifd_send_setup(void) p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); + p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state)); + p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION); } p->name = g_strdup_printf("mig/src/send_%d", i); p->write_flags = 0; @@ -944,6 +971,8 @@ bool multifd_send_setup(void) } } + multifd_device_state_save_setup(); + return true; err: diff --git a/migration/multifd.h b/migration/multifd.h index a0853622153e..c15c83104c8b 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -120,10 +120,12 @@ typedef struct { typedef enum { MULTIFD_PAYLOAD_NONE, MULTIFD_PAYLOAD_RAM, + MULTIFD_PAYLOAD_DEVICE_STATE, } MultiFDPayloadType; typedef union MultiFDPayload { MultiFDPages_t ram; + MultiFDDeviceState_t device_state; } MultiFDPayload; struct MultiFDSendData { @@ -136,6 +138,11 @@ static inline bool multifd_payload_empty(MultiFDSendData *data) return data->type == MULTIFD_PAYLOAD_NONE; } +static inline bool multifd_payload_device_state(MultiFDSendData *data) +{ + return data->type == 
MULTIFD_PAYLOAD_DEVICE_STATE; +} + static inline void multifd_set_payload_type(MultiFDSendData *data, MultiFDPayloadType type) { @@ -182,13 +189,15 @@ typedef struct { * cleared by the multifd sender threads. */ bool pending_job; + bool pending_job_preparing; bool pending_sync; MultiFDSendData *data; /* thread local variables. No locking required */ - /* pointer to the packet */ + /* pointers to the possible packet types */ MultiFDPacket_t *packet; + MultiFDPacketDeviceState_t *packet_device_state; /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ @@ -276,18 +285,25 @@ typedef struct { } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); -void multifd_send_fill_packet(MultiFDSendParams *p); +void multifd_send_fill_packet_ram(MultiFDSendParams *p); bool multifd_send_prepare_common(MultiFDSendParams *p); void multifd_send_zero_page_detect(MultiFDSendParams *p); void multifd_recv_zero_page_process(MultiFDRecvParams *p); -static inline void multifd_send_prepare_header(MultiFDSendParams *p) +static inline void multifd_send_prepare_header_ram(MultiFDSendParams *p) { p->iov[0].iov_len = p->packet_len; p->iov[0].iov_base = p->packet; p->iovs_num++; } +static inline void multifd_send_prepare_header_device_state(MultiFDSendParams *p) +{ + p->iov[0].iov_len = sizeof(*p->packet_device_state); + p->iov[0].iov_base = p->packet_device_state; + p->iovs_num++; +} + void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); bool multifd_send(MultiFDSendData **send_data); MultiFDSendData *multifd_send_data_alloc(void); @@ -310,4 +326,11 @@ int multifd_ram_flush_and_sync(void); size_t multifd_ram_payload_size(void); void multifd_ram_fill_packet(MultiFDSendParams *p); int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); + +size_t multifd_device_state_payload_size(void); +void multifd_device_state_save_setup(void); +void multifd_device_state_clear(MultiFDDeviceState_t 
*device_state); +void multifd_device_state_save_cleanup(void); +void multifd_device_state_send_prepare(MultiFDSendParams *p); + #endif