diff mbox series

[v2,08/17] migration: Add load_finish handler and associated functions

Message ID 1a7599896decdbae61cee385739dc0badc9b4364.1724701542.git.maciej.szmigiero@oracle.com (mailing list archive)
State New, archived
Headers show
Series Multifd | expand

Commit Message

Maciej S. Szmigiero Aug. 27, 2024, 5:54 p.m. UTC
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>

load_finish SaveVMHandler allows migration code to poll whether
a device-specific asynchronous device state loading operation had finished.

In order to avoid calling this handler needlessly the device is supposed
to notify the migration code of its possible readiness via a call to
qemu_loadvm_load_finish_ready_broadcast() while holding
qemu_loadvm_load_finish_ready_lock.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
---
 include/migration/register.h | 21 +++++++++++++++
 migration/migration.c        |  6 +++++
 migration/migration.h        |  3 +++
 migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
 migration/savevm.h           |  4 +++
 5 files changed, 86 insertions(+)

Comments

Fabiano Rosas Aug. 30, 2024, 7:28 p.m. UTC | #1
"Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes:

> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>
> load_finish SaveVMHandler allows migration code to poll whether
> a device-specific asynchronous device state loading operation had finished.
>
> In order to avoid calling this handler needlessly the device is supposed
> to notify the migration code of its possible readiness via a call to
> qemu_loadvm_load_finish_ready_broadcast() while holding
> qemu_loadvm_load_finish_ready_lock.
>
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
>  include/migration/register.h | 21 +++++++++++++++
>  migration/migration.c        |  6 +++++
>  migration/migration.h        |  3 +++
>  migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>  migration/savevm.h           |  4 +++
>  5 files changed, 86 insertions(+)
>
> diff --git a/include/migration/register.h b/include/migration/register.h
> index 4a578f140713..44d8cf5192ae 100644
> --- a/include/migration/register.h
> +++ b/include/migration/register.h
> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>      int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>                               Error **errp);
>  
> +    /**
> +     * @load_finish
> +     *
> +     * Poll whether all asynchronous device state loading had finished.
> +     * Not called on the load failure path.
> +     *
> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> +     *
> +     * If this method signals "not ready" then it might not be called
> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> +     * while holding qemu_loadvm_load_finish_ready_lock.
> +     *
> +     * @opaque: data pointer passed to register_savevm_live()
> +     * @is_finished: whether the loading had finished (output parameter)
> +     * @errp: pointer to Error*, to store an error if it happens.
> +     *
> +     * Returns zero to indicate success and negative for error
> +     * It's not an error that the loading still hasn't finished.
> +     */
> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> +
>      /**
>       * @load_setup
>       *
> diff --git a/migration/migration.c b/migration/migration.c
> index 3dea06d57732..d61e7b055e07 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -259,6 +259,9 @@ void migration_object_init(void)
>  
>      current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
>  
> +    qemu_mutex_init(&current_incoming->load_finish_ready_mutex);
> +    qemu_cond_init(&current_incoming->load_finish_ready_cond);
> +
>      migration_object_check(current_migration, &error_fatal);
>  
>      ram_mig_init();
> @@ -410,6 +413,9 @@ void migration_incoming_state_destroy(void)
>          mis->postcopy_qemufile_dst = NULL;
>      }
>  
> +    qemu_mutex_destroy(&mis->load_finish_ready_mutex);
> +    qemu_cond_destroy(&mis->load_finish_ready_cond);
> +
>      yank_unregister_instance(MIGRATION_YANK_INSTANCE);
>  }
>  
> diff --git a/migration/migration.h b/migration/migration.h
> index 38aa1402d516..4e2443e6c8ec 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -230,6 +230,9 @@ struct MigrationIncomingState {
>  
>      /* Do exit on incoming migration failure */
>      bool exit_on_error;
> +
> +    QemuCond load_finish_ready_cond;
> +    QemuMutex load_finish_ready_mutex;

With these moved to MigrationState:

Reviewed-by: Fabiano Rosas <farosas@suse.de>
Avihai Horon Sept. 5, 2024, 3:13 p.m. UTC | #2
On 27/08/2024 20:54, Maciej S. Szmigiero wrote:
> External email: Use caution opening links or attachments
>
>
> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>
> load_finish SaveVMHandler allows migration code to poll whether
> a device-specific asynchronous device state loading operation had finished.
>
> In order to avoid calling this handler needlessly the device is supposed
> to notify the migration code of its possible readiness via a call to
> qemu_loadvm_load_finish_ready_broadcast() while holding
> qemu_loadvm_load_finish_ready_lock.
>
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
>   include/migration/register.h | 21 +++++++++++++++
>   migration/migration.c        |  6 +++++
>   migration/migration.h        |  3 +++
>   migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>   migration/savevm.h           |  4 +++
>   5 files changed, 86 insertions(+)
>
> diff --git a/include/migration/register.h b/include/migration/register.h
> index 4a578f140713..44d8cf5192ae 100644
> --- a/include/migration/register.h
> +++ b/include/migration/register.h
> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>       int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>                                Error **errp);
>
> +    /**
> +     * @load_finish
> +     *
> +     * Poll whether all asynchronous device state loading had finished.
> +     * Not called on the load failure path.
> +     *
> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> +     *
> +     * If this method signals "not ready" then it might not be called
> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> +     * while holding qemu_loadvm_load_finish_ready_lock.
> +     *
> +     * @opaque: data pointer passed to register_savevm_live()
> +     * @is_finished: whether the loading had finished (output parameter)
> +     * @errp: pointer to Error*, to store an error if it happens.
> +     *
> +     * Returns zero to indicate success and negative for error
> +     * It's not an error that the loading still hasn't finished.
> +     */
> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> +
>       /**
>        * @load_setup
>        *
> diff --git a/migration/migration.c b/migration/migration.c
> index 3dea06d57732..d61e7b055e07 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -259,6 +259,9 @@ void migration_object_init(void)
>
>       current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
>
> +    qemu_mutex_init(&current_incoming->load_finish_ready_mutex);
> +    qemu_cond_init(&current_incoming->load_finish_ready_cond);
> +
>       migration_object_check(current_migration, &error_fatal);
>
>       ram_mig_init();
> @@ -410,6 +413,9 @@ void migration_incoming_state_destroy(void)
>           mis->postcopy_qemufile_dst = NULL;
>       }
>
> +    qemu_mutex_destroy(&mis->load_finish_ready_mutex);
> +    qemu_cond_destroy(&mis->load_finish_ready_cond);
> +
>       yank_unregister_instance(MIGRATION_YANK_INSTANCE);
>   }
>
> diff --git a/migration/migration.h b/migration/migration.h
> index 38aa1402d516..4e2443e6c8ec 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -230,6 +230,9 @@ struct MigrationIncomingState {
>
>       /* Do exit on incoming migration failure */
>       bool exit_on_error;
> +
> +    QemuCond load_finish_ready_cond;
> +    QemuMutex load_finish_ready_mutex;
>   };
>
>   MigrationIncomingState *migration_incoming_get_current(void);
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 3fde5ca8c26b..33c9200d1e78 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -3022,6 +3022,37 @@ int qemu_loadvm_state(QEMUFile *f)
>           return ret;
>       }
>
> +    qemu_loadvm_load_finish_ready_lock();
> +    while (!ret) { /* Don't call load_finish() handlers on the load failure path */
> +        bool all_ready = true;

Nit: Maybe rename all_ready to all_finished to be consistent with 
load_finish() terminology? Same for this_ready.

> +        SaveStateEntry *se = NULL;
> +
> +        QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +            bool this_ready;
> +
> +            if (!se->ops || !se->ops->load_finish) {
> +                continue;
> +            }
> +
> +            ret = se->ops->load_finish(se->opaque, &this_ready, &local_err);
> +            if (ret) {
> +                error_report_err(local_err);
> +
> +                qemu_loadvm_load_finish_ready_unlock();
> +                return -EINVAL;
> +            } else if (!this_ready) {
> +                all_ready = false;
> +            }
> +        }
> +
> +        if (all_ready) {
> +            break;
> +        }
> +
> +        qemu_cond_wait(&mis->load_finish_ready_cond, &mis->load_finish_ready_mutex);
> +    }
> +    qemu_loadvm_load_finish_ready_unlock();
> +
>       if (ret == 0) {
>           ret = qemu_file_get_error(f);
>       }
> @@ -3126,6 +3157,27 @@ int qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
>       return 0;
>   }
>
> +void qemu_loadvm_load_finish_ready_lock(void)
> +{
> +    MigrationIncomingState *mis = migration_incoming_get_current();
> +
> +    qemu_mutex_lock(&mis->load_finish_ready_mutex);
> +}
> +
> +void qemu_loadvm_load_finish_ready_unlock(void)
> +{
> +    MigrationIncomingState *mis = migration_incoming_get_current();
> +
> +    qemu_mutex_unlock(&mis->load_finish_ready_mutex);
> +}
> +
> +void qemu_loadvm_load_finish_ready_broadcast(void)
> +{
> +    MigrationIncomingState *mis = migration_incoming_get_current();
> +
> +    qemu_cond_broadcast(&mis->load_finish_ready_cond);

Do we need a broadcast? Isn't a signal enough, as we only have one waiter
thread?

Thanks.

> +}
> +
>   bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
>                     bool has_devices, strList *devices, Error **errp)
>   {
> diff --git a/migration/savevm.h b/migration/savevm.h
> index d388f1bfca98..69ae22cded7a 100644
> --- a/migration/savevm.h
> +++ b/migration/savevm.h
> @@ -73,4 +73,8 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
>   int qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
>                                     char *buf, size_t len, Error **errp);
>
> +void qemu_loadvm_load_finish_ready_lock(void);
> +void qemu_loadvm_load_finish_ready_unlock(void);
> +void qemu_loadvm_load_finish_ready_broadcast(void);
> +
>   #endif
Maciej S. Szmigiero Sept. 9, 2024, 6:05 p.m. UTC | #3
On 5.09.2024 17:13, Avihai Horon wrote:
> 
> On 27/08/2024 20:54, Maciej S. Szmigiero wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> load_finish SaveVMHandler allows migration code to poll whether
>> a device-specific asynchronous device state loading operation had finished.
>>
>> In order to avoid calling this handler needlessly the device is supposed
>> to notify the migration code of its possible readiness via a call to
>> qemu_loadvm_load_finish_ready_broadcast() while holding
>> qemu_loadvm_load_finish_ready_lock.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>>   include/migration/register.h | 21 +++++++++++++++
>>   migration/migration.c        |  6 +++++
>>   migration/migration.h        |  3 +++
>>   migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>>   migration/savevm.h           |  4 +++
>>   5 files changed, 86 insertions(+)
>>
>> diff --git a/include/migration/register.h b/include/migration/register.h
>> index 4a578f140713..44d8cf5192ae 100644
>> --- a/include/migration/register.h
>> +++ b/include/migration/register.h
>> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>>       int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>>                                Error **errp);
>>
>> +    /**
>> +     * @load_finish
>> +     *
>> +     * Poll whether all asynchronous device state loading had finished.
>> +     * Not called on the load failure path.
>> +     *
>> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
>> +     *
>> +     * If this method signals "not ready" then it might not be called
>> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>> +     * while holding qemu_loadvm_load_finish_ready_lock.
>> +     *
>> +     * @opaque: data pointer passed to register_savevm_live()
>> +     * @is_finished: whether the loading had finished (output parameter)
>> +     * @errp: pointer to Error*, to store an error if it happens.
>> +     *
>> +     * Returns zero to indicate success and negative for error
>> +     * It's not an error that the loading still hasn't finished.
>> +     */
>> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
>> +
>>       /**
>>        * @load_setup
>>        *
>> diff --git a/migration/migration.c b/migration/migration.c
>> index 3dea06d57732..d61e7b055e07 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -259,6 +259,9 @@ void migration_object_init(void)
>>
>>       current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
>>
>> +    qemu_mutex_init(&current_incoming->load_finish_ready_mutex);
>> +    qemu_cond_init(&current_incoming->load_finish_ready_cond);
>> +
>>       migration_object_check(current_migration, &error_fatal);
>>
>>       ram_mig_init();
>> @@ -410,6 +413,9 @@ void migration_incoming_state_destroy(void)
>>           mis->postcopy_qemufile_dst = NULL;
>>       }
>>
>> +    qemu_mutex_destroy(&mis->load_finish_ready_mutex);
>> +    qemu_cond_destroy(&mis->load_finish_ready_cond);
>> +
>>       yank_unregister_instance(MIGRATION_YANK_INSTANCE);
>>   }
>>
>> diff --git a/migration/migration.h b/migration/migration.h
>> index 38aa1402d516..4e2443e6c8ec 100644
>> --- a/migration/migration.h
>> +++ b/migration/migration.h
>> @@ -230,6 +230,9 @@ struct MigrationIncomingState {
>>
>>       /* Do exit on incoming migration failure */
>>       bool exit_on_error;
>> +
>> +    QemuCond load_finish_ready_cond;
>> +    QemuMutex load_finish_ready_mutex;
>>   };
>>
>>   MigrationIncomingState *migration_incoming_get_current(void);
>> diff --git a/migration/savevm.c b/migration/savevm.c
>> index 3fde5ca8c26b..33c9200d1e78 100644
>> --- a/migration/savevm.c
>> +++ b/migration/savevm.c
>> @@ -3022,6 +3022,37 @@ int qemu_loadvm_state(QEMUFile *f)
>>           return ret;
>>       }
>>
>> +    qemu_loadvm_load_finish_ready_lock();
>> +    while (!ret) { /* Don't call load_finish() handlers on the load failure path */
>> +        bool all_ready = true;
> 
> Nit: Maybe rename all_ready to all_finished to be consistent with load_finish() terminology? Same for this_ready.

Will rename it accordingly.

>> +        SaveStateEntry *se = NULL;
>> +
>> +        QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
>> +            bool this_ready;
>> +
>> +            if (!se->ops || !se->ops->load_finish) {
>> +                continue;
>> +            }
>> +
>> +            ret = se->ops->load_finish(se->opaque, &this_ready, &local_err);
>> +            if (ret) {
>> +                error_report_err(local_err);
>> +
>> +                qemu_loadvm_load_finish_ready_unlock();
>> +                return -EINVAL;
>> +            } else if (!this_ready) {
>> +                all_ready = false;
>> +            }
>> +        }
>> +
>> +        if (all_ready) {
>> +            break;
>> +        }
>> +
>> +        qemu_cond_wait(&mis->load_finish_ready_cond, &mis->load_finish_ready_mutex);
>> +    }
>> +    qemu_loadvm_load_finish_ready_unlock();
>> +
>>       if (ret == 0) {
>>           ret = qemu_file_get_error(f);
>>       }
>> @@ -3126,6 +3157,27 @@ int qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
>>       return 0;
>>   }
>>
>> +void qemu_loadvm_load_finish_ready_lock(void)
>> +{
>> +    MigrationIncomingState *mis = migration_incoming_get_current();
>> +
>> +    qemu_mutex_lock(&mis->load_finish_ready_mutex);
>> +}
>> +
>> +void qemu_loadvm_load_finish_ready_unlock(void)
>> +{
>> +    MigrationIncomingState *mis = migration_incoming_get_current();
>> +
>> +    qemu_mutex_unlock(&mis->load_finish_ready_mutex);
>> +}
>> +
>> +void qemu_loadvm_load_finish_ready_broadcast(void)
>> +{
>> +    MigrationIncomingState *mis = migration_incoming_get_current();
>> +
>> +    qemu_cond_broadcast(&mis->load_finish_ready_cond);
> 
> Do we need a broadcast? isn't signal enough as we only have one waiter thread?

Currently, there's just one waiter but looking at the relatively small
implementation difference between pthread_cond_signal() and
pthread_cond_broadcast() I'm not sure whether it is worth changing
it to _signal() and not having a possibility of signalling multiple
waiters upfront.

> Thanks.

Thanks,
Maciej
Peter Xu Sept. 9, 2024, 8:03 p.m. UTC | #4
On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> 
> load_finish SaveVMHandler allows migration code to poll whether
> a device-specific asynchronous device state loading operation had finished.
> 
> In order to avoid calling this handler needlessly the device is supposed
> to notify the migration code of its possible readiness via a call to
> qemu_loadvm_load_finish_ready_broadcast() while holding
> qemu_loadvm_load_finish_ready_lock.
> 
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
>  include/migration/register.h | 21 +++++++++++++++
>  migration/migration.c        |  6 +++++
>  migration/migration.h        |  3 +++
>  migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>  migration/savevm.h           |  4 +++
>  5 files changed, 86 insertions(+)
> 
> diff --git a/include/migration/register.h b/include/migration/register.h
> index 4a578f140713..44d8cf5192ae 100644
> --- a/include/migration/register.h
> +++ b/include/migration/register.h
> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>      int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>                               Error **errp);
>  
> +    /**
> +     * @load_finish
> +     *
> +     * Poll whether all asynchronous device state loading had finished.
> +     * Not called on the load failure path.
> +     *
> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> +     *
> +     * If this method signals "not ready" then it might not be called
> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> +     * while holding qemu_loadvm_load_finish_ready_lock.

[1]

> +     *
> +     * @opaque: data pointer passed to register_savevm_live()
> +     * @is_finished: whether the loading had finished (output parameter)
> +     * @errp: pointer to Error*, to store an error if it happens.
> +     *
> +     * Returns zero to indicate success and negative for error
> +     * It's not an error that the loading still hasn't finished.
> +     */
> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);

The load_finish() semantics is a bit weird, especially above [1] on "only
allowed to be called once if ..." and also on the locks.

It looks to me vfio_load_finish() also does the final load of the device.

I wonder whether that final load can be done in the threads, then after
everything loaded the device post a semaphore telling the main thread to
continue.  See e.g.:

    if (migrate_switchover_ack()) {
        qemu_loadvm_state_switchover_ack_needed(mis);
    }

IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
when all things are loaded?  We can then get rid of this slightly awkward
interface.  I had a feeling that things can be simplified (e.g., if the
thread will take care of loading the final vmstate then the mutex is also
not needed? etc.).
Maciej S. Szmigiero Sept. 19, 2024, 7:49 p.m. UTC | #5
On 9.09.2024 22:03, Peter Xu wrote:
> On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> load_finish SaveVMHandler allows migration code to poll whether
>> a device-specific asynchronous device state loading operation had finished.
>>
>> In order to avoid calling this handler needlessly the device is supposed
>> to notify the migration code of its possible readiness via a call to
>> qemu_loadvm_load_finish_ready_broadcast() while holding
>> qemu_loadvm_load_finish_ready_lock.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>>   include/migration/register.h | 21 +++++++++++++++
>>   migration/migration.c        |  6 +++++
>>   migration/migration.h        |  3 +++
>>   migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>>   migration/savevm.h           |  4 +++
>>   5 files changed, 86 insertions(+)
>>
>> diff --git a/include/migration/register.h b/include/migration/register.h
>> index 4a578f140713..44d8cf5192ae 100644
>> --- a/include/migration/register.h
>> +++ b/include/migration/register.h
>> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>>       int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>>                                Error **errp);
>>   
>> +    /**
>> +     * @load_finish
>> +     *
>> +     * Poll whether all asynchronous device state loading had finished.
>> +     * Not called on the load failure path.
>> +     *
>> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
>> +     *
>> +     * If this method signals "not ready" then it might not be called
>> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>> +     * while holding qemu_loadvm_load_finish_ready_lock.
> 
> [1]
> 
>> +     *
>> +     * @opaque: data pointer passed to register_savevm_live()
>> +     * @is_finished: whether the loading had finished (output parameter)
>> +     * @errp: pointer to Error*, to store an error if it happens.
>> +     *
>> +     * Returns zero to indicate success and negative for error
>> +     * It's not an error that the loading still hasn't finished.
>> +     */
>> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> 
> The load_finish() semantics is a bit weird, especially above [1] on "only
> allowed to be called once if ..." and also on the locks.

The point of this remark is that a driver needs to call
qemu_loadvm_load_finish_ready_broadcast() if it wants the migration
core to call its load_finish handler again.

> It looks to me vfio_load_finish() also does the final load of the device.
> 
> I wonder whether that final load can be done in the threads, 

Here, the problem is that the current VFIO VMState has to be loaded from the main
migration thread as it internally calls QEMU core address space modification
methods which explode if called from any other thread.

> then after
> everything loaded the device post a semaphore telling the main thread to
> continue.  See e.g.:
> 
>      if (migrate_switchover_ack()) {
>          qemu_loadvm_state_switchover_ack_needed(mis);
>      }
> 
> IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
> when all things are loaded?  We can then get rid of this slightly awkward
> interface.  I had a feeling that things can be simplified (e.g., if the
> thread will take care of loading the final vmstate then the mutex is also
> not needed? etc.).

With just a single call to switchover_ack_needed per VFIO device it would
need to do a blocking wait for the device buffers and config state load
to finish, therefore blocking other VFIO devices from potentially loading
their config state if they are ready to begin this operation earlier.

Thanks,
Maciej
Peter Xu Sept. 19, 2024, 9:11 p.m. UTC | #6
On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
> On 9.09.2024 22:03, Peter Xu wrote:
> > On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
> > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> > > 
> > > load_finish SaveVMHandler allows migration code to poll whether
> > > a device-specific asynchronous device state loading operation had finished.
> > > 
> > > In order to avoid calling this handler needlessly the device is supposed
> > > to notify the migration code of its possible readiness via a call to
> > > qemu_loadvm_load_finish_ready_broadcast() while holding
> > > qemu_loadvm_load_finish_ready_lock.
> > > 
> > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> > > ---
> > >   include/migration/register.h | 21 +++++++++++++++
> > >   migration/migration.c        |  6 +++++
> > >   migration/migration.h        |  3 +++
> > >   migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
> > >   migration/savevm.h           |  4 +++
> > >   5 files changed, 86 insertions(+)
> > > 
> > > diff --git a/include/migration/register.h b/include/migration/register.h
> > > index 4a578f140713..44d8cf5192ae 100644
> > > --- a/include/migration/register.h
> > > +++ b/include/migration/register.h
> > > @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
> > >       int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
> > >                                Error **errp);
> > > +    /**
> > > +     * @load_finish
> > > +     *
> > > +     * Poll whether all asynchronous device state loading had finished.
> > > +     * Not called on the load failure path.
> > > +     *
> > > +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> > > +     *
> > > +     * If this method signals "not ready" then it might not be called
> > > +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> > > +     * while holding qemu_loadvm_load_finish_ready_lock.
> > 
> > [1]
> > 
> > > +     *
> > > +     * @opaque: data pointer passed to register_savevm_live()
> > > +     * @is_finished: whether the loading had finished (output parameter)
> > > +     * @errp: pointer to Error*, to store an error if it happens.
> > > +     *
> > > +     * Returns zero to indicate success and negative for error
> > > +     * It's not an error that the loading still hasn't finished.
> > > +     */
> > > +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> > 
> > The load_finish() semantics is a bit weird, especially above [1] on "only
> > allowed to be called once if ..." and also on the locks.
> 
> The point of this remark is that a driver needs to call
> qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
> core to call its load_finish handler again.
> 
> > It looks to me vfio_load_finish() also does the final load of the device.
> > 
> > I wonder whether that final load can be done in the threads,
> 
> Here, the problem is that current VFIO VMState has to be loaded from the main
> migration thread as it internally calls QEMU core address space modification
> methods which explode if called from another thread(s).

Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
BQL if possible, when that's ready then in your case here IIUC you can
simply take BQL in whichever thread that loads it.. but yeah it's not ready
at least..

Would it be possible for vfio_save_complete_precopy_async_thread_config_state()
to be done in VFIO's save_live_complete_precopy() through the main channel
somehow?  IOW, does it rely on iterative data to be fetched first from
kernel, or completely separate states?  And just curious: how large is it
normally (and I suppose this decides whether it's applicable to be sent via
the main channel at all..)?

> 
> > then after
> > everything loaded the device post a semaphore telling the main thread to
> > continue.  See e.g.:
> > 
> >      if (migrate_switchover_ack()) {
> >          qemu_loadvm_state_switchover_ack_needed(mis);
> >      }
> > 
> > IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
> > when all things are loaded?  We can then get rid of this slightly awkward
> > interface.  I had a feeling that things can be simplified (e.g., if the
> > thread will take care of loading the final vmstate then the mutex is also
> > not needed? etc.).
> 
> With just a single call to switchover_ack_needed per VFIO device it would
> need to do a blocking wait for the device buffers and config state load
> to finish, therefore blocking other VFIO devices from potentially loading
> their config state if they are ready to begin this operation earlier.

I am not sure I get you here, loading VFIO device states (I mean, the
non-iterable part) will need to be done sequentially IIUC due to what you
said and should rely on BQL, so I don't know how that could happen
concurrently for now.  But I think indeed BQL is a problem.

So IMHO this recv side interface so far is the major pain that I really
want to avoid (compared to the rest) in the series.  Let's see whether we
can come up with something better..

One other (probably not pretty..) idea is when waiting here in the main
thread it yields BQL, then other threads can take it and load the VFIO
final chunk of data.  But I could miss something else.
Maciej S. Szmigiero Sept. 20, 2024, 3:23 p.m. UTC | #7
On 19.09.2024 23:11, Peter Xu wrote:
> On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
>> On 9.09.2024 22:03, Peter Xu wrote:
>>> On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>
>>>> load_finish SaveVMHandler allows migration code to poll whether
>>>> a device-specific asynchronous device state loading operation had finished.
>>>>
>>>> In order to avoid calling this handler needlessly the device is supposed
>>>> to notify the migration code of its possible readiness via a call to
>>>> qemu_loadvm_load_finish_ready_broadcast() while holding
>>>> qemu_loadvm_load_finish_ready_lock.
>>>>
>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>> ---
>>>>    include/migration/register.h | 21 +++++++++++++++
>>>>    migration/migration.c        |  6 +++++
>>>>    migration/migration.h        |  3 +++
>>>>    migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>>>>    migration/savevm.h           |  4 +++
>>>>    5 files changed, 86 insertions(+)
>>>>
>>>> diff --git a/include/migration/register.h b/include/migration/register.h
>>>> index 4a578f140713..44d8cf5192ae 100644
>>>> --- a/include/migration/register.h
>>>> +++ b/include/migration/register.h
>>>> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>>>>        int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>>>>                                 Error **errp);
>>>> +    /**
>>>> +     * @load_finish
>>>> +     *
>>>> +     * Poll whether all asynchronous device state loading had finished.
>>>> +     * Not called on the load failure path.
>>>> +     *
>>>> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
>>>> +     *
>>>> +     * If this method signals "not ready" then it might not be called
>>>> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>>>> +     * while holding qemu_loadvm_load_finish_ready_lock.
>>>
>>> [1]
>>>
>>>> +     *
>>>> +     * @opaque: data pointer passed to register_savevm_live()
>>>> +     * @is_finished: whether the loading had finished (output parameter)
>>>> +     * @errp: pointer to Error*, to store an error if it happens.
>>>> +     *
>>>> +     * Returns zero to indicate success and negative for error
>>>> +     * It's not an error that the loading still hasn't finished.
>>>> +     */
>>>> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
>>>
>>> The load_finish() semantics is a bit weird, especially above [1] on "only
>>> allowed to be called once if ..." and also on the locks.
>>
>> The point of this remark is that a driver needs to call
>> qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
>> core to call its load_finish handler again.
>>
>>> It looks to me vfio_load_finish() also does the final load of the device.
>>>
>>> I wonder whether that final load can be done in the threads,
>>
>> Here, the problem is that current VFIO VMState has to be loaded from the main
>> migration thread as it internally calls QEMU core address space modification
>> methods which explode if called from another thread(s).
> 
> Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
> BQL if possible, when that's ready then in your case here IIUC you can
> simply take BQL in whichever thread that loads it.. but yeah it's not ready
> at least..

Yeah, long term we might want to work on making these QEMU core address space
modification methods somehow callable from multiple threads but that's
definitely not something for the initial patch set.

> Would it be possible vfio_save_complete_precopy_async_thread_config_state()
> be done in VFIO's save_live_complete_precopy() through the main channel
> somehow?  IOW, does it rely on iterative data to be fetched first from
> kernel, or completely separate states? 

The device state data needs to be fully loaded first before "activating"
the device by loading its config state.

> And just curious: how large is it
> normally (and I suppose this decides whether it's applicable to be sent via
> the main channel at all..)?

Config data is *much* smaller than device state data - as far as I remember
it was on the order of kilobytes.

>>
>>> then after
>>> everything loaded the device post a semaphore telling the main thread to
>>> continue.  See e.g.:
>>>
>>>       if (migrate_switchover_ack()) {
>>>           qemu_loadvm_state_switchover_ack_needed(mis);
>>>       }
>>>
>>> IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
>>> when all things are loaded?  We can then get rid of this slightly awkward
>>> interface.  I had a feeling that things can be simplified (e.g., if the
>>> thread will take care of loading the final vmstate then the mutex is also
>>> not needed? etc.).
>>
>> With just a single call to switchover_ack_needed per VFIO device it would
>> need to do a blocking wait for the device buffers and config state load
>> to finish, therefore blocking other VFIO devices from potentially loading
>> their config state if they are ready to begin this operation earlier.
> 
> I am not sure I get you here, loading VFIO device states (I mean, the
> non-iterable part) will need to be done sequentially IIUC due to what you
> said and should rely on BQL, so I don't know how that could happen
> concurrently for now.  But I think indeed BQL is a problem.
Consider that we have two VFIO devices (A and B), with the following order
of switchover_ack_needed handler calls for them: first A gets this call,
and once the call for A finishes, B gets this call.

Now consider what happens if B had loaded all its buffers (in the loading
thread) and it is ready for its config load before A finished loading its
buffers.

B has to wait idle in this situation (even though it could have been already
loading its config) since the switchover_ack_needed handler for A won't
return until A is fully done.

> So IMHO this recv side interface so far is the major pain that I really
> want to avoid (comparing to the rest) in the series.  Let's see whether we
> can come up with something better..
> 
> One other (probably not pretty..) idea is when waiting here in the main
> thread it yields BQL, then other threads can take it and load the VFIO
> final chunk of data.  But I could miss something else.
> 

I think temporarily dropping the BQL deep inside migration code is similar
to running the QEMU event loop deep inside migration code (about which
people complained in my generic thread pool implementation): it's easy
to miss some subtle dependency/race somewhere and accidentally cause a rare,
hard-to-debug deadlock.

That's why I think that it's ultimately probably better to make QEMU core
address space modification methods thread safe / re-entrant instead.

Thanks,
Maciej
Peter Xu Sept. 20, 2024, 4:45 p.m. UTC | #8
On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
> On 19.09.2024 23:11, Peter Xu wrote:
> > On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
> > > On 9.09.2024 22:03, Peter Xu wrote:
> > > > On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
> > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> > > > > 
> > > > > load_finish SaveVMHandler allows migration code to poll whether
> > > > > a device-specific asynchronous device state loading operation had finished.
> > > > > 
> > > > > In order to avoid calling this handler needlessly the device is supposed
> > > > > to notify the migration code of its possible readiness via a call to
> > > > > qemu_loadvm_load_finish_ready_broadcast() while holding
> > > > > qemu_loadvm_load_finish_ready_lock.
> > > > > 
> > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> > > > > ---
> > > > >    include/migration/register.h | 21 +++++++++++++++
> > > > >    migration/migration.c        |  6 +++++
> > > > >    migration/migration.h        |  3 +++
> > > > >    migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
> > > > >    migration/savevm.h           |  4 +++
> > > > >    5 files changed, 86 insertions(+)
> > > > > 
> > > > > diff --git a/include/migration/register.h b/include/migration/register.h
> > > > > index 4a578f140713..44d8cf5192ae 100644
> > > > > --- a/include/migration/register.h
> > > > > +++ b/include/migration/register.h
> > > > > @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
> > > > >        int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
> > > > >                                 Error **errp);
> > > > > +    /**
> > > > > +     * @load_finish
> > > > > +     *
> > > > > +     * Poll whether all asynchronous device state loading had finished.
> > > > > +     * Not called on the load failure path.
> > > > > +     *
> > > > > +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> > > > > +     *
> > > > > +     * If this method signals "not ready" then it might not be called
> > > > > +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> > > > > +     * while holding qemu_loadvm_load_finish_ready_lock.
> > > > 
> > > > [1]
> > > > 
> > > > > +     *
> > > > > +     * @opaque: data pointer passed to register_savevm_live()
> > > > > +     * @is_finished: whether the loading had finished (output parameter)
> > > > > +     * @errp: pointer to Error*, to store an error if it happens.
> > > > > +     *
> > > > > +     * Returns zero to indicate success and negative for error
> > > > > +     * It's not an error that the loading still hasn't finished.
> > > > > +     */
> > > > > +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> > > > 
> > > > The load_finish() semantics is a bit weird, especially above [1] on "only
> > > > allowed to be called once if ..." and also on the locks.
> > > 
> > > The point of this remark is that a driver needs to call
> > > qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
> > > core to call its load_finish handler again.
> > > 
> > > > It looks to me vfio_load_finish() also does the final load of the device.
> > > > 
> > > > I wonder whether that final load can be done in the threads,
> > > 
> > > Here, the problem is that current VFIO VMState has to be loaded from the main
> > > migration thread as it internally calls QEMU core address space modification
> > > methods which explode if called from another thread(s).
> > 
> > Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
> > BQL if possible, when that's ready then in your case here IIUC you can
> > simply take BQL in whichever thread that loads it.. but yeah it's not ready
> > at least..
> 
> Yeah, long term we might want to work on making these QEMU core address space
> modification methods somehow callable from multiple threads but that's
> definitely not something for the initial patch set.
> 
> > Would it be possible vfio_save_complete_precopy_async_thread_config_state()
> > be done in VFIO's save_live_complete_precopy() through the main channel
> > somehow?  IOW, does it rely on iterative data to be fetched first from
> > kernel, or completely separate states?
> 
> The device state data needs to be fully loaded first before "activating"
> the device by loading its config state.
> 
> > And just curious: how large is it
> > normally (and I suppose this decides whether it's applicable to be sent via
> > the main channel at all..)?
> 
> Config data is *much* smaller than device state data - as far as I remember
> it was on order of kilobytes.
> 
> > > 
> > > > then after
> > > > everything loaded the device post a semaphore telling the main thread to
> > > > continue.  See e.g.:
> > > > 
> > > >       if (migrate_switchover_ack()) {
> > > >           qemu_loadvm_state_switchover_ack_needed(mis);
> > > >       }
> > > > 
> > > > IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
> > > > when all things are loaded?  We can then get rid of this slightly awkward
> > > > interface.  I had a feeling that things can be simplified (e.g., if the
> > > > thread will take care of loading the final vmstate then the mutex is also
> > > > not needed? etc.).
> > > 
> > > With just a single call to switchover_ack_needed per VFIO device it would
> > > need to do a blocking wait for the device buffers and config state load
> > > to finish, therefore blocking other VFIO devices from potentially loading
> > > their config state if they are ready to begin this operation earlier.
> > 
> > I am not sure I get you here, loading VFIO device states (I mean, the
> > non-iterable part) will need to be done sequentially IIUC due to what you
> > said and should rely on BQL, so I don't know how that could happen
> > concurrently for now.  But I think indeed BQL is a problem.
> Consider that we have two VFIO devices (A and B), with the following order
> of switchover_ack_needed handler calls for them: first A get this call,
> once the call for A finishes then B gets this call.
> 
> Now consider what happens if B had loaded all its buffers (in the loading
> thread) and it is ready for its config load before A finished loading its
> buffers.
> 
> B has to wait idle in this situation (even though it could have been already
> loading its config) since the switchover_ack_needed handler for A won't
> return until A is fully done.

This sounds like a performance concern, and I wonder how much this impacts
the real workload (that you run a test and measure, with/without such
concurrency) when we can save two devices in parallel anyway; I would
expect the real diff is small due to the fact I mentioned that we save >1
VFIO devices concurrently via multifd.

Do you think we can start with a simpler approach?

So what I'm thinking could be very clean is this: we just discussed
MIG_CMD_SWITCHOVER and it looks like you also think it's an OK approach.  I
wonder, with that in place, why we don't move one step further and have
MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
ready to send non-iterable".  It can be controlled by the same migration
property so we only send these two flags in 9.2+ machine types.

Then IIUC VFIO can send config data through main wire (just like most of
other pci devices! which is IMHO a good fit..) and on destination VFIO
holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.

Side note: when looking again, I really think we should clean up some
migration switchover phase functions, e.g. I think the
qemu_savevm_state_complete_precopy() parameters are pretty confusing,
especially iterable_only, even though inside it also has some implicit
postcopy checks, urgh.. but this is not relevant to our discussion, and I
won't draft that before your series lands; that can complicate stuff.

> 
> > So IMHO this recv side interface so far is the major pain that I really
> > want to avoid (comparing to the rest) in the series.  Let's see whether we
> > can come up with something better..
> > 
> > One other (probably not pretty..) idea is when waiting here in the main
> > thread it yields BQL, then other threads can take it and load the VFIO
> > final chunk of data.  But I could miss something else.
> > 
> 
> I think temporary dropping BQL deep inside migration code is similar
> to running QEMU event loop deep inside migration code (about which
> people complained in my generic thread pool implementation): it's easy
> to miss some subtle dependency/race somewhere and accidentally cause rare
> hard to debug deadlock.
> 
> That's why I think that it's ultimately probably better to make QEMU core
> address space modification methods thread safe / re-entrant instead.

Right, let's see how you think about above.

Thanks,
Maciej S. Szmigiero Sept. 26, 2024, 10:34 p.m. UTC | #9
On 20.09.2024 18:45, Peter Xu wrote:
> On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
>> On 19.09.2024 23:11, Peter Xu wrote:
>>> On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
>>>> On 9.09.2024 22:03, Peter Xu wrote:
>>>>> On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
>>>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>>>
>>>>>> load_finish SaveVMHandler allows migration code to poll whether
>>>>>> a device-specific asynchronous device state loading operation had finished.
>>>>>>
>>>>>> In order to avoid calling this handler needlessly the device is supposed
>>>>>> to notify the migration code of its possible readiness via a call to
>>>>>> qemu_loadvm_load_finish_ready_broadcast() while holding
>>>>>> qemu_loadvm_load_finish_ready_lock.
>>>>>>
>>>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>>>> ---
>>>>>>     include/migration/register.h | 21 +++++++++++++++
>>>>>>     migration/migration.c        |  6 +++++
>>>>>>     migration/migration.h        |  3 +++
>>>>>>     migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>>>>>>     migration/savevm.h           |  4 +++
>>>>>>     5 files changed, 86 insertions(+)
>>>>>>
>>>>>> diff --git a/include/migration/register.h b/include/migration/register.h
>>>>>> index 4a578f140713..44d8cf5192ae 100644
>>>>>> --- a/include/migration/register.h
>>>>>> +++ b/include/migration/register.h
>>>>>> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>>>>>>         int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>>>>>>                                  Error **errp);
>>>>>> +    /**
>>>>>> +     * @load_finish
>>>>>> +     *
>>>>>> +     * Poll whether all asynchronous device state loading had finished.
>>>>>> +     * Not called on the load failure path.
>>>>>> +     *
>>>>>> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
>>>>>> +     *
>>>>>> +     * If this method signals "not ready" then it might not be called
>>>>>> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>>>>>> +     * while holding qemu_loadvm_load_finish_ready_lock.
>>>>>
>>>>> [1]
>>>>>
>>>>>> +     *
>>>>>> +     * @opaque: data pointer passed to register_savevm_live()
>>>>>> +     * @is_finished: whether the loading had finished (output parameter)
>>>>>> +     * @errp: pointer to Error*, to store an error if it happens.
>>>>>> +     *
>>>>>> +     * Returns zero to indicate success and negative for error
>>>>>> +     * It's not an error that the loading still hasn't finished.
>>>>>> +     */
>>>>>> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
>>>>>
>>>>> The load_finish() semantics is a bit weird, especially above [1] on "only
>>>>> allowed to be called once if ..." and also on the locks.
>>>>
>>>> The point of this remark is that a driver needs to call
>>>> qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
>>>> core to call its load_finish handler again.
>>>>
>>>>> It looks to me vfio_load_finish() also does the final load of the device.
>>>>>
>>>>> I wonder whether that final load can be done in the threads,
>>>>
>>>> Here, the problem is that current VFIO VMState has to be loaded from the main
>>>> migration thread as it internally calls QEMU core address space modification
>>>> methods which explode if called from another thread(s).
>>>
>>> Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
>>> BQL if possible, when that's ready then in your case here IIUC you can
>>> simply take BQL in whichever thread that loads it.. but yeah it's not ready
>>> at least..
>>
>> Yeah, long term we might want to work on making these QEMU core address space
>> modification methods somehow callable from multiple threads but that's
>> definitely not something for the initial patch set.
>>
>>> Would it be possible vfio_save_complete_precopy_async_thread_config_state()
>>> be done in VFIO's save_live_complete_precopy() through the main channel
>>> somehow?  IOW, does it rely on iterative data to be fetched first from
>>> kernel, or completely separate states?
>>
>> The device state data needs to be fully loaded first before "activating"
>> the device by loading its config state.
>>
>>> And just curious: how large is it
>>> normally (and I suppose this decides whether it's applicable to be sent via
>>> the main channel at all..)?
>>
>> Config data is *much* smaller than device state data - as far as I remember
>> it was on order of kilobytes.
>>
>>>>
>>>>> then after
>>>>> everything loaded the device post a semaphore telling the main thread to
>>>>> continue.  See e.g.:
>>>>>
>>>>>        if (migrate_switchover_ack()) {
>>>>>            qemu_loadvm_state_switchover_ack_needed(mis);
>>>>>        }
>>>>>
>>>>> IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
>>>>> when all things are loaded?  We can then get rid of this slightly awkward
>>>>> interface.  I had a feeling that things can be simplified (e.g., if the
>>>>> thread will take care of loading the final vmstate then the mutex is also
>>>>> not needed? etc.).
>>>>
>>>> With just a single call to switchover_ack_needed per VFIO device it would
>>>> need to do a blocking wait for the device buffers and config state load
>>>> to finish, therefore blocking other VFIO devices from potentially loading
>>>> their config state if they are ready to begin this operation earlier.
>>>
>>> I am not sure I get you here, loading VFIO device states (I mean, the
>>> non-iterable part) will need to be done sequentially IIUC due to what you
>>> said and should rely on BQL, so I don't know how that could happen
>>> concurrently for now.  But I think indeed BQL is a problem.
>> Consider that we have two VFIO devices (A and B), with the following order
>> of switchover_ack_needed handler calls for them: first A get this call,
>> once the call for A finishes then B gets this call.
>>
>> Now consider what happens if B had loaded all its buffers (in the loading
>> thread) and it is ready for its config load before A finished loading its
>> buffers.
>>
>> B has to wait idle in this situation (even though it could have been already
>> loading its config) since the switchover_ack_needed handler for A won't
>> return until A is fully done.
> 
> This sounds like a performance concern, and I wonder how much this impacts
> the real workload (that you run a test and measure, with/without such
> concurrency) when we can save two devices in parallel anyway; I would
> expect the real diff is small due to the fact I mentioned that we save >1
> VFIO devices concurrently via multifd.
> 
> Do you think we can start with a simpler approach?

I don't think introducing a performance/scalability issue like that is
a good thing, especially given that we already have a design that avoids it.

Unfortunately, my current setup does not allow live migrating VMs with
more than 4 VFs so I can't benchmark that.

But I am almost certain that with more VFs the situation with devices being
ready out-of-order will become even more likely.

> So what I'm thinking could be very clean is, we just discussed about
> MIG_CMD_SWITCHOVER and looks like you also think it's an OK approach.  I
> wonder when with it why not we move one step further to have
> MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
> ready to send non-iterable".  It can be controlled by the same migration
> property so we only send these two flags in 9.2+ machine types.
> 
> Then IIUC VFIO can send config data through main wire (just like most of
> other pci devices! which is IMHO a good fit..) and on destination VFIO
> holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.

Starting the config load only on MIG_CMD_SEND_NON_ITERABE would (in addition
to the considerations above) also delay starting the config load until all
iterable devices were read/transferred/loaded and also would complicate
future efforts at loading that config data in parallel.

> 
> Thanks,
> 

Thanks,
Maciej
Peter Xu Sept. 27, 2024, 12:53 a.m. UTC | #10
On Fri, Sep 27, 2024 at 12:34:31AM +0200, Maciej S. Szmigiero wrote:
> On 20.09.2024 18:45, Peter Xu wrote:
> > On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
> > > On 19.09.2024 23:11, Peter Xu wrote:
> > > > On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
> > > > > On 9.09.2024 22:03, Peter Xu wrote:
> > > > > > On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> > > > > > > 
> > > > > > > load_finish SaveVMHandler allows migration code to poll whether
> > > > > > > a device-specific asynchronous device state loading operation had finished.
> > > > > > > 
> > > > > > > In order to avoid calling this handler needlessly the device is supposed
> > > > > > > to notify the migration code of its possible readiness via a call to
> > > > > > > qemu_loadvm_load_finish_ready_broadcast() while holding
> > > > > > > qemu_loadvm_load_finish_ready_lock.
> > > > > > > 
> > > > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> > > > > > > ---
> > > > > > >     include/migration/register.h | 21 +++++++++++++++
> > > > > > >     migration/migration.c        |  6 +++++
> > > > > > >     migration/migration.h        |  3 +++
> > > > > > >     migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
> > > > > > >     migration/savevm.h           |  4 +++
> > > > > > >     5 files changed, 86 insertions(+)
> > > > > > > 
> > > > > > > diff --git a/include/migration/register.h b/include/migration/register.h
> > > > > > > index 4a578f140713..44d8cf5192ae 100644
> > > > > > > --- a/include/migration/register.h
> > > > > > > +++ b/include/migration/register.h
> > > > > > > @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
> > > > > > >         int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
> > > > > > >                                  Error **errp);
> > > > > > > +    /**
> > > > > > > +     * @load_finish
> > > > > > > +     *
> > > > > > > +     * Poll whether all asynchronous device state loading had finished.
> > > > > > > +     * Not called on the load failure path.
> > > > > > > +     *
> > > > > > > +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> > > > > > > +     *
> > > > > > > +     * If this method signals "not ready" then it might not be called
> > > > > > > +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> > > > > > > +     * while holding qemu_loadvm_load_finish_ready_lock.
> > > > > > 
> > > > > > [1]
> > > > > > 
> > > > > > > +     *
> > > > > > > +     * @opaque: data pointer passed to register_savevm_live()
> > > > > > > +     * @is_finished: whether the loading had finished (output parameter)
> > > > > > > +     * @errp: pointer to Error*, to store an error if it happens.
> > > > > > > +     *
> > > > > > > +     * Returns zero to indicate success and negative for error
> > > > > > > +     * It's not an error that the loading still hasn't finished.
> > > > > > > +     */
> > > > > > > +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> > > > > > 
> > > > > > The load_finish() semantics is a bit weird, especially above [1] on "only
> > > > > > allowed to be called once if ..." and also on the locks.
> > > > > 
> > > > > The point of this remark is that a driver needs to call
> > > > > qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
> > > > > core to call its load_finish handler again.
> > > > > 
> > > > > > It looks to me vfio_load_finish() also does the final load of the device.
> > > > > > 
> > > > > > I wonder whether that final load can be done in the threads,
> > > > > 
> > > > > Here, the problem is that current VFIO VMState has to be loaded from the main
> > > > > migration thread as it internally calls QEMU core address space modification
> > > > > methods which explode if called from another thread(s).
> > > > 
> > > > Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
> > > > BQL if possible, when that's ready then in your case here IIUC you can
> > > > simply take BQL in whichever thread that loads it.. but yeah it's not ready
> > > > at least..
> > > 
> > > Yeah, long term we might want to work on making these QEMU core address space
> > > modification methods somehow callable from multiple threads but that's
> > > definitely not something for the initial patch set.
> > > 
> > > > Would it be possible vfio_save_complete_precopy_async_thread_config_state()
> > > > be done in VFIO's save_live_complete_precopy() through the main channel
> > > > somehow?  IOW, does it rely on iterative data to be fetched first from
> > > > kernel, or completely separate states?
> > > 
> > > The device state data needs to be fully loaded first before "activating"
> > > the device by loading its config state.
> > > 
> > > > And just curious: how large is it
> > > > normally (and I suppose this decides whether it's applicable to be sent via
> > > > the main channel at all..)?
> > > 
> > > Config data is *much* smaller than device state data - as far as I remember
> > > it was on order of kilobytes.
> > > 
> > > > > 
> > > > > > then after
> > > > > > everything loaded the device post a semaphore telling the main thread to
> > > > > > continue.  See e.g.:
> > > > > > 
> > > > > >        if (migrate_switchover_ack()) {
> > > > > >            qemu_loadvm_state_switchover_ack_needed(mis);
> > > > > >        }
> > > > > > 
> > > > > > IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
> > > > > > when all things are loaded?  We can then get rid of this slightly awkward
> > > > > > interface.  I had a feeling that things can be simplified (e.g., if the
> > > > > > thread will take care of loading the final vmstate then the mutex is also
> > > > > > not needed? etc.).
> > > > > 
> > > > > With just a single call to switchover_ack_needed per VFIO device it would
> > > > > need to do a blocking wait for the device buffers and config state load
> > > > > to finish, therefore blocking other VFIO devices from potentially loading
> > > > > their config state if they are ready to begin this operation earlier.
> > > > 
> > > > I am not sure I get you here, loading VFIO device states (I mean, the
> > > > non-iterable part) will need to be done sequentially IIUC due to what you
> > > > said and should rely on BQL, so I don't know how that could happen
> > > > concurrently for now.  But I think indeed BQL is a problem.
> > > Consider that we have two VFIO devices (A and B), with the following order
> > > of switchover_ack_needed handler calls for them: first A get this call,
> > > once the call for A finishes then B gets this call.
> > > 
> > > Now consider what happens if B had loaded all its buffers (in the loading
> > > thread) and it is ready for its config load before A finished loading its
> > > buffers.
> > > 
> > > B has to wait idle in this situation (even though it could have been already
> > > loading its config) since the switchover_ack_needed handler for A won't
> > > return until A is fully done.
> > 
> > This sounds like a performance concern, and I wonder how much this impacts
> > the real workload (that you run a test and measure, with/without such
> > concurrency) when we can save two devices in parallel anyway; I would
> > expect the real diff is small due to the fact I mentioned that we save >1
> > VFIO devices concurrently via multifd.
> > 
> > Do you think we can start with a simpler approach?
> 
> I don't think introducing a performance/scalability issue like that is
> a good thing, especially that we already have a design that avoids it.
> 
> Unfortunately, my current setup does not allow live migrating VMs with
> more than 4 VFs so I can't benchmark that.

/me wonders why benchmarking it requires more than 4 VFs.

> 
> But I almost certain that with more VFs the situation with devices being
> ready out-of-order will get even more likely.

If the config space is small, why would loading it in sequence be a
problem?

Have you measured how much time it needs to load one VF's config space that
you're using?  I suppose that's vfio_load_device_config_state() alone?

> 
> > So what I'm thinking could be very clean is, we just discussed about
> > MIG_CMD_SWITCHOVER and looks like you also think it's an OK approach.  I
> > wonder when with it why not we move one step further to have
> > MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
> > ready to send non-iterable".  It can be controlled by the same migration
> > property so we only send these two flags in 9.2+ machine types.
> > 
> > Then IIUC VFIO can send config data through main wire (just like most of
> > other pci devices! which is IMHO a good fit..) and on destination VFIO
> > holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.
> 
> Starting the config load only on MIG_CMD_SEND_NON_ITERABE would (in addition
> to the considerations above) also delay starting the config load until all
> iterable devices were read/transferred/loaded and also would complicate
> future efforts at loading that config data in parallel.

However, I wonder whether we can keep it simple in that VFIO's config space
is still always saved in vfio_save_state().  I still think it's easier if we
stick with the main channel whenever possible.  For this specific case, if
the config space is small I think it's tricky that you bypass this with:

    if (migration->multifd_transfer) {
        /* Emit dummy NOP data */
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
        return;
    }

Then squash this as the tail of the iterable data.

On the src, I think it could use a per-device semaphore, so that the iterable
save() thread will post() only once it finishes dumping all the data; that
then orders VFIO iterable data vs. config space save().

On the dst, after a 2nd thought, MIG_CMD_SEND_NON_ITERABE may not work or
needed indeed, because multifd bypasses the main channel, so if we send
anything like MIG_CMD_SEND_NON_ITERABE on the main channel it won't
guarantee multifd load all complete.  However IIUC that can be used in a
similar way as the src qemu I mentioned above with a per-device semaphore,
so that only all the iterable data of this device loaded and applied to the
HW would it post(), before that, vfio_load_state() should wait() on that
sem waiting for data to be ready (while multifd threads will be doing that
part).  I wonder whether we may reuse the multifd recv thread in the
initial version, so maybe we don't need any other threads on destination.

The load_finish() interface is currently not able to be reused right,
afaict.  Just have a look at its definition:

    /**
     * @load_finish
     *
     * Poll whether all asynchronous device state loading had finished.
     * Not called on the load failure path.
     *
     * Called while holding the qemu_loadvm_load_finish_ready_lock.
     *
     * If this method signals "not ready" then it might not be called
     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
     * while holding qemu_loadvm_load_finish_ready_lock.
     *
     * @opaque: data pointer passed to register_savevm_live()
     * @is_finished: whether the loading had finished (output parameter)
     * @errp: pointer to Error*, to store an error if it happens.
     *
     * Returns zero to indicate success and negative for error
     * It's not an error that the loading still hasn't finished.
     */
    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);

It's over complicated on defining all its details:

  - Not re-entrant by default.. this is so weirdly designed so that the
    caller needs to know which is even the "1st invocation of the
    function"... It is just weird.

  - Requires one more global mutex that non vmstate handler ever requested,
    that I feel like perhaps can be replaced by a sem (then to drop the
    condvar)?

  - How qemu_loadvm_load_finish_ready_broadcast() interacts with all
    above..

So if you really think it matters to load whatever VFIO device whose
iterable data is ready first, then let's try to come up with some better
interface..  I can try to think about it too, but please answer my
questions above so I can understand what I am missing on why that's
important.  Numbers could help, even if 4 VF and I wonder how much diff
there can be.  Mostly, I don't know why it's slow right now if it is; I
thought it should be pretty fast, at least not a concern in VFIO migration
world (which can take seconds of downtime or more..).

IOW, it sounds more reasonable to me that no matter whether vfio will
support multifd, it'll be nice we stick with vfio_load_state() /
vfio_save_state() for config space, and hopefully it's also easier it
always go via the main channel to everyone.  In these two hooks, VFIO can
do whatever it wants to sync with other things (on src, sync with
concurrent thread pool saving iterable data and dumping things to multifd
channels; on dst, sync with multifd concurrent loads). I think it can
remove the requirement on the load_finish() interface completely.  Yes,
this can only load VFIO's pci config space one by one, but I think this is
much simpler, and I hope it's also not that slow, but I'm not sure.

Thanks,
Maciej S. Szmigiero Sept. 30, 2024, 7:25 p.m. UTC | #11
On 27.09.2024 02:53, Peter Xu wrote:
> On Fri, Sep 27, 2024 at 12:34:31AM +0200, Maciej S. Szmigiero wrote:
>> On 20.09.2024 18:45, Peter Xu wrote:
>>> On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
>>>> On 19.09.2024 23:11, Peter Xu wrote:
>>>>> On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
>>>>>> On 9.09.2024 22:03, Peter Xu wrote:
>>>>>>> On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>>>>>
>>>>>>>> load_finish SaveVMHandler allows migration code to poll whether
>>>>>>>> a device-specific asynchronous device state loading operation had finished.
>>>>>>>>
>>>>>>>> In order to avoid calling this handler needlessly the device is supposed
>>>>>>>> to notify the migration code of its possible readiness via a call to
>>>>>>>> qemu_loadvm_load_finish_ready_broadcast() while holding
>>>>>>>> qemu_loadvm_load_finish_ready_lock.
>>>>>>>>
>>>>>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>>>>>> ---
>>>>>>>>      include/migration/register.h | 21 +++++++++++++++
>>>>>>>>      migration/migration.c        |  6 +++++
>>>>>>>>      migration/migration.h        |  3 +++
>>>>>>>>      migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>>>>>>>>      migration/savevm.h           |  4 +++
>>>>>>>>      5 files changed, 86 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/include/migration/register.h b/include/migration/register.h
>>>>>>>> index 4a578f140713..44d8cf5192ae 100644
>>>>>>>> --- a/include/migration/register.h
>>>>>>>> +++ b/include/migration/register.h
>>>>>>>> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>>>>>>>>          int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>>>>>>>>                                   Error **errp);
>>>>>>>> +    /**
>>>>>>>> +     * @load_finish
>>>>>>>> +     *
>>>>>>>> +     * Poll whether all asynchronous device state loading had finished.
>>>>>>>> +     * Not called on the load failure path.
>>>>>>>> +     *
>>>>>>>> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
>>>>>>>> +     *
>>>>>>>> +     * If this method signals "not ready" then it might not be called
>>>>>>>> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>>>>>>>> +     * while holding qemu_loadvm_load_finish_ready_lock.
>>>>>>>
>>>>>>> [1]
>>>>>>>
>>>>>>>> +     *
>>>>>>>> +     * @opaque: data pointer passed to register_savevm_live()
>>>>>>>> +     * @is_finished: whether the loading had finished (output parameter)
>>>>>>>> +     * @errp: pointer to Error*, to store an error if it happens.
>>>>>>>> +     *
>>>>>>>> +     * Returns zero to indicate success and negative for error
>>>>>>>> +     * It's not an error that the loading still hasn't finished.
>>>>>>>> +     */
>>>>>>>> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
>>>>>>>
>>>>>>> The load_finish() semantics is a bit weird, especially above [1] on "only
>>>>>>> allowed to be called once if ..." and also on the locks.
>>>>>>
>>>>>> The point of this remark is that a driver needs to call
>>>>>> qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
>>>>>> core to call its load_finish handler again.
>>>>>>
>>>>>>> It looks to me vfio_load_finish() also does the final load of the device.
>>>>>>>
>>>>>>> I wonder whether that final load can be done in the threads,
>>>>>>
>>>>>> Here, the problem is that current VFIO VMState has to be loaded from the main
>>>>>> migration thread as it internally calls QEMU core address space modification
>>>>>> methods which explode if called from another thread(s).
>>>>>
>>>>> Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
>>>>> BQL if possible, when that's ready then in your case here IIUC you can
>>>>> simply take BQL in whichever thread that loads it.. but yeah it's not ready
>>>>> at least..
>>>>
>>>> Yeah, long term we might want to work on making these QEMU core address space
>>>> modification methods somehow callable from multiple threads but that's
>>>> definitely not something for the initial patch set.
>>>>
>>>>> Would it be possible vfio_save_complete_precopy_async_thread_config_state()
>>>>> be done in VFIO's save_live_complete_precopy() through the main channel
>>>>> somehow?  IOW, does it rely on iterative data to be fetched first from
>>>>> kernel, or completely separate states?
>>>>
>>>> The device state data needs to be fully loaded first before "activating"
>>>> the device by loading its config state.
>>>>
>>>>> And just curious: how large is it
>>>>> normally (and I suppose this decides whether it's applicable to be sent via
>>>>> the main channel at all..)?
>>>>
>>>> Config data is *much* smaller than device state data - as far as I remember
>>>> it was on order of kilobytes.
>>>>
>>>>>>
>>>>>>> then after
>>>>>>> everything loaded the device post a semaphore telling the main thread to
>>>>>>> continue.  See e.g.:
>>>>>>>
>>>>>>>         if (migrate_switchover_ack()) {
>>>>>>>             qemu_loadvm_state_switchover_ack_needed(mis);
>>>>>>>         }
>>>>>>>
>>>>>>> IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
>>>>>>> when all things are loaded?  We can then get rid of this slightly awkward
>>>>>>> interface.  I had a feeling that things can be simplified (e.g., if the
>>>>>>> thread will take care of loading the final vmstate then the mutex is also
>>>>>>> not needed? etc.).
>>>>>>
>>>>>> With just a single call to switchover_ack_needed per VFIO device it would
>>>>>> need to do a blocking wait for the device buffers and config state load
>>>>>> to finish, therefore blocking other VFIO devices from potentially loading
>>>>>> their config state if they are ready to begin this operation earlier.
>>>>>
>>>>> I am not sure I get you here, loading VFIO device states (I mean, the
>>>>> non-iterable part) will need to be done sequentially IIUC due to what you
>>>>> said and should rely on BQL, so I don't know how that could happen
>>>>> concurrently for now.  But I think indeed BQL is a problem.
>>>> Consider that we have two VFIO devices (A and B), with the following order
>>>> of switchover_ack_needed handler calls for them: first A get this call,
>>>> once the call for A finishes then B gets this call.
>>>>
>>>> Now consider what happens if B had loaded all its buffers (in the loading
>>>> thread) and it is ready for its config load before A finished loading its
>>>> buffers.
>>>>
>>>> B has to wait idle in this situation (even though it could have been already
>>>> loading its config) since the switchover_ack_needed handler for A won't
>>>> return until A is fully done.
>>>
>>> This sounds like a performance concern, and I wonder how much this impacts
>>> the real workload (that you run a test and measure, with/without such
>>> concurrency) when we can save two devices in parallel anyway; I would
>>> expect the real diff is small due to the fact I mentioned that we save >1
>>> VFIO devices concurrently via multifd.
>>>
>>> Do you think we can start with a simpler approach?
>>
>> I don't think introducing a performance/scalability issue like that is
>> a good thing, especially that we already have a design that avoids it.
>>
>> Unfortunately, my current setup does not allow live migrating VMs with
>> more than 4 VFs so I can't benchmark that.
> 
> /me wonders why benchmarking it requires more than 4 VFs.

My point here was that the scalability problem will most likely get more
pronounced with more VFs.

>>
>> But I almost certain that with more VFs the situation with devices being
>> ready out-of-order will get even more likely.
> 
> If the config space is small, why loading it in sequence would be a
> problem?
> 
> Have you measured how much time it needs to load one VF's config space that
> you're using?  I suppose that's vfio_load_device_config_state() alone?

It's not the amount of data to load that matters here but that these address
space operations are slow.

The whole config load takes ~70 ms per device - that's the time equivalent
of transferring 875 MiB of device state via a 100 GBit/s link.

>>
>>> So what I'm thinking could be very clean is, we just discussed about
>>> MIG_CMD_SWITCHOVER and looks like you also think it's an OK approach.  I
>>> wonder when with it why not we move one step further to have
>>> MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
>>> ready to send non-iterable".  It can be controlled by the same migration
>>> property so we only send these two flags in 9.2+ machine types.
>>>
>>> Then IIUC VFIO can send config data through main wire (just like most of
>>> other pci devices! which is IMHO a good fit..) and on destination VFIO
>>> holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.
>>
>> Starting the config load only on MIG_CMD_SEND_NON_ITERABE would (in addition
>> to the considerations above) also delay starting the config load until all
>> iterable devices were read/transferred/loaded and also would complicate
>> future efforts at loading that config data in parallel.
> 
> However I wonder whether we can keep it simple in that VFIO's config space
> is still always saved in vfio_save_state().  I still think it's easier we
> stick with the main channel whenever possible.  For this specific case, if
> the config space is small I think it's tricky you bypass this with:
> 
>      if (migration->multifd_transfer) {
>          /* Emit dummy NOP data */
>          qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>          return;
>      }
> 
> Then squash this as the tail of the iterable data.
> 
> On the src, I think it could use a per-device semaphore, so that iterable
> save() thread will post() only if it finishes dumping all the data, then
> that orders VFIO iterable data v.s. config space save().

In the future we want to not only transfer but also load the config data
in parallel.

So going back to transferring this data serialized via the main migration
channel would be taking a step back here.

By the way, we already have a serialization point in
qemu_savevm_state_complete_precopy_iterable() after iterables have been sent -
waiting for device state sending threads to finish their work.

Whether this thread_pool_wait() operation will be implemented using
semaphores I'm not sure yet - will depend on how well this will fit other
GThreadPool internals.

> On the dst, after a 2nd thought, MIG_CMD_SEND_NON_ITERABE may not work or
> needed indeed, because multifd bypasses the main channel, so if we send
> anything like MIG_CMD_SEND_NON_ITERABE on the main channel it won't
> guarantee multifd load all complete.  However IIUC that can be used in a
> similar way as the src qemu I mentioned above with a per-device semaphore,
> so that only all the iterable data of this device loaded and applied to the
> HW would it post(), before that, vfio_load_state() should wait() on that
> sem waiting for data to ready (while multifd threads will be doing that
> part).  I wonder whether we may reuse the multifd recv thread in the
> initial version, so maybe we don't need any other threads on destination.
> 
> The load_finish() interface is currently not able to be reused right,
> afaict.  Just have a look at its definition:
> 
>      /**
>       * @load_finish
>       *
>       * Poll whether all asynchronous device state loading had finished.
>       * Not called on the load failure path.
>       *
>       * Called while holding the qemu_loadvm_load_finish_ready_lock.
>       *
>       * If this method signals "not ready" then it might not be called
>       * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>       * while holding qemu_loadvm_load_finish_ready_lock.
>       *
>       * @opaque: data pointer passed to register_savevm_live()
>       * @is_finished: whether the loading had finished (output parameter)
>       * @errp: pointer to Error*, to store an error if it happens.
>       *
>       * Returns zero to indicate success and negative for error
>       * It's not an error that the loading still hasn't finished.
>       */
>      int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> 
> It's over complicated on defining all its details:
> 
>    - Not re-entrant by default.. 

What do you mean by "re-entrant" here?

This handler is called only from a single migration thread, so it cannot
be re-entered anyway since the control doesn't return to the migration
code until this handler exits (and obviously the handler won't call
itself recursively).

> this is so weirdly designed so that the
>      caller needs to know which is even the "1st invocation of the
>      function"... It is just weird.

I don't quite understand that - why do you think that caller needs to
know whether this is the "1st invocation of the function"?

The caller only tracks whether all these handlers reported that they
finished their work:
>       bool all_ready = true;
>       QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
>           bool this_ready;
> 
>           ret = se->ops->load_finish(se->opaque, &this_ready, &local_err);
>           if (ret) {
>           } else if (!this_ready) {
>               all_ready = false;
>           }
>
>       }
>       if (all_ready) {
>             break;
>       }


>    - Requires one more global mutex that non vmstate handler ever requested,

Could you elaborate on what you mean by "that non vmstate handler ever requested"?

>      that I feel like perhaps can be replaced by a sem (then to drop the
>      condvar)?

Once we have the ability to load device config state outside the main migration
thread, replacing the "load_finish" handler with a semaphore should indeed be
possible (that's an internal migration API so there should be no issue
removing it as not necessary anymore at this point).

But for now, the devices need to have the ability to run their config load
code on the main migration thread, and for that they need to be called
from this handler "load_finish".

>    - How qemu_loadvm_load_finish_ready_broadcast() interacts with all
>      above..
> 
> So if you really think it matters to load whatever VFIO device who's
> iterable data is ready first, then let's try come up with some better
> interface..  I can try to think about it too, but please answer me
> questions above so I can understand what I am missing on why that's
> important.  Numbers could help, even if 4 VF and I wonder how much diff
> there can be.  Mostly, I don't know why it's slow right now if it is; I
> thought it should be pretty fast, at least not a concern in VFIO migration
> world (which can take seconds of downtime or more..).
> 
> IOW, it sounds more reasonalbe to me that no matter whether vfio will
> support multifd, it'll be nice we stick with vfio_load_state() /
> vfio_save_state() for config space, and hopefully it's also easier it
> always go via the main channel to everyone.  In these two hooks, VFIO can
> do whatever it wants to sync with other things (on src, sync with
> concurrent thread pool saving iterable data and dumping things to multifd
> channels; on dst, sync with multifd concurrent loads). I think it can
> remove the requirement on the load_finish() interface completely.  Yes,
> this can only load VFIO's pci config space one by one, but I think this is
> much simpler, and I hope it's also not that slow, but I'm not sure.

To be clear, I made the following diagram describing how the patch set
is supposed to work right now, including changing per-device
VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE into a common MIG_CMD_SWITCHOVER.

Time flows from left to right on it (->).

----------- DIAGRAM START -----------
Source overall flow:
Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /

Target overall flow:
Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable -> config data load operations
Multifd channels:                                       \ multifd device state (1) -> multifd config data read (1)

Target config data load operations flow:
multifd config data read (1) -> config data load (2)

Notes:
(1): per device threads running in parallel
(2): currently serialized (only one such operation running at a particular time), will hopefully be parallelized in the future
----------- DIAGRAM END -----------

Hope the diagram survived being pasted into an e-mail message.

One can see, that even now there's a bit of "low hanging fruit" of missing
possible parallelism:
It seems that the source could wait for multifd device state + multifd config
data *after* non-iterables are sent rather than before as it is done
currently - so they will be sent in parallel with multifd data.

Since a written description is often prone to misunderstanding,
could you please annotate that diagram with your proposed new flow?

Thanks,
Maciej
Peter Xu Sept. 30, 2024, 9:57 p.m. UTC | #12
On Mon, Sep 30, 2024 at 09:25:54PM +0200, Maciej S. Szmigiero wrote:
> On 27.09.2024 02:53, Peter Xu wrote:
> > On Fri, Sep 27, 2024 at 12:34:31AM +0200, Maciej S. Szmigiero wrote:
> > > On 20.09.2024 18:45, Peter Xu wrote:
> > > > On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
> > > > > On 19.09.2024 23:11, Peter Xu wrote:
> > > > > > On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > On 9.09.2024 22:03, Peter Xu wrote:
> > > > > > > > On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> > > > > > > > > 
> > > > > > > > > load_finish SaveVMHandler allows migration code to poll whether
> > > > > > > > > a device-specific asynchronous device state loading operation had finished.
> > > > > > > > > 
> > > > > > > > > In order to avoid calling this handler needlessly the device is supposed
> > > > > > > > > to notify the migration code of its possible readiness via a call to
> > > > > > > > > qemu_loadvm_load_finish_ready_broadcast() while holding
> > > > > > > > > qemu_loadvm_load_finish_ready_lock.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> > > > > > > > > ---
> > > > > > > > >      include/migration/register.h | 21 +++++++++++++++
> > > > > > > > >      migration/migration.c        |  6 +++++
> > > > > > > > >      migration/migration.h        |  3 +++
> > > > > > > > >      migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
> > > > > > > > >      migration/savevm.h           |  4 +++
> > > > > > > > >      5 files changed, 86 insertions(+)
> > > > > > > > > 
> > > > > > > > > diff --git a/include/migration/register.h b/include/migration/register.h
> > > > > > > > > index 4a578f140713..44d8cf5192ae 100644
> > > > > > > > > --- a/include/migration/register.h
> > > > > > > > > +++ b/include/migration/register.h
> > > > > > > > > @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
> > > > > > > > >          int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
> > > > > > > > >                                   Error **errp);
> > > > > > > > > +    /**
> > > > > > > > > +     * @load_finish
> > > > > > > > > +     *
> > > > > > > > > +     * Poll whether all asynchronous device state loading had finished.
> > > > > > > > > +     * Not called on the load failure path.
> > > > > > > > > +     *
> > > > > > > > > +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> > > > > > > > > +     *
> > > > > > > > > +     * If this method signals "not ready" then it might not be called
> > > > > > > > > +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> > > > > > > > > +     * while holding qemu_loadvm_load_finish_ready_lock.
> > > > > > > > 
> > > > > > > > [1]
> > > > > > > > 
> > > > > > > > > +     *
> > > > > > > > > +     * @opaque: data pointer passed to register_savevm_live()
> > > > > > > > > +     * @is_finished: whether the loading had finished (output parameter)
> > > > > > > > > +     * @errp: pointer to Error*, to store an error if it happens.
> > > > > > > > > +     *
> > > > > > > > > +     * Returns zero to indicate success and negative for error
> > > > > > > > > +     * It's not an error that the loading still hasn't finished.
> > > > > > > > > +     */
> > > > > > > > > +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> > > > > > > > 
> > > > > > > > The load_finish() semantics is a bit weird, especially above [1] on "only
> > > > > > > > allowed to be called once if ..." and also on the locks.
> > > > > > > 
> > > > > > > The point of this remark is that a driver needs to call
> > > > > > > qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
> > > > > > > core to call its load_finish handler again.
> > > > > > > 
> > > > > > > > It looks to me vfio_load_finish() also does the final load of the device.
> > > > > > > > 
> > > > > > > > I wonder whether that final load can be done in the threads,
> > > > > > > 
> > > > > > > Here, the problem is that current VFIO VMState has to be loaded from the main
> > > > > > > migration thread as it internally calls QEMU core address space modification
> > > > > > > methods which explode if called from another thread(s).
> > > > > > 
> > > > > > Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
> > > > > > BQL if possible, when that's ready then in your case here IIUC you can
> > > > > > simply take BQL in whichever thread that loads it.. but yeah it's not ready
> > > > > > at least..
> > > > > 
> > > > > Yeah, long term we might want to work on making these QEMU core address space
> > > > > modification methods somehow callable from multiple threads but that's
> > > > > definitely not something for the initial patch set.
> > > > > 
> > > > > > Would it be possible vfio_save_complete_precopy_async_thread_config_state()
> > > > > > be done in VFIO's save_live_complete_precopy() through the main channel
> > > > > > somehow?  IOW, does it rely on iterative data to be fetched first from
> > > > > > kernel, or completely separate states?
> > > > > 
> > > > > The device state data needs to be fully loaded first before "activating"
> > > > > the device by loading its config state.
> > > > > 
> > > > > > And just curious: how large is it
> > > > > > normally (and I suppose this decides whether it's applicable to be sent via
> > > > > > the main channel at all..)?
> > > > > 
> > > > > Config data is *much* smaller than device state data - as far as I remember
> > > > > it was on order of kilobytes.
> > > > > 
> > > > > > > 
> > > > > > > > then after
> > > > > > > > everything loaded the device post a semaphore telling the main thread to
> > > > > > > > continue.  See e.g.:
> > > > > > > > 
> > > > > > > >         if (migrate_switchover_ack()) {
> > > > > > > >             qemu_loadvm_state_switchover_ack_needed(mis);
> > > > > > > >         }
> > > > > > > > 
> > > > > > > > IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
> > > > > > > > when all things are loaded?  We can then get rid of this slightly awkward
> > > > > > > > interface.  I had a feeling that things can be simplified (e.g., if the
> > > > > > > > thread will take care of loading the final vmstate then the mutex is also
> > > > > > > > not needed? etc.).
> > > > > > > 
> > > > > > > With just a single call to switchover_ack_needed per VFIO device it would
> > > > > > > need to do a blocking wait for the device buffers and config state load
> > > > > > > to finish, therefore blocking other VFIO devices from potentially loading
> > > > > > > their config state if they are ready to begin this operation earlier.
> > > > > > 
> > > > > > I am not sure I get you here, loading VFIO device states (I mean, the
> > > > > > non-iterable part) will need to be done sequentially IIUC due to what you
> > > > > > said and should rely on BQL, so I don't know how that could happen
> > > > > > concurrently for now.  But I think indeed BQL is a problem.
> > > > > Consider that we have two VFIO devices (A and B), with the following order
> > > > > of switchover_ack_needed handler calls for them: first A get this call,
> > > > > once the call for A finishes then B gets this call.
> > > > > 
> > > > > Now consider what happens if B had loaded all its buffers (in the loading
> > > > > thread) and it is ready for its config load before A finished loading its
> > > > > buffers.
> > > > > 
> > > > > B has to wait idle in this situation (even though it could have been already
> > > > > loading its config) since the switchover_ack_needed handler for A won't
> > > > > return until A is fully done.
> > > > 
> > > > This sounds like a performance concern, and I wonder how much this impacts
> > > > the real workload (that you run a test and measure, with/without such
> > > > concurrency) when we can save two devices in parallel anyway; I would
> > > > expect the real diff is small due to the fact I mentioned that we save >1
> > > > VFIO devices concurrently via multifd.
> > > > 
> > > > Do you think we can start with a simpler approach?
> > > 
> > > I don't think introducing a performance/scalability issue like that is
> > > a good thing, especially that we already have a design that avoids it.
> > > 
> > > Unfortunately, my current setup does not allow live migrating VMs with
> > > more than 4 VFs so I can't benchmark that.
> > 
> > /me wonders why benchmarking it requires more than 4 VFs.
> 
> My point here was that the scalability problem will most likely get more
> pronounced with more VFs.
> 
> > > 
> > > But I almost certain that with more VFs the situation with devices being
> > > ready out-of-order will get even more likely.
> > 
> > If the config space is small, why loading it in sequence would be a
> > problem?
> > 
> > Have you measured how much time it needs to load one VF's config space that
> > you're using?  I suppose that's vfio_load_device_config_state() alone?
> 
> It's not the amount of data to load matters here but that these address
> space operations are slow.
> 
> The whole config load takes ~70 ms per device - that's time equivalent
> of transferring 875 MiB of device state via a 100 GBit/s link.

What's the downtime of migration with 1/2/4 VFs?  I remember I saw some
data somewhere but it's not in the cover letter.  It'll be good to mention
these results in the cover letter when reposting.

I'm guessing 70ms isn't a huge deal here, if your NIC has 128GB internal
device state to migrate.. but maybe I'm wrong.

I also wonder whether you profiled a bit on how that 70ms contributes to
what is slow.

> 
> > > 
> > > > So what I'm thinking could be very clean is, we just discussed about
> > > > MIG_CMD_SWITCHOVER and looks like you also think it's an OK approach.  I
> > > > wonder when with it why not we move one step further to have
> > > > MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
> > > > ready to send non-iterable".  It can be controlled by the same migration
> > > > property so we only send these two flags in 9.2+ machine types.
> > > > 
> > > > Then IIUC VFIO can send config data through main wire (just like most of
> > > > other pci devices! which is IMHO a good fit..) and on destination VFIO
> > > > holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.
> > > 
> > > Starting the config load only on MIG_CMD_SEND_NON_ITERABE would (in addition
> > > to the considerations above) also delay starting the config load until all
> > > iterable devices were read/transferred/loaded and also would complicate
> > > future efforts at loading that config data in parallel.
> > 
> > However I wonder whether we can keep it simple in that VFIO's config space
> > is still always saved in vfio_save_state().  I still think it's easier we
> > stick with the main channel whenever possible.  For this specific case, if
> > the config space is small I think it's tricky you bypass this with:
> > 
> >      if (migration->multifd_transfer) {
> >          /* Emit dummy NOP data */
> >          qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
> >          return;
> >      }
> > 
> > Then squash this as the tail of the iterable data.
> > 
> > On the src, I think it could use a per-device semaphore, so that iterable
> > save() thread will post() only if it finishes dumping all the data, then
> > that orders VFIO iterable data v.s. config space save().
> 
> In the future we want to not only transfer but also load the config data
> in parallel.

How feasible do you think this idea is?  E.g. does it involve BQL so far
(e.g. memory updates, others)?  What's still missing to make it concurrent?

> 
> So going back to transferring this data serialized via the main migration
> channel would be taking a step back here.

If below holds true:

  - 70ms is still very small amount in the total downtime, and,

  - this can avoid the below load_finish() API

Then I'd go for it.. or again, at least the load_finish() needs change,
IMHO..

> 
> By the way, we already have a serialization point in
> qemu_savevm_state_complete_precopy_iterable() after iterables have been sent -
> waiting for device state sending threads to finish their work.
> 
> Whether this thread_pool_wait() operation will be implemented using
> semaphores I'm not sure yet - will depend on how well this will fit other
> GThreadPool internals.
> 
> > On the dst, after a 2nd thought, MIG_CMD_SEND_NON_ITERABE may not work or
> > needed indeed, because multifd bypasses the main channel, so if we send
> > anything like MIG_CMD_SEND_NON_ITERABE on the main channel it won't
> > guarantee multifd load all complete.  However IIUC that can be used in a
> > similar way as the src qemu I mentioned above with a per-device semaphore,
> > so that only all the iterable data of this device loaded and applied to the
> > HW would it post(), before that, vfio_load_state() should wait() on that
> > sem waiting for data to ready (while multifd threads will be doing that
> > part).  I wonder whether we may reuse the multifd recv thread in the
> > initial version, so maybe we don't need any other threads on destination.
> > 
> > The load_finish() interface is currently not able to be reused right,
> > afaict.  Just have a look at its definition:
> > 
> >      /**
> >       * @load_finish
> >       *
> >       * Poll whether all asynchronous device state loading had finished.
> >       * Not called on the load failure path.
> >       *
> >       * Called while holding the qemu_loadvm_load_finish_ready_lock.
> >       *
> >       * If this method signals "not ready" then it might not be called
> >       * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> >       * while holding qemu_loadvm_load_finish_ready_lock.
> >       *
> >       * @opaque: data pointer passed to register_savevm_live()
> >       * @is_finished: whether the loading had finished (output parameter)
> >       * @errp: pointer to Error*, to store an error if it happens.
> >       *
> >       * Returns zero to indicate success and negative for error
> >       * It's not an error that the loading still hasn't finished.
> >       */
> >      int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> > 
> > It's over complicated on defining all its details:
> > 
> >    - Not re-entrant by default..
> 
> What do you mean by "re-entrant" here?
> 
> This handler is called only from single migration thread, so it cannot
> be re-entered anyway since the control doesn't return to the migration
> code until this handler exits (and obviously the handler won't call
> itself recursively).

I think it's not a good design to say "you can call this function once, but
not the 2nd time until you wait on a semaphore".

> 
> > this is so weirdly designed so that the
> >      caller needs to know which is even the "1st invocation of the
> >      function"... It is just weird.
> 
> I don't quite understand that - why do you think that caller needs to
> know whether this is the "1st invocation of the function"?
> 
> The caller only tracks whether all these handlers reported that they
> finished their work:
> >       bool all_ready = true;
> >       QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> >           bool this_ready;
> > 
> >           ret = se->ops->load_finish(se->opaque, &this_ready, &local_err);
> >           if (ret) {
> >           } else if (!this_ready) {
> >               all_ready = false;
> >           }
> > 
> >       }
> >       if (all_ready) {
> >             break;
> >       }
> 
> 
> >    - Requires one more global mutex that non vmstate handler ever requested,
> 
> Could you elaborate what do you mean by "that non vmstate handler ever requested"?

I meant that no historical vmstate handler hook function requires such
complicated locking to work.

> 
> >      that I feel like perhaps can be replaced by a sem (then to drop the
> >      condvar)?
> 
> Once we have ability to load device config state outside main migration
> thread replacing "load_finish" handler with a semaphore should indeed be
> possible (that's internal migration API so there should be no issue
> removing it as not necessary anymore at this point).
> 
> But for now, the devices need to have ability to run their config load
> code on the main migration thread, and for that they need to be called
> from this handler "load_finish".

A sem seems a must here to notify the iterable data finished loading, but
that doesn't need to hook to the vmstate handler, but some post-process
tasks, like what we do around cpu_synchronize_all_post_init() time.

If per-device vmstate handler hook version of load_finish() is destined to
look as weird in this case, I'd rather consider a totally separate way to
enqueue some jobs that need to be run after all vmstates are loaded.  Then
after one VFIO device fully loads its data, it enqueues the task and post()s
to one migration sem saying that "there's one post-process task, please run
it in migration thread".  There can be a total number of tasks registered
so that the migration thread knows not to continue until that number of tasks
has been processed.  That counter can be part of vmstate handler, maybe, reporting
that "this vmstate handler has one post-process task".

Maybe you have other ideas, but please no, let's avoid this load_finish()
thing..

I'd rather still see justifications showing that this 70ms really is
helpful.. I'd rather wish we have +70ms*Ndev downtime but drop this hook
until we have a clearer mind when all config space can be loaded
concurrently, for example.  So we start from simple.

> 
> >    - How qemu_loadvm_load_finish_ready_broadcast() interacts with all
> >      above..
> > 
> > So if you really think it matters to load whatever VFIO device who's
> > iterable data is ready first, then let's try come up with some better
> > interface..  I can try to think about it too, but please answer me
> > questions above so I can understand what I am missing on why that's
> > important.  Numbers could help, even if 4 VF and I wonder how much diff
> > there can be.  Mostly, I don't know why it's slow right now if it is; I
> > thought it should be pretty fast, at least not a concern in VFIO migration
> > world (which can take seconds of downtime or more..).
> > 
> > IOW, it sounds more reasonable to me that no matter whether vfio will
> > support multifd, it'll be nice we stick with vfio_load_state() /
> > vfio_save_state() for config space, and hopefully it's also easier it
> > always go via the main channel to everyone.  In these two hooks, VFIO can
> > do whatever it wants to sync with other things (on src, sync with
> > concurrent thread pool saving iterable data and dumping things to multifd
> > channels; on dst, sync with multifd concurrent loads). I think it can
> > remove the requirement on the load_finish() interface completely.  Yes,
> > this can only load VFIO's pci config space one by one, but I think this is
> > much simpler, and I hope it's also not that slow, but I'm not sure.
> 
> To be clear, I made a following diagram describing how the patch set
> is supposed to work right now, including changing per-device
> VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE into a common MIG_CMD_SWITCHOVER.
> 
> Time flows on it left to right (->).
> 
> ----------- DIAGRAM START -----------
> Source overall flow:
> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
> Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
> 
> Target overall flow:
> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable -> config data load operations
> Multifd channels:                                       \ multifd device state (1) -> multifd config data read (1)
> 
> Target config data load operations flow:
> multifd config data read (1) -> config data load (2)
> 
> Notes:
> (1): per device threads running in parallel

Here I raised this question before, but I'll ask again: do you think we can
avoid using a separate thread on dest qemu, but reuse multifd recv threads?

Src probably needs its own threads because multifd sender threads take
requests, so they can't block on their own.

However dest qemu isn't like that, it's packet driven so I think maybe it's
ok VFIO directly loads the data in the multifd threads.  We may want to
have enough multifd threads to make sure IO still don't block much on the
NIC, but I think tuning the num of multifd threads should work in this
case.

> (2): currently serialized (only one such operation running at a particular time), will hopefully be parallelized in the future
> ----------- DIAGRAM END -----------
> 
> Hope the diagram survived being pasted into an e-mail message.
> 
> One can see, that even now there's a bit of "low hanging fruit" of missing
> possible parallelism:
> It seems that the source could wait for multifd device state + multifd config
> data *after* non-iterables are sent rather than before as it is done
> currently - so they will be sent in parallel with multifd data.

Currently it's blocked by this chunk of code of yours:

    if (multifd_device_state) {
        ret = multifd_join_device_state_save_threads();
        if (ret) {
            qemu_file_set_error(f, ret);
            return -1;
        }
    }

If, with your proposal, the vfio config space is sent via multifd channels,
indeed I don't see why it can't be moved to be after non-iterable save()
completes.  Is that what you implied as "low hanging fruit"?

[***]

> 
> Since written description is often prone to misunderstanding
> could you please annotate that diagram with your proposed new flow?

What I was suggesting (removing load_finish()) is mostly the same as what
you drew I think, especially on src:

===============
Source overall flow:
Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
===============

In this case we can't do the optimization above [***], since what I
suggested requires VFIO's vfio_save_state() to dump the config space, so
the original order will be needed here.

While on dest, config data load will need to also load using vfio's
vfio_load_state() so it'll be invoked just like what we normally do with
non-iterable device states (so here "config data load operations" is part
of loading all non-iterable devices):

===============
Target overall flow:                                                              (X)
Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable (multifd config data read -> config data load operations)
Multifd channels:                                       \ multifd device state load                                /
                                        (lower part done via multifd recv threads, not separate threads)
===============

So here the ordering of (X) is not guarded by anything, however in
vfio_load_state() the device can sem_wait() on a semaphore that is only
posted once this device's device state is fully loaded.  So it's not
completely serialized - "config data load operations" of DEV1 can still
happen concurrently with "multifd device state load" of DEV2.

Sorry, this might not be as clear as it's not easy to draw in the graph,
but I hope the words can help clarify what I meant.

If 70ms is not a major deal, I suggest we consider above approach, I think
it can simplify at least the vmstate handler API.  If 70ms matters, let's
try refactor load_finish() to something usable.

Thanks,
Maciej S. Szmigiero Oct. 1, 2024, 8:41 p.m. UTC | #13
On 30.09.2024 23:57, Peter Xu wrote:
> On Mon, Sep 30, 2024 at 09:25:54PM +0200, Maciej S. Szmigiero wrote:
>> On 27.09.2024 02:53, Peter Xu wrote:
>>> On Fri, Sep 27, 2024 at 12:34:31AM +0200, Maciej S. Szmigiero wrote:
>>>> On 20.09.2024 18:45, Peter Xu wrote:
>>>>> On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
>>>>>> On 19.09.2024 23:11, Peter Xu wrote:
>>>>>>> On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>> On 9.09.2024 22:03, Peter Xu wrote:
>>>>>>>>> On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>>>>>>>
>>>>>>>>>> load_finish SaveVMHandler allows migration code to poll whether
>>>>>>>>>> a device-specific asynchronous device state loading operation had finished.
>>>>>>>>>>
>>>>>>>>>> In order to avoid calling this handler needlessly the device is supposed
>>>>>>>>>> to notify the migration code of its possible readiness via a call to
>>>>>>>>>> qemu_loadvm_load_finish_ready_broadcast() while holding
>>>>>>>>>> qemu_loadvm_load_finish_ready_lock.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>>>>>>>> ---
>>>>>>>>>>       include/migration/register.h | 21 +++++++++++++++
>>>>>>>>>>       migration/migration.c        |  6 +++++
>>>>>>>>>>       migration/migration.h        |  3 +++
>>>>>>>>>>       migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>>>>>>>>>>       migration/savevm.h           |  4 +++
>>>>>>>>>>       5 files changed, 86 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/include/migration/register.h b/include/migration/register.h
>>>>>>>>>> index 4a578f140713..44d8cf5192ae 100644
>>>>>>>>>> --- a/include/migration/register.h
>>>>>>>>>> +++ b/include/migration/register.h
>>>>>>>>>> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>>>>>>>>>>           int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>>>>>>>>>>                                    Error **errp);
>>>>>>>>>> +    /**
>>>>>>>>>> +     * @load_finish
>>>>>>>>>> +     *
>>>>>>>>>> +     * Poll whether all asynchronous device state loading had finished.
>>>>>>>>>> +     * Not called on the load failure path.
>>>>>>>>>> +     *
>>>>>>>>>> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
>>>>>>>>>> +     *
>>>>>>>>>> +     * If this method signals "not ready" then it might not be called
>>>>>>>>>> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>>>>>>>>>> +     * while holding qemu_loadvm_load_finish_ready_lock.
>>>>>>>>>
>>>>>>>>> [1]
>>>>>>>>>
>>>>>>>>>> +     *
>>>>>>>>>> +     * @opaque: data pointer passed to register_savevm_live()
>>>>>>>>>> +     * @is_finished: whether the loading had finished (output parameter)
>>>>>>>>>> +     * @errp: pointer to Error*, to store an error if it happens.
>>>>>>>>>> +     *
>>>>>>>>>> +     * Returns zero to indicate success and negative for error
>>>>>>>>>> +     * It's not an error that the loading still hasn't finished.
>>>>>>>>>> +     */
>>>>>>>>>> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
>>>>>>>>>
>>>>>>>>> The load_finish() semantics is a bit weird, especially above [1] on "only
>>>>>>>>> allowed to be called once if ..." and also on the locks.
>>>>>>>>
>>>>>>>> The point of this remark is that a driver needs to call
>>>>>>>> qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
>>>>>>>> core to call its load_finish handler again.
>>>>>>>>
>>>>>>>>> It looks to me vfio_load_finish() also does the final load of the device.
>>>>>>>>>
>>>>>>>>> I wonder whether that final load can be done in the threads,
>>>>>>>>
>>>>>>>> Here, the problem is that current VFIO VMState has to be loaded from the main
>>>>>>>> migration thread as it internally calls QEMU core address space modification
>>>>>>>> methods which explode if called from another thread(s).
>>>>>>>
>>>>>>> Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
>>>>>>> BQL if possible, when that's ready then in your case here IIUC you can
>>>>>>> simply take BQL in whichever thread that loads it.. but yeah it's not ready
>>>>>>> at least..
>>>>>>
>>>>>> Yeah, long term we might want to work on making these QEMU core address space
>>>>>> modification methods somehow callable from multiple threads but that's
>>>>>> definitely not something for the initial patch set.
>>>>>>
>>>>>>> Would it be possible vfio_save_complete_precopy_async_thread_config_state()
>>>>>>> be done in VFIO's save_live_complete_precopy() through the main channel
>>>>>>> somehow?  IOW, does it rely on iterative data to be fetched first from
>>>>>>> kernel, or completely separate states?
>>>>>>
>>>>>> The device state data needs to be fully loaded first before "activating"
>>>>>> the device by loading its config state.
>>>>>>
>>>>>>> And just curious: how large is it
>>>>>>> normally (and I suppose this decides whether it's applicable to be sent via
>>>>>>> the main channel at all..)?
>>>>>>
>>>>>> Config data is *much* smaller than device state data - as far as I remember
>>>>>> it was on order of kilobytes.
>>>>>>
>>>>>>>>
>>>>>>>>> then after
>>>>>>>>> everything loaded the device post a semaphore telling the main thread to
>>>>>>>>> continue.  See e.g.:
>>>>>>>>>
>>>>>>>>>          if (migrate_switchover_ack()) {
>>>>>>>>>              qemu_loadvm_state_switchover_ack_needed(mis);
>>>>>>>>>          }
>>>>>>>>>
>>>>>>>>> IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
>>>>>>>>> when all things are loaded?  We can then get rid of this slightly awkward
>>>>>>>>> interface.  I had a feeling that things can be simplified (e.g., if the
>>>>>>>>> thread will take care of loading the final vmstate then the mutex is also
>>>>>>>>> not needed? etc.).
>>>>>>>>
>>>>>>>> With just a single call to switchover_ack_needed per VFIO device it would
>>>>>>>> need to do a blocking wait for the device buffers and config state load
>>>>>>>> to finish, therefore blocking other VFIO devices from potentially loading
>>>>>>>> their config state if they are ready to begin this operation earlier.
>>>>>>>
>>>>>>> I am not sure I get you here, loading VFIO device states (I mean, the
>>>>>>> non-iterable part) will need to be done sequentially IIUC due to what you
>>>>>>> said and should rely on BQL, so I don't know how that could happen
>>>>>>> concurrently for now.  But I think indeed BQL is a problem.
>>>>>> Consider that we have two VFIO devices (A and B), with the following order
>>>>>> of switchover_ack_needed handler calls for them: first A get this call,
>>>>>> once the call for A finishes then B gets this call.
>>>>>>
>>>>>> Now consider what happens if B had loaded all its buffers (in the loading
>>>>>> thread) and it is ready for its config load before A finished loading its
>>>>>> buffers.
>>>>>>
>>>>>> B has to wait idle in this situation (even though it could have been already
>>>>>> loading its config) since the switchover_ack_needed handler for A won't
>>>>>> return until A is fully done.
>>>>>
>>>>> This sounds like a performance concern, and I wonder how much this impacts
>>>>> the real workload (that you run a test and measure, with/without such
>>>>> concurrency) when we can save two devices in parallel anyway; I would
>>>>> expect the real diff is small due to the fact I mentioned that we save >1
>>>>> VFIO devices concurrently via multifd.
>>>>>
>>>>> Do you think we can start with a simpler approach?
>>>>
>>>> I don't think introducing a performance/scalability issue like that is
>>>> a good thing, especially that we already have a design that avoids it.
>>>>
>>>> Unfortunately, my current setup does not allow live migrating VMs with
>>>> more than 4 VFs so I can't benchmark that.
>>>
>>> /me wonders why benchmarking it requires more than 4 VFs.
>>
>> My point here was that the scalability problem will most likely get more
>> pronounced with more VFs.
>>
>>>>
>>>> But I'm almost certain that with more VFs the situation with devices being
>>>> ready out-of-order will get even more likely.
>>>
>>> If the config space is small, why loading it in sequence would be a
>>> problem?
>>>
>>> Have you measured how much time it needs to load one VF's config space that
>>> you're using?  I suppose that's vfio_load_device_config_state() alone?
>>
>> It's not the amount of data to load that matters here but that these address
>> space operations are slow.
>>
>> The whole config load takes ~70 ms per device - that's time equivalent
>> of transferring 875 MiB of device state via a 100 GBit/s link.
> 
> What's the downtime of migration with 1/2/4 VFs?  I remember I saw some
> data somewhere but it's not in the cover letter.  It'll be good to mention
> these results in the cover letter when repost.

Downtimes with the device state transfer being disabled / enabled:
             4 VFs   2 VFs    1 VF
Disabled: 1783 ms  614 ms  283 ms
Enabled:  1068 ms  434 ms  274 ms

Will add these numbers to the cover letter of the next patch set version.

> I'm guessing 70ms isn't a huge deal here, if your NIC has 128GB internal
> device state to migrate.. but maybe I'm wrong.

It's ~100 MiB of device state per VF here.

And it's 70ms of downtime *per device*:
so with 4 VF it's ~280ms of downtime taken by the config loads.
That's a lot - with perfect parallelization this downtime should
*reduce by* 210ms.

> I also wonder whether you profiled a bit on how that 70ms contributes to
> what is slow.

I think that's something we can do after we have parallel config loads
and it turns out their downtime for some reason still scales strongly
linearly with the number of VFIO devices (rather than taking roughly
constant time regardless of the count of these devices if running perfectly
in parallel).

>>
>>>>
>>>>> So what I'm thinking could be very clean is, we just discussed about
>>>>> MIG_CMD_SWITCHOVER and looks like you also think it's an OK approach.  I
>>>>> wonder when with it why not we move one step further to have
>>>>> MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
>>>>> ready to send non-iterable".  It can be controlled by the same migration
>>>>> property so we only send these two flags in 9.2+ machine types.
>>>>>
>>>>> Then IIUC VFIO can send config data through main wire (just like most of
>>>>> other pci devices! which is IMHO a good fit..) and on destination VFIO
>>>>> holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.
>>>>
>>>> Starting the config load only on MIG_CMD_SEND_NON_ITERABE would (in addition
>>>> to the considerations above) also delay starting the config load until all
>>>> iterable devices were read/transferred/loaded and also would complicate
>>>> future efforts at loading that config data in parallel.
>>>
>>> However I wonder whether we can keep it simple in that VFIO's config space
>>> is still always saved in vfio_save_state().  I still think it's easier we
>>> stick with the main channel whenever possible.  For this specific case, if
>>> the config space is small I think it's tricky you bypass this with:
>>>
>>>       if (migration->multifd_transfer) {
>>>           /* Emit dummy NOP data */
>>>           qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>>>           return;
>>>       }
>>>
>>> Then squash this as the tail of the iterable data.
>>>
>>> On the src, I think it could use a per-device semaphore, so that iterable
>>> save() thread will post() only if it finishes dumping all the data, then
>>> that orders VFIO iterable data v.s. config space save().
>>
>> In the future we want to not only transfer but also load the config data
>> in parallel.
> 
> How feasible do you think this idea is?  E.g. does it involve BQL so far
> (e.g. memory updates, others)?  What's still missing to make it concurrent?

My gut feeling is that it is feasible overall but it's too much of a rabbit
hole for the first version of this device state transfer feature.

I think it will need some deeper QEMU core address space management changes,
which need to be researched/developed/tested/reviewed/etc. on their own.

If it was an easy task I would have gladly included such support in this
patch set version already for extra downtime reduction :)

>>
>> So going back to transferring this data serialized via the main migration
>> channel would be taking a step back here.
> 
> If below holds true:
> 
>    - 70ms is still very small amount in the total downtime, and,
> 
>    - this can avoid the below load_finish() API
> 
> Then I'd go for it.. or again, at least the load_finish() needs change,
> IMHO..

As I wrote above, it's not 70 ms total but 70 ms per device.

Also, even 70 ms is a lot, considering that the default downtime limit
is 300 ms - with a single device that's nearly 1/4 of the limit already.

>>
>> By the way, we already have a serialization point in
>> qemu_savevm_state_complete_precopy_iterable() after iterables have been sent -
>> waiting for device state sending threads to finish their work.
>>
>> Whether this thread_pool_wait() operation will be implemented using
>> semaphores I'm not sure yet - will depend on how well this will fit other
>> GThreadPool internals.
>>
>>> On the dst, after a 2nd thought, MIG_CMD_SEND_NON_ITERABE may not work or
>>> needed indeed, because multifd bypasses the main channel, so if we send
>>> anything like MIG_CMD_SEND_NON_ITERABE on the main channel it won't
>>> guarantee multifd load all complete.  However IIUC that can be used in a
>>> similar way as the src qemu I mentioned above with a per-device semaphore,
>>> so that only all the iterable data of this device loaded and applied to the
>>> HW would it post(), before that, vfio_load_state() should wait() on that
>>> sem waiting for data to ready (while multifd threads will be doing that
>>> part).  I wonder whether we may reuse the multifd recv thread in the
>>> initial version, so maybe we don't need any other threads on destination.
>>>
>>> The load_finish() interface is currently not able to be reused right,
>>> afaict.  Just have a look at its definition:
>>>
>>>       /**
>>>        * @load_finish
>>>        *
>>>        * Poll whether all asynchronous device state loading had finished.
>>>        * Not called on the load failure path.
>>>        *
>>>        * Called while holding the qemu_loadvm_load_finish_ready_lock.
>>>        *
>>>        * If this method signals "not ready" then it might not be called
>>>        * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>>>        * while holding qemu_loadvm_load_finish_ready_lock.
>>>        *
>>>        * @opaque: data pointer passed to register_savevm_live()
>>>        * @is_finished: whether the loading had finished (output parameter)
>>>        * @errp: pointer to Error*, to store an error if it happens.
>>>        *
>>>        * Returns zero to indicate success and negative for error
>>>        * It's not an error that the loading still hasn't finished.
>>>        */
>>>       int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
>>>
>>> It's over complicated on defining all its details:
>>>
>>>     - Not re-entrant by default..
>>
>> What do you mean by "re-entrant" here?
>>
>> This handler is called only from single migration thread, so it cannot
>> be re-entered anyway since the control doesn't return to the migration
>> code until this handler exits (and obviously the handler won't call
>> itself recursively).
> 
> I think it's not a good design to say "you can call this function once, but
> not the 2nd time until you wait on a semaphore".

That's not exactly how this API is supposed to work.

I suspect that you took that "it might not be called again until
qemu_loadvm_load_finish_ready_broadcast() is invoked" as prohibition
from being called again until that signal is broadcast.

The intended meaning of that sentence was "it is possible that it won't
be called again until qemu_loadvm_load_finish_ready_broadcast() is invoked".

In other words, the migration core is free to call this handler as
many times as the migration core wants.

But if the handler wants to be *sure* that it will get called by the
migration core after the handler has returned "not ready" then it needs
to arrange for load_finish_ready_broadcast() to be invoked somehow.

(..)
>>
>>>       that I feel like perhaps can be replaced by a sem (then to drop the
>>>       condvar)?
>>
>> Once we have ability to load device config state outside main migration
>> thread replacing "load_finish" handler with a semaphore should indeed be
>> possible (that's internal migration API so there should be no issue
>> removing it as not necessary anymore at this point).
>>
>> But for now, the devices need to have ability to run their config load
>> code on the main migration thread, and for that they need to be called
>> from this handler "load_finish".
> 
> A sem seems a must here to notify the iterable data finished loading, but
> that doesn't need to hook to the vmstate handler, but some post-process
> tasks, like what we do around cpu_synchronize_all_post_init() time.
> 
> If per-device vmstate handler hook version of load_finish() is destined to
> look as weird in this case, I'd rather consider a totally separate way to
> enqueue some jobs that needs to be run after all vmstates loaded.  Then
> after one VFIO device fully loads its data, it enqueues the task and post()
> to one migration sem saying that "there's one post-process task, please run
> it in migration thread".  There can be a total number of tasks registered
> so that migration thread knows not to continue until these number of tasks
> processed.  That counter can be part of vmstate handler, maybe, reporting
> that "this vmstate handler has one post-process task".
> 
> Maybe you have other ideas, but please no, let's avoid this load_finish()
> thing..

I can certainly implement the task-queuing approach instead of the
load_finish() handler API if you like such approach more.

> I'd rather still see justifications showing that this 70ms really is
> helpful.. I'd rather wish we have +70ms*Ndev downtime but drop this hook
> until we have a clearer mind when all config space can be loaded
> concurrently, for example.  So we start from simple.

As I wrote above, even 70ms for a single device is a lot considering the
default downtime limit - and that's even more true if multiplied by
multiple devices.

>>
>>>     - How qemu_loadvm_load_finish_ready_broadcast() interacts with all
>>>       above..
>>>
>>> So if you really think it matters to load whatever VFIO device whose
>>> iterable data is ready first, then let's try come up with some better
>>> interface..  I can try to think about it too, but please answer me
>>> questions above so I can understand what I am missing on why that's
>>> important.  Numbers could help, even if 4 VF and I wonder how much diff
>>> there can be.  Mostly, I don't know why it's slow right now if it is; I
>>> thought it should be pretty fast, at least not a concern in VFIO migration
>>> world (which can take seconds of downtime or more..).
>>>
>>> IOW, it sounds more reasonable to me that no matter whether vfio will
>>> support multifd, it'll be nice we stick with vfio_load_state() /
>>> vfio_save_state() for config space, and hopefully it's also easier it
>>> always go via the main channel to everyone.  In these two hooks, VFIO can
>>> do whatever it wants to sync with other things (on src, sync with
>>> concurrent thread pool saving iterable data and dumping things to multifd
>>> channels; on dst, sync with multifd concurrent loads). I think it can
>>> remove the requirement on the load_finish() interface completely.  Yes,
>>> this can only load VFIO's pci config space one by one, but I think this is
>>> much simpler, and I hope it's also not that slow, but I'm not sure.
>>
>> To be clear, I made a following diagram describing how the patch set
>> is supposed to work right now, including changing per-device
>> VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE into a common MIG_CMD_SWITCHOVER.
>>
>> Time flows on it left to right (->).
>>
>> ----------- DIAGRAM START -----------
>> Source overall flow:
>> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
>> Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
>>
>> Target overall flow:
>> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable -> config data load operations
>> Multifd channels:                                       \ multifd device state (1) -> multifd config data read (1)
>>
>> Target config data load operations flow:
>> multifd config data read (1) -> config data load (2)
>>
>> Notes:
>> (1): per device threads running in parallel
> 
> Here I raised this question before, but I'll ask again: do you think we can
> avoid using a separate thread on dest qemu, but reuse multifd recv threads?
> 
> Src probably needs its own threads because multifd sender threads takes
> request, so it can't block on its own.
> 
> However dest qemu isn't like that, it's packet driven so I think maybe it's
> ok VFIO directly loads the data in the multifd threads.  We may want to
> have enough multifd threads to make sure IO still don't block much on the
> NIC, but I think tuning the num of multifd threads should work in this
> case.

We need to have the receiving threads decoupled from the VFIO device state
loading threads at least because otherwise:
1) You can have a deadlock if device state for multiple devices arrives
out of order, like here:

Time flows left to right (->).
Multifd channel 1: (VFIO device 1 buffer 2) (VFIO device 2 buffer 1)
Multifd channel 2: (VFIO device 2 buffer 2) (VFIO device 1 buffer 1)

Both channel receive/load threads would be stuck forever in this case,
since they can't load buffer 2 for devices 1 and 2 until they load
buffer 1 for each of these devices.

2) If devices are loading buffers at different speeds you don't want
to block the faster device from receiving new buffer just because
the slower one hasn't finished its loading yet.

>> (2): currently serialized (only one such operation running at a particular time), will hopefully be parallelized in the future
>> ----------- DIAGRAM END -----------
>>
>> Hope the diagram survived being pasted into an e-mail message.
>>
>> One can see, that even now there's a bit of "low hanging fruit" of missing
>> possible parallelism:
>> It seems that the source could wait for multifd device state + multifd config
>> data *after* non-iterables are sent rather than before as it is done
>> currently - so they will be sent in parallel with multifd data.
> 
> Currently it's blocked by this chunk of code of yours:
> 
>      if (multifd_device_state) {
>          ret = multifd_join_device_state_save_threads();
>          if (ret) {
>              qemu_file_set_error(f, ret);
>              return -1;
>          }
>      }
> 
> If with your proposal that vfio config space sent via multifd channels,
> indeed I don't see why it can't be moved to be after non-iterable save()
> completes.  Is that what you implied as "low hanging fruit"?

Yes, exactly - to wait for save threads to finish only after non-iterables
have already been saved.

By "low hanging fruit" I meant it should be a fairly easy change.

> [***]
> 
>>
>> Since written description is often prone to misunderstanding
>> could you please annotate that diagram with your proposed new flow?
> 
> What I was suggesting (removing load_finish()) is mostly the same as what
> you drew I think, especially on src:
> 
> ===============
> Source overall flow:
> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
> Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
> ===============
> 
> In this case we can't do the optimization above [***], since what I
> suggested requires VFIO's vfio_save_state() to dump the config space, so
> the original order will be needed here.
> 
> While on dest, config data load will need to also load using vfio's
> vfio_load_state() so it'll be invoked just like what we normally do with
> non-iterable device states (so here "config data load operations" is part
> of loading all non-iterable devices):
> 
> ===============
> Target overall flow:                                                              (X)
> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable (multifd config data read -> config data load operations)
> Multifd channels:                                       \ multifd device state load                                /
>                                          (lower part done via multifd recv threads, not separate threads)
> ===============
> 
> So here the ordering of (X) is not guarded by anything, however in
> vfio_load_state() the device can sem_wait() on a semaphore that will only be
> posted once this device's device state is fully loaded.  So it's not
> completely serialized - "config data load operations" of DEV1 can still
> happen concurrently with "multifd device state load" of DEV2.
> 
> Sorry, this might not be as clear as it's not easy to draw in the graph,
> but I hope the words can help clarify what I meant.
> 
> If 70ms is not a major deal, I suggest we consider above approach, I think
> it can simplify at least the vmstate handler API.  If 70ms matters, let's
> try refactor load_finish() to something usable.

I understand your point here, however as I wrote above, I think that's too
much downtime to "waste" so I will try to rework the load_finish() handler
into the task-queuing approach as you suggested earlier.

> Thanks,
> 

Thanks,
Maciej
Peter Xu Oct. 1, 2024, 9:30 p.m. UTC | #14
On Tue, Oct 01, 2024 at 10:41:14PM +0200, Maciej S. Szmigiero wrote:
> On 30.09.2024 23:57, Peter Xu wrote:
> > On Mon, Sep 30, 2024 at 09:25:54PM +0200, Maciej S. Szmigiero wrote:
> > > On 27.09.2024 02:53, Peter Xu wrote:
> > > > On Fri, Sep 27, 2024 at 12:34:31AM +0200, Maciej S. Szmigiero wrote:
> > > > > On 20.09.2024 18:45, Peter Xu wrote:
> > > > > > On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > On 19.09.2024 23:11, Peter Xu wrote:
> > > > > > > > On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > > > On 9.09.2024 22:03, Peter Xu wrote:
> > > > > > > > > > On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> > > > > > > > > > > 
> > > > > > > > > > > load_finish SaveVMHandler allows migration code to poll whether
> > > > > > > > > > > a device-specific asynchronous device state loading operation had finished.
> > > > > > > > > > > 
> > > > > > > > > > > In order to avoid calling this handler needlessly the device is supposed
> > > > > > > > > > > to notify the migration code of its possible readiness via a call to
> > > > > > > > > > > qemu_loadvm_load_finish_ready_broadcast() while holding
> > > > > > > > > > > qemu_loadvm_load_finish_ready_lock.
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> > > > > > > > > > > ---
> > > > > > > > > > >       include/migration/register.h | 21 +++++++++++++++
> > > > > > > > > > >       migration/migration.c        |  6 +++++
> > > > > > > > > > >       migration/migration.h        |  3 +++
> > > > > > > > > > >       migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
> > > > > > > > > > >       migration/savevm.h           |  4 +++
> > > > > > > > > > >       5 files changed, 86 insertions(+)
> > > > > > > > > > > 
> > > > > > > > > > > diff --git a/include/migration/register.h b/include/migration/register.h
> > > > > > > > > > > index 4a578f140713..44d8cf5192ae 100644
> > > > > > > > > > > --- a/include/migration/register.h
> > > > > > > > > > > +++ b/include/migration/register.h
> > > > > > > > > > > @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
> > > > > > > > > > >           int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
> > > > > > > > > > >                                    Error **errp);
> > > > > > > > > > > +    /**
> > > > > > > > > > > +     * @load_finish
> > > > > > > > > > > +     *
> > > > > > > > > > > +     * Poll whether all asynchronous device state loading had finished.
> > > > > > > > > > > +     * Not called on the load failure path.
> > > > > > > > > > > +     *
> > > > > > > > > > > +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> > > > > > > > > > > +     *
> > > > > > > > > > > +     * If this method signals "not ready" then it might not be called
> > > > > > > > > > > +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> > > > > > > > > > > +     * while holding qemu_loadvm_load_finish_ready_lock.
> > > > > > > > > > 
> > > > > > > > > > [1]
> > > > > > > > > > 
> > > > > > > > > > > +     *
> > > > > > > > > > > +     * @opaque: data pointer passed to register_savevm_live()
> > > > > > > > > > > +     * @is_finished: whether the loading had finished (output parameter)
> > > > > > > > > > > +     * @errp: pointer to Error*, to store an error if it happens.
> > > > > > > > > > > +     *
> > > > > > > > > > > +     * Returns zero to indicate success and negative for error
> > > > > > > > > > > +     * It's not an error that the loading still hasn't finished.
> > > > > > > > > > > +     */
> > > > > > > > > > > +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> > > > > > > > > > 
> > > > > > > > > > The load_finish() semantics is a bit weird, especially above [1] on "only
> > > > > > > > > > allowed to be called once if ..." and also on the locks.
> > > > > > > > > 
> > > > > > > > > The point of this remark is that a driver needs to call
> > > > > > > > > qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
> > > > > > > > > core to call its load_finish handler again.
> > > > > > > > > 
> > > > > > > > > > It looks to me vfio_load_finish() also does the final load of the device.
> > > > > > > > > > 
> > > > > > > > > > I wonder whether that final load can be done in the threads,
> > > > > > > > > 
> > > > > > > > > Here, the problem is that current VFIO VMState has to be loaded from the main
> > > > > > > > > migration thread as it internally calls QEMU core address space modification
> > > > > > > > > methods which explode if called from another thread(s).
> > > > > > > > 
> > > > > > > > Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
> > > > > > > > BQL if possible, when that's ready then in your case here IIUC you can
> > > > > > > > simply take BQL in whichever thread that loads it.. but yeah it's not ready
> > > > > > > > at least..
> > > > > > > 
> > > > > > > Yeah, long term we might want to work on making these QEMU core address space
> > > > > > > modification methods somehow callable from multiple threads but that's
> > > > > > > definitely not something for the initial patch set.
> > > > > > > 
> > > > > > > > Would it be possible vfio_save_complete_precopy_async_thread_config_state()
> > > > > > > > be done in VFIO's save_live_complete_precopy() through the main channel
> > > > > > > > somehow?  IOW, does it rely on iterative data to be fetched first from
> > > > > > > > kernel, or completely separate states?
> > > > > > > 
> > > > > > > The device state data needs to be fully loaded first before "activating"
> > > > > > > the device by loading its config state.
> > > > > > > 
> > > > > > > > And just curious: how large is it
> > > > > > > > normally (and I suppose this decides whether it's applicable to be sent via
> > > > > > > > the main channel at all..)?
> > > > > > > 
> > > > > > > Config data is *much* smaller than device state data - as far as I remember
> > > > > > > it was on order of kilobytes.
> > > > > > > 
> > > > > > > > > 
> > > > > > > > > > then after
> > > > > > > > > > everything loaded the device post a semaphore telling the main thread to
> > > > > > > > > > continue.  See e.g.:
> > > > > > > > > > 
> > > > > > > > > >          if (migrate_switchover_ack()) {
> > > > > > > > > >              qemu_loadvm_state_switchover_ack_needed(mis);
> > > > > > > > > >          }
> > > > > > > > > > 
> > > > > > > > > > IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
> > > > > > > > > > when all things are loaded?  We can then get rid of this slightly awkward
> > > > > > > > > > interface.  I had a feeling that things can be simplified (e.g., if the
> > > > > > > > > > thread will take care of loading the final vmstate then the mutex is also
> > > > > > > > > > not needed? etc.).
> > > > > > > > > 
> > > > > > > > > With just a single call to switchover_ack_needed per VFIO device it would
> > > > > > > > > need to do a blocking wait for the device buffers and config state load
> > > > > > > > > to finish, therefore blocking other VFIO devices from potentially loading
> > > > > > > > > their config state if they are ready to begin this operation earlier.
> > > > > > > > 
> > > > > > > > I am not sure I get you here, loading VFIO device states (I mean, the
> > > > > > > > non-iterable part) will need to be done sequentially IIUC due to what you
> > > > > > > > said and should rely on BQL, so I don't know how that could happen
> > > > > > > > concurrently for now.  But I think indeed BQL is a problem.
> > > > > > > Consider that we have two VFIO devices (A and B), with the following order
> > > > > > > of switchover_ack_needed handler calls for them: first A get this call,
> > > > > > > once the call for A finishes then B gets this call.
> > > > > > > 
> > > > > > > Now consider what happens if B had loaded all its buffers (in the loading
> > > > > > > thread) and it is ready for its config load before A finished loading its
> > > > > > > buffers.
> > > > > > > 
> > > > > > > B has to wait idle in this situation (even though it could have been already
> > > > > > > loading its config) since the switchover_ack_needed handler for A won't
> > > > > > > return until A is fully done.
> > > > > > 
> > > > > > This sounds like a performance concern, and I wonder how much this impacts
> > > > > > the real workload (that you run a test and measure, with/without such
> > > > > > concurrency) when we can save two devices in parallel anyway; I would
> > > > > > expect the real diff is small due to the fact I mentioned that we save >1
> > > > > > VFIO devices concurrently via multifd.
> > > > > > 
> > > > > > Do you think we can start with a simpler approach?
> > > > > 
> > > > > I don't think introducing a performance/scalability issue like that is
> > > > > a good thing, especially that we already have a design that avoids it.
> > > > > 
> > > > > Unfortunately, my current setup does not allow live migrating VMs with
> > > > > more than 4 VFs so I can't benchmark that.
> > > > 
> > > > /me wonders why benchmarking it requires more than 4 VFs.
> > > 
> > > My point here was that the scalability problem will most likely get more
> > > pronounced with more VFs.
> > > 
> > > > > 
> > > > > But I am almost certain that with more VFs the situation with devices being
> > > > > ready out-of-order will get even more likely.
> > > > 
> > > > If the config space is small, why loading it in sequence would be a
> > > > problem?
> > > > 
> > > > Have you measured how much time it needs to load one VF's config space that
> > > > you're using?  I suppose that's vfio_load_device_config_state() alone?
> > > 
> > > It's not the amount of data to load that matters here but that these address
> > > space operations are slow.
> > > 
> > > The whole config load takes ~70 ms per device - that's time equivalent
> > > of transferring 875 MiB of device state via a 100 GBit/s link.
> > 
> > What's the downtime of migration with 1/2/4 VFs?  I remember I saw some
> > data somewhere but it's not in the cover letter.  It'll be good to mention
> > these results in the cover letter when repost.
> 
> Downtimes with the device state transfer being disabled / enabled:
>             4 VFs   2 VFs    1 VF
> Disabled: 1783 ms  614 ms  283 ms
> Enabled:  1068 ms  434 ms  274 ms
> 
> Will add these numbers to the cover letter of the next patch set version.

Thanks.

> 
> > I'm guessing 70ms isn't a huge deal here, if your NIC has 128GB internal
> > device state to migrate.. but maybe I'm wrong.
> 
> It's ~100 MiB of device state per VF here.

Ouch..

I watched your kvm forum talk recording, I remember that's where I get that
128 number but probably get the unit wrong.. ok that makes sense.

> 
> And it's 70ms of downtime *per device*:
> so with 4 VF it's ~280ms of downtime taken by the config loads.
> That's a lot - with perfect parallelization this downtime should
> *reduce by* 210ms.

Yes, in this case it's a lot.  I wonder why it won't scale as good even
with your patchset.

Did you profile why?  I highly doubt in your case network is an issue, as
there's only 100MB per-dev data, so even on 10gbps it takes 100ms only to
transfer for each, while now assuming it can run concurrently.  I think you
mentioned you were using 100gbps, right?

Logically when with multiple threads, VFIO read()s should happen at least
concurrently per-device.  Have you checked that there's no kernel-side
global VFIO lock etc. that serializes portions of the threads read()s /
write()s on the VFIO fds?

It's just a pity that you went this far, added all these logics, but
without making it fully concurrent at least per device.

I'm OK if you want this in without that figured out, but if I were you I'll
probably try to dig a bit to at least know why.

> 
> > I also wonder whether you profiled a bit on how that 70ms contributes to
> > what is slow.
> 
> I think that's something we can do after we have parallel config loads
> and it turns out their downtime for some reason still scales strongly
> linearly with the number of VFIO devices (rather than taking roughly
> constant time regardless of the count of these devices if running perfectly
> in parallel).

Similarly, I wonder whether the config space load() can involve something
globally shared.  I'd also dig a bit here, but I'll leave that to you to
decide.

> 
> > > 
> > > > > 
> > > > > > So what I'm thinking could be very clean is, we just discussed about
> > > > > > MIG_CMD_SWITCHOVER and looks like you also think it's an OK approach.  I
> > > > > > wonder when with it why not we move one step further to have
> > > > > > MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
> > > > > > ready to send non-iterable".  It can be controlled by the same migration
> > > > > > property so we only send these two flags in 9.2+ machine types.
> > > > > > 
> > > > > > Then IIUC VFIO can send config data through main wire (just like most of
> > > > > > other pci devices! which is IMHO a good fit..) and on destination VFIO
> > > > > > holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.
> > > > > 
> > > > > Starting the config load only on MIG_CMD_SEND_NON_ITERABE would (in addition
> > > > > to the considerations above) also delay starting the config load until all
> > > > > iterable devices were read/transferred/loaded and also would complicate
> > > > > future efforts at loading that config data in parallel.
> > > > 
> > > > However I wonder whether we can keep it simple in that VFIO's config space
> > > > is still always saved in vfio_save_state().  I still think it's easier we
> > > > stick with the main channel whenever possible.  For this specific case, if
> > > > the config space is small I think it's tricky you bypass this with:
> > > > 
> > > >       if (migration->multifd_transfer) {
> > > >           /* Emit dummy NOP data */
> > > >           qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
> > > >           return;
> > > >       }
> > > > 
> > > > Then squash this as the tail of the iterable data.
> > > > 
> > > > On the src, I think it could use a per-device semaphore, so that iterable
> > > > save() thread will post() only if it finishes dumping all the data, then
> > > > that orders VFIO iterable data v.s. config space save().
> > > 
> > > In the future we want to not only transfer but also load the config data
> > > in parallel.
> > 
> > How feasible do you think this idea is?  E.g. does it involve BQL so far
> > (e.g. memory updates, others)?  What's still missing to make it concurrent?
> 
> My gut feeling is that it is feasible overall but it's too much of a rabbit
> hole for the first version of this device state transfer feature.
> 
> I think it will need some deeper QEMU core address space management changes,
> which need to be researched/developed/tested/reviewed/etc. on their own.
> 
> If it was an easy task I would have gladly included such support in this
> patch set version already for extra downtime reduction :)

Yes I understand.

Note that it doesn't need to be implemented and resolved in one shot, but I
wonder if it'll still be good to debug the issue and know where is not
scaling.

Considering that your design is fully concurrent as of now on iterable data
from QEMU side, it's less persuasive to provide perf numbers that still
don't scale that much; 1.78s -> 1.06s is a good improvement, but it
doesn't seem to solve the scalability issue that this whole series wanted
to address in general.

An extreme (bad) example is if VFIO has all ioctl()/read()/write() take a
global lock, then any work in QEMU trying to run things in parallel will be
in vain.  Such a patchset cannot be accepted because the other issue needs to
be resolved first.

Now it's in the middle of best/worst condition, where it did improve but it
still doesn't scale that well.  I think it can be accepted, but still I
feel like we're ignoring some of the real issues.  We can choose to ignore
the kernel saying that "it's too much to do together", but IMHO the issues
should be tackled in the other way round.. the normal case is one should
work out the kernel scalability issues, then QEMU should be on top.. Simply
because any kernel change that might scale >1 device save()/load() can
affect future QEMU change and design, not vice versa.

Again, I know you wished we make some progress, so I don't have a strong
opinion.  Just FYI.

> 
> > > 
> > > So going back to transferring this data serialized via the main migration
> > > channel would be taking a step back here.
> > 
> > If below holds true:
> > 
> >    - 70ms is still very small amount in the total downtime, and,
> > 
> >    - this can avoid the below load_finish() API
> > 
> > Then I'd go for it.. or again, at least the load_finish() needs change,
> > IMHO..
> 
> As I wrote above, it's not 70 ms total but 70 ms per device.
> 
> Also, even 70 ms is a lot, considering that the default downtime limit
> is 300 ms - with a single device that's nearly 1/4 of the limit already.
> 
> > > 
> > > By the way, we already have a serialization point in
> > > qemu_savevm_state_complete_precopy_iterable() after iterables have been sent -
> > > waiting for device state sending threads to finish their work.
> > > 
> > > Whether this thread_pool_wait() operation will be implemented using
> > > semaphores I'm not sure yet - will depend on how well this will fit other
> > > GThreadPool internals.
> > > 
> > > > On the dst, after a 2nd thought, MIG_CMD_SEND_NON_ITERABE may not work or
> > > > needed indeed, because multifd bypasses the main channel, so if we send
> > > > anything like MIG_CMD_SEND_NON_ITERABE on the main channel it won't
> > > > guarantee multifd load all complete.  However IIUC that can be used in a
> > > > similar way as the src qemu I mentioned above with a per-device semaphore,
> > > > so that only all the iterable data of this device loaded and applied to the
> > > > HW would it post(), before that, vfio_load_state() should wait() on that
> > > > sem waiting for data to ready (while multifd threads will be doing that
> > > > part).  I wonder whether we may reuse the multifd recv thread in the
> > > > initial version, so maybe we don't need any other threads on destination.
> > > > 
> > > > The load_finish() interface is currently not able to be reused right,
> > > > afaict.  Just have a look at its definition:
> > > > 
> > > >       /**
> > > >        * @load_finish
> > > >        *
> > > >        * Poll whether all asynchronous device state loading had finished.
> > > >        * Not called on the load failure path.
> > > >        *
> > > >        * Called while holding the qemu_loadvm_load_finish_ready_lock.
> > > >        *
> > > >        * If this method signals "not ready" then it might not be called
> > > >        * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> > > >        * while holding qemu_loadvm_load_finish_ready_lock.
> > > >        *
> > > >        * @opaque: data pointer passed to register_savevm_live()
> > > >        * @is_finished: whether the loading had finished (output parameter)
> > > >        * @errp: pointer to Error*, to store an error if it happens.
> > > >        *
> > > >        * Returns zero to indicate success and negative for error
> > > >        * It's not an error that the loading still hasn't finished.
> > > >        */
> > > >       int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> > > > 
> > > > It's over complicated on defining all its details:
> > > > 
> > > >     - Not re-entrant by default..
> > > 
> > > What do you mean by "re-entrant" here?
> > > 
> > > This handler is called only from single migration thread, so it cannot
> > > be re-entered anyway since the control doesn't return to the migration
> > > code until this handler exits (and obviously the handler won't call
> > > itself recursively).
> > 
> > I think it's not a good design to say "you can call this function once, but
> > not the 2nd time until you wait on a semaphore".
> 
> That's not exactly how this API is supposed to work.
> 
> I suspect that you took that "it might not be called again until
> qemu_loadvm_load_finish_ready_broadcast() is invoked" as prohibition
> from being called again until that signal is broadcast.
> 
> The intended meaning of that sentence was "it is possible that it won't
> be called again until qemu_loadvm_load_finish_ready_broadcast() is invoked".
> 
> In other words, the migration core is free to call this handler how
> many times the migration core wants.
> 
> But if the handler wants to be *sure* that it will get called by the
> migration core after the handler has returned "not ready" then it needs
> to arrange for load_finish_ready_broadcast() to be invoked somehow.

OK I see.

> 
> (..)
> > > 
> > > >       that I feel like perhaps can be replaced by a sem (then to drop the
> > > >       condvar)?
> > > 
> > > Once we have ability to load device config state outside main migration
> > > thread replacing "load_finish" handler with a semaphore should indeed be
> > > possible (that's internal migration API so there should be no issue
> > > removing it as not necessary anymore at this point).
> > > 
> > > But for now, the devices need to have ability to run their config load
> > > code on the main migration thread, and for that they need to be called
> > > from this handler "load_finish".
> > 
> > A sem seems a must here to notify the iterable data finished loading, but
> > that doesn't need to hook to the vmstate handler, but some post-process
> > tasks, like what we do around cpu_synchronize_all_post_init() time.
> > 
> > If per-device vmstate handler hook version of load_finish() is destined to
> > look as weird in this case, I'd rather consider a totally separate way to
> > enqueue some jobs that needs to be run after all vmstates loaded.  Then
> > after one VFIO device fully loads its data, it enqueues the task and post()
> > to one migration sem saying that "there's one post-process task, please run
> > it in migration thread".  There can be a total number of tasks registered
> > so that migration thread knows not to continue until these number of tasks
> > processed.  That counter can be part of vmstate handler, maybe, reporting
> > that "this vmstate handler has one post-process task".
> > 
> > Maybe you have other ideas, but please no, let's avoid this load_finish()
> > thing..
> 
> I can certainly implement the task-queuing approach instead of the
> load_finish() handler API if you like such approach more.

I have an even simpler solution now.  I think you can reuse precopy
notifiers.

You can add one new PRECOPY_NOTIFY_INCOMING_COMPLETE event, invoke it after
vmstate load all done.

As long as VFIO devices exist, VFIO can register with that event, then it
can do whatever it wants in the main loader thread with BQL held.

You can hide that sem post() / wait() all there, then it's completely VFIO
internal.  Then we leave vmstate handler alone; it just doesn't sound
suitable when the hooks need to be called out of order.

> 
> > I'd rather still see justifications showing that this 70ms really is
> > helpful.. I'd rather wish we have +70ms*Ndev downtime but drop this hook
> > until we have a clearer mind when all config space can be loaded
> > concurrently, for example.  So we start from simple.
> 
> As I wrote above, even 70ms for a single device is a lot considering the
> default downtime limit - and that's even more true if multiplied by
> multiple devices.
> 
> > > 
> > > >     - How qemu_loadvm_load_finish_ready_broadcast() interacts with all
> > > >       above..
> > > > 
> > > > So if you really think it matters to load whatever VFIO device who's
> > > > iterable data is ready first, then let's try come up with some better
> > > > interface..  I can try to think about it too, but please answer me
> > > > questions above so I can understand what I am missing on why that's
> > > > important.  Numbers could help, even if 4 VF and I wonder how much diff
> > > > there can be.  Mostly, I don't know why it's slow right now if it is; I
> > > > thought it should be pretty fast, at least not a concern in VFIO migration
> > > > world (which can take seconds of downtime or more..).
> > > > 
> > > > > IOW, it sounds more reasonable to me that no matter whether vfio will
> > > > support multifd, it'll be nice we stick with vfio_load_state() /
> > > > vfio_save_state() for config space, and hopefully it's also easier it
> > > > always go via the main channel to everyone.  In these two hooks, VFIO can
> > > > do whatever it wants to sync with other things (on src, sync with
> > > > concurrent thread pool saving iterable data and dumping things to multifd
> > > > channels; on dst, sync with multifd concurrent loads). I think it can
> > > > remove the requirement on the load_finish() interface completely.  Yes,
> > > > this can only load VFIO's pci config space one by one, but I think this is
> > > > much simpler, and I hope it's also not that slow, but I'm not sure.
> > > 
> > > To be clear, I made a following diagram describing how the patch set
> > > is supposed to work right now, including changing per-device
> > > VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE into a common MIG_CMD_SWITCHOVER.
> > > 
> > > Time flows on it left to right (->).
> > > 
> > > ----------- DIAGRAM START -----------
> > > Source overall flow:
> > > Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
> > > Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
> > > 
> > > Target overall flow:
> > > Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable -> config data load operations
> > > Multifd channels:                                       \ multifd device state (1) -> multifd config data read (1)
> > > 
> > > Target config data load operations flow:
> > > multifd config data read (1) -> config data load (2)
> > > 
> > > Notes:
> > > (1): per device threads running in parallel
> > 
> > Here I raised this question before, but I'll ask again: do you think we can
> > avoid using a separate thread on dest qemu, but reuse multifd recv threads?
> > 
> > Src probably needs its own threads because multifd sender threads takes
> > request, so it can't block on its own.
> > 
> > However dest qemu isn't like that, it's packet driven so I think maybe it's
> > ok VFIO directly loads the data in the multifd threads.  We may want to
> > have enough multifd threads to make sure IO still don't block much on the
> > NIC, but I think tuning the num of multifd threads should work in this
> > case.
> 
> We need to have the receiving threads decoupled from the VFIO device state
> loading threads at least because otherwise:
> 1) You can have a deadlock if device state for multiple devices arrives
> out of order, like here:
> 
> Time flows left to right (->).
> Multifd channel 1: (VFIO device 1 buffer 2) (VFIO device 2 buffer 1)
> Multifd channel 2: (VFIO device 2 buffer 2) (VFIO device 1 buffer 1)
> 
> Both channel receive/load threads would be stuck forever in this case,
> since they can't load buffer 2 for devices 1 and 2 until they load
> buffer 1 for each of these devices.
> 
> 2) If devices are loading buffers at different speeds you don't want
> to block the faster device from receiving new buffer just because
> the slower one hasn't finished its loading yet.

I don't see why it can't be avoided.  Let me draw this in columns.

How I picture this is:

   multifd recv thread 1                     multifd recv thread 2
   ---------------------                     ---------------------
   recv VFIO device 1 buffer 2             recv VFIO device 2 buffer 2
    -> found that (dev1, buf1) missing,      -> found that (dev2, buf1) missing,
       skip load                                skip load
   recv VFIO device 2 buffer 1             recv VFIO device 1 buffer 1 
    -> found that (dev2, buf1+buf2) ready,   -> found that (dev1, buf1+buf2) ready,
       load buf1+2 for dev2 here                load buf1+2 for dev1 here
                                               
Here right after one multifd thread recvs a buffer, it needs to be injected
into the cache array (with proper locking), so that whoever receives a full
series of those buffers will do the load (again, with proper locking..).

Would this not work?

> 
> > > (2): currently serialized (only one such operation running at a particular time), will hopefully be parallelized in the future
> > > ----------- DIAGRAM END -----------
> > > 
> > > Hope the diagram survived being pasted into an e-mail message.
> > > 
> > > One can see, that even now there's a bit of "low hanging fruit" of missing
> > > possible parallelism:
> > > It seems that the source could wait for multifd device state + multifd config
> > > data *after* non-iterables are sent rather than before as it is done
> > > currently - so they will be sent in parallel with multifd data.
> > 
> > Currently it's blocked by this chunk of code of yours:
> > 
> >      if (multifd_device_state) {
> >          ret = multifd_join_device_state_save_threads();
> >          if (ret) {
> >              qemu_file_set_error(f, ret);
> >              return -1;
> >          }
> >      }
> > 
> > If with your proposal that vfio config space sent via multifd channels,
> > indeed I don't see why it can't be moved to be after non-iterable save()
> > completes.  Is that what you implied as "low hanging fruit"?
> 
> Yes, exactly - to wait for save threads to finish only after non-iterables
> have already been saved.
> 
> By "low hanging fruit" I meant it should be a fairly easy change.
> 
> > [***]
> > 
> > > 
> > > Since written description is often prone to misunderstanding
> > > could you please annotate that diagram with your proposed new flow?
> > 
> > What I was suggesting (removing load_finish()) is mostly the same as what
> > you drew I think, especially on src:
> > 
> > ===============
> > Source overall flow:
> > Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
> > Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
> > ===============
> > 
> > In this case we can't do the optimization above [***], since what I
> > suggested requires VFIO's vfio_save_state() to dump the config space, so
> > the original order will be needed here.
> > 
> > While on dest, config data load will need to also load using vfio's
> > vfio_load_state() so it'll be invoked just like what we normally do with
> > non-iterable device states (so here "config data load operations" is part
> > of loading all non-iterable devices):
> > 
> > ===============
> > Target overall flow:                                                              (X)
> > Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable (multifd config data read -> config data load operations)
> > Multifd channels:                                       \ multifd device state load                                /
> >                                          (lower part done via multifd recv threads, not separate threads)
> > ===============
> > 
> > So here the ordering of (X) is not guarded by anything, however in
> > vfio_load_state() the device can sem_wait() on a semaphore that will only be
> > posted once this device's device state is fully loaded.  So it's not
> > completely serialized - "config data load operations" of DEV1 can still
> > happen concurrently with "multifd device state load" of DEV2.
> > 
> > Sorry, this might not be as clear as it's not easy to draw in the graph,
> > but I hope the words can help clarify what I meant.
> > 
> > If 70ms is not a major deal, I suggest we consider above approach, I think
> > it can simplify at least the vmstate handler API.  If 70ms matters, let's
> > try refactor load_finish() to something usable.
> 
> I understand your point here, however as I wrote above, I think that's too
> much downtime to "waste" so I will try to rework the load_finish() handler
> into the task-queuing approach as you suggested earlier.

Thanks.
Maciej S. Szmigiero Oct. 2, 2024, 8:11 p.m. UTC | #15
On 1.10.2024 23:30, Peter Xu wrote:
> On Tue, Oct 01, 2024 at 10:41:14PM +0200, Maciej S. Szmigiero wrote:
>> On 30.09.2024 23:57, Peter Xu wrote:
>>> On Mon, Sep 30, 2024 at 09:25:54PM +0200, Maciej S. Szmigiero wrote:
>>>> On 27.09.2024 02:53, Peter Xu wrote:
>>>>> On Fri, Sep 27, 2024 at 12:34:31AM +0200, Maciej S. Szmigiero wrote:
>>>>>> On 20.09.2024 18:45, Peter Xu wrote:
>>>>>>> On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>> On 19.09.2024 23:11, Peter Xu wrote:
>>>>>>>>> On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>>>> On 9.09.2024 22:03, Peter Xu wrote:
>>>>>>>>>>> On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>>>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>>>>>>>>>
>>>>>>>>>>>> load_finish SaveVMHandler allows migration code to poll whether
>>>>>>>>>>>> a device-specific asynchronous device state loading operation had finished.
>>>>>>>>>>>>
>>>>>>>>>>>> In order to avoid calling this handler needlessly the device is supposed
>>>>>>>>>>>> to notify the migration code of its possible readiness via a call to
>>>>>>>>>>>> qemu_loadvm_load_finish_ready_broadcast() while holding
>>>>>>>>>>>> qemu_loadvm_load_finish_ready_lock.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>        include/migration/register.h | 21 +++++++++++++++
>>>>>>>>>>>>        migration/migration.c        |  6 +++++
>>>>>>>>>>>>        migration/migration.h        |  3 +++
>>>>>>>>>>>>        migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
>>>>>>>>>>>>        migration/savevm.h           |  4 +++
>>>>>>>>>>>>        5 files changed, 86 insertions(+)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/include/migration/register.h b/include/migration/register.h
>>>>>>>>>>>> index 4a578f140713..44d8cf5192ae 100644
>>>>>>>>>>>> --- a/include/migration/register.h
>>>>>>>>>>>> +++ b/include/migration/register.h
>>>>>>>>>>>> @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
>>>>>>>>>>>>            int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
>>>>>>>>>>>>                                     Error **errp);
>>>>>>>>>>>> +    /**
>>>>>>>>>>>> +     * @load_finish
>>>>>>>>>>>> +     *
>>>>>>>>>>>> +     * Poll whether all asynchronous device state loading had finished.
>>>>>>>>>>>> +     * Not called on the load failure path.
>>>>>>>>>>>> +     *
>>>>>>>>>>>> +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
>>>>>>>>>>>> +     *
>>>>>>>>>>>> +     * If this method signals "not ready" then it might not be called
>>>>>>>>>>>> +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
>>>>>>>>>>>> +     * while holding qemu_loadvm_load_finish_ready_lock.
>>>>>>>>>>>
>>>>>>>>>>> [1]
>>>>>>>>>>>
>>>>>>>>>>>> +     *
>>>>>>>>>>>> +     * @opaque: data pointer passed to register_savevm_live()
>>>>>>>>>>>> +     * @is_finished: whether the loading had finished (output parameter)
>>>>>>>>>>>> +     * @errp: pointer to Error*, to store an error if it happens.
>>>>>>>>>>>> +     *
>>>>>>>>>>>> +     * Returns zero to indicate success and negative for error
>>>>>>>>>>>> +     * It's not an error that the loading still hasn't finished.
>>>>>>>>>>>> +     */
>>>>>>>>>>>> +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
>>>>>>>>>>>
>>>>>>>>>>> The load_finish() semantics is a bit weird, especially above [1] on "only
>>>>>>>>>>> allowed to be called once if ..." and also on the locks.
>>>>>>>>>>
>>>>>>>>>> The point of this remark is that a driver needs to call
>>>>>>>>>> qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
>>>>>>>>>> core to call its load_finish handler again.
>>>>>>>>>>
>>>>>>>>>>> It looks to me vfio_load_finish() also does the final load of the device.
>>>>>>>>>>>
>>>>>>>>>>> I wonder whether that final load can be done in the threads,
>>>>>>>>>>
>>>>>>>>>> Here, the problem is that current VFIO VMState has to be loaded from the main
>>>>>>>>>> migration thread as it internally calls QEMU core address space modification
>>>>>>>>>> methods which explode if called from another thread(s).
>>>>>>>>>
>>>>>>>>> Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
>>>>>>>>> BQL if possible, when that's ready then in your case here IIUC you can
>>>>>>>>> simply take BQL in whichever thread that loads it.. but yeah it's not ready
>>>>>>>>> at least..
>>>>>>>>
>>>>>>>> Yeah, long term we might want to work on making these QEMU core address space
>>>>>>>> modification methods somehow callable from multiple threads but that's
>>>>>>>> definitely not something for the initial patch set.
>>>>>>>>
>>>>>>>>> Would it be possible vfio_save_complete_precopy_async_thread_config_state()
>>>>>>>>> be done in VFIO's save_live_complete_precopy() through the main channel
>>>>>>>>> somehow?  IOW, does it rely on iterative data to be fetched first from
>>>>>>>>> kernel, or completely separate states?
>>>>>>>>
>>>>>>>> The device state data needs to be fully loaded first before "activating"
>>>>>>>> the device by loading its config state.
>>>>>>>>
>>>>>>>>> And just curious: how large is it
>>>>>>>>> normally (and I suppose this decides whether it's applicable to be sent via
>>>>>>>>> the main channel at all..)?
>>>>>>>>
>>>>>>>> Config data is *much* smaller than device state data - as far as I remember
>>>>>>>> it was on order of kilobytes.
>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> then after
>>>>>>>>>>> everything loaded the device post a semaphore telling the main thread to
>>>>>>>>>>> continue.  See e.g.:
>>>>>>>>>>>
>>>>>>>>>>>           if (migrate_switchover_ack()) {
>>>>>>>>>>>               qemu_loadvm_state_switchover_ack_needed(mis);
>>>>>>>>>>>           }
>>>>>>>>>>>
>>>>>>>>>>> IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
>>>>>>>>>>> when all things are loaded?  We can then get rid of this slightly awkward
>>>>>>>>>>> interface.  I had a feeling that things can be simplified (e.g., if the
>>>>>>>>>>> thread will take care of loading the final vmstate then the mutex is also
>>>>>>>>>>> not needed? etc.).
>>>>>>>>>>
>>>>>>>>>> With just a single call to switchover_ack_needed per VFIO device it would
>>>>>>>>>> need to do a blocking wait for the device buffers and config state load
>>>>>>>>>> to finish, therefore blocking other VFIO devices from potentially loading
>>>>>>>>>> their config state if they are ready to begin this operation earlier.
>>>>>>>>>
>>>>>>>>> I am not sure I get you here, loading VFIO device states (I mean, the
>>>>>>>>> non-iterable part) will need to be done sequentially IIUC due to what you
>>>>>>>>> said and should rely on BQL, so I don't know how that could happen
>>>>>>>>> concurrently for now.  But I think indeed BQL is a problem.
>>>>>>>> Consider that we have two VFIO devices (A and B), with the following order
>>>>>>>> of switchover_ack_needed handler calls for them: first A get this call,
>>>>>>>> once the call for A finishes then B gets this call.
>>>>>>>>
>>>>>>>> Now consider what happens if B had loaded all its buffers (in the loading
>>>>>>>> thread) and it is ready for its config load before A finished loading its
>>>>>>>> buffers.
>>>>>>>>
>>>>>>>> B has to wait idle in this situation (even though it could have been already
>>>>>>>> loading its config) since the switchover_ack_needed handler for A won't
>>>>>>>> return until A is fully done.
>>>>>>>
>>>>>>> This sounds like a performance concern, and I wonder how much this impacts
>>>>>>> the real workload (that you run a test and measure, with/without such
>>>>>>> concurrency) when we can save two devices in parallel anyway; I would
>>>>>>> expect the real diff is small due to the fact I mentioned that we save >1
>>>>>>> VFIO devices concurrently via multifd.
>>>>>>>
>>>>>>> Do you think we can start with a simpler approach?
>>>>>>
>>>>>> I don't think introducing a performance/scalability issue like that is
>>>>>> a good thing, especially that we already have a design that avoids it.
>>>>>>
>>>>>> Unfortunately, my current setup does not allow live migrating VMs with
>>>>>> more than 4 VFs so I can't benchmark that.
>>>>>
>>>>> /me wonders why benchmarking it requires more than 4 VFs.
>>>>
>>>> My point here was that the scalability problem will most likely get more
>>>> pronounced with more VFs.
>>>>
>>>>>>
>>>>>> But I am almost certain that with more VFs the situation with devices being
>>>>>> ready out-of-order will get even more likely.
>>>>>
>>>>> If the config space is small, why loading it in sequence would be a
>>>>> problem?
>>>>>
>>>>> Have you measured how much time it needs to load one VF's config space that
>>>>> you're using?  I suppose that's vfio_load_device_config_state() alone?
>>>>
>>>> It's not the amount of data to load that matters here but that these address
>>>> space operations are slow.
>>>>
>>>> The whole config load takes ~70 ms per device - that's time equivalent
>>>> of transferring 875 MiB of device state via a 100 GBit/s link.
>>>
>>> What's the downtime of migration with 1/2/4 VFs?  I remember I saw some
>>> data somewhere but it's not in the cover letter.  It'll be good to mention
>>> these results in the cover letter when repost.
>>
>> Downtimes with the device state transfer being disabled / enabled:
>>              4 VFs   2 VFs    1 VF
>> Disabled: 1783 ms  614 ms  283 ms
>> Enabled:  1068 ms  434 ms  274 ms
>>
>> Will add these numbers to the cover letter of the next patch set version.
> 
> Thanks.
> 
>>
>>> I'm guessing 70ms isn't a huge deal here, if your NIC has 128GB internal
>>> device state to migrate.. but maybe I'm wrong.
>>
>> It's ~100 MiB of device state per VF here.
> 
> Ouch..
> 
> I watched your kvm forum talk recording, I remember that's where I get that
> 128 number but probably get the unit wrong.. ok that makes sense.
> 
>>
>> And it's 70ms of downtime *per device*:
>> so with 4 VF it's ~280ms of downtime taken by the config loads.
>> That's a lot - with perfect parallelization this downtime should
>> *reduce by* 210ms.
> 
> Yes, in this case it's a lot.  I wonder why it won't scale as good even
> with your patchset.
> 
> Did you profile why?  I highly doubt in your case network is an issue, as
> there's only 100MB per-dev data, so even on 10gbps it takes 100ms only to
> transfer for each, while now assuming it can run concurrently.  I think you
> mentioned you were using 100gbps, right?

Right, these 2 test machines are connected via a 100 Gbps network.

> Logically when with multiple threads, VFIO read()s should happen at least
> concurrently per-device.  Have you checked that there's no kernel-side
> global VFIO lock etc. that serializes portions of the threads read()s /
> write()s on the VFIO fds?

For these devices the kernel side has been significantly improved a year ago:
https://lore.kernel.org/kvm/20230911093856.81910-1-yishaih@nvidia.com/

In the mlx5 driver the in-kernel device reading task (work) is separated
from the userspace (QEMU) read()ing task via a double/multi buffering scheme.

If there was indeed some global lock serializing all device accesses we
wouldn't be seeing that much improvement from this patch set as we are
seeing - especially that the improvement seems to *increase* with the
increased VF count in a single PF.

> It's just a pity that you went this far, added all these logics, but
> without making it fully concurrent at least per device.

AFAIK NVIDIA/Mellanox are continuously working on improving the mlx5 driver,
but to benefit from the driver parallelism we need parallelism in QEMU
too so the userspace won't become the serialization point/bottleneck.

In other words, it's kind of a chicken and egg problem.

That's why I want to preserve as much parallelism in this patch set as
possible to avoid accidental serialization which (even if not a problem
right now) may become the bottleneck at some point.

> I'm OK if you want this in without that figured out, but if I were you I'll
> probably try to dig a bit to at least know why.
> 
>>
>>> I also wonder whether you profiled a bit on how that 70ms contributes to
>>> what is slow.
>>
>> I think that's something we can do after we have parallel config loads
>> and it turns out their downtime for some reason still scales strongly
>> linearly with the number of VFIO devices (rather than taking roughly
>> constant time regardless of the count of these devices if running perfectly
>> in parallel).
> 
> Similarly, I wonder whether the config space load() can involve something
> globally shared.  I'd also dig a bit here, but I'll leave that to you to
> decide.

Making config loads thread-safe/parallelizable is definitely on my future
TODO list.

Just wanted to keep the amount of changes in the first version of this
patch set within reasonable bounds - one has to draw a line somewhere
otherwise we'll keep working on this patch set forever, with the
QEMU code being a moving target meanwhile.

>>
>>>>
>>>>>>
>>>>>>> So what I'm thinking could be very clean is, we just discussed about
>>>>>>> MIG_CMD_SWITCHOVER and looks like you also think it's an OK approach.  I
>>>>>>> wonder when with it why not we move one step further to have
>>>>>>> MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
>>>>>>> ready to send non-iterable".  It can be controlled by the same migration
>>>>>>> property so we only send these two flags in 9.2+ machine types.
>>>>>>>
>>>>>>> Then IIUC VFIO can send config data through main wire (just like most of
>>>>>>> other pci devices! which is IMHO a good fit..) and on destination VFIO
>>>>>>> holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.
>>>>>>
>>>>>> Starting the config load only on MIG_CMD_SEND_NON_ITERABE would (in addition
>>>>>> to the considerations above) also delay starting the config load until all
>>>>>> iterable devices were read/transferred/loaded and also would complicate
>>>>>> future efforts at loading that config data in parallel.
>>>>>
>>>>> However I wonder whether we can keep it simple in that VFIO's config space
>>>>> is still always saved in vfio_save_state().  I still think it's easier we
>>>>> stick with the main channel whenever possible.  For this specific case, if
>>>>> the config space is small I think it's tricky you bypass this with:
>>>>>
>>>>>        if (migration->multifd_transfer) {
>>>>>            /* Emit dummy NOP data */
>>>>>            qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>>>>>            return;
>>>>>        }
>>>>>
>>>>> Then squash this as the tail of the iterable data.
>>>>>
>>>>> On the src, I think it could use a per-device semaphore, so that iterable
>>>>> save() thread will post() only if it finishes dumping all the data, then
>>>>> that orders VFIO iterable data v.s. config space save().
>>>>
>>>> In the future we want to not only transfer but also load the config data
>>>> in parallel.
>>>
>>> How feasible do you think this idea is?  E.g. does it involve BQL so far
>>> (e.g. memory updates, others)?  What's still missing to make it concurrent?
>>
>> My gut feeling is that it is feasible overall but it's too much of a rabbit
>> hole for the first version of this device state transfer feature.
>>
>> I think it will need some deeper QEMU core address space management changes,
>> which need to be researched/developed/tested/reviewed/etc. on their own.
>>
>> If it was an easy task I would have gladly included such support in this
>> patch set version already for extra downtime reduction :)
> 
> Yes I understand.
> 
> Note that it doesn't need to be implemented and resolved in one shot, but I
> wonder if it'll still be good to debug the issue and know where is not
> scaling.
> 
> Considering that your design is fully concurrent as of now on iterable data
> from QEMU side, it's less persuasive to provide perf numbers that still
> doesn't scale that much; 1.78s -> 1.06s is a good improvement, but it
> doesn't seem to solve the scalability issue that this whole series wanted
> to address in general.
> 
> An extreme (bad) example is if VFIO has all ioctl()/read()/write() take a
> global lock, then any work in QEMU trying to run things in parallel will be
> in vain.  Such patchset cannot be accepted because the other issue needs to
> be resolved first.
> 
> Now it's in the middle of best/worst condition, where it did improve but it
> still doesn't scale that well.  I think it can be accepted, but still I
> feel like we're ignoring some of the real issues.  We can choose to ignore
> the kernel saying that "it's too much to do together", but IMHO the issues
> should be tackled in the other way round.. the normal case is one should
> work out the kernel scalability issues, then QEMU should be on top.. Simply
> because any kernel change that might scale >1 device save()/load() can
> affect future QEMU change and design, not vice versa.
> 
> Again, I know you wished we make some progress, so I don't have a strong
> opinion.  Just FYI.
> 

As I wrote above, the kernel side of things is being taken care of by
the mlx5 driver maintainers.

And these performance numbers suggest that there isn't some global lock
serializing all device accesses as otherwise it would quickly become
the bottleneck and we would be seeing diminishing improvement from
increased VF count instead of increased improvement.

(..)
>>>>
>>>>>        that I feel like perhaps can be replaced by a sem (then to drop the
>>>>>        condvar)?
>>>>
>>>> Once we have ability to load device config state outside main migration
>>>> thread replacing "load_finish" handler with a semaphore should indeed be
>>>> possible (that's internal migration API so there should be no issue
>>>> removing it as not necessary anymore at this point).
>>>>
>>>> But for now, the devices need to have ability to run their config load
>>>> code on the main migration thread, and for that they need to be called
>>>> from this handler "load_finish".
>>>
>>> A sem seems a must here to notify the iterable data finished loading, but
>>> that doesn't need to hook to the vmstate handler, but some post-process
>>> tasks, like what we do around cpu_synchronize_all_post_init() time.
>>>
>>> If per-device vmstate handler hook version of load_finish() is destined to
>>> look as weird in this case, I'd rather consider a totally separate way to
>>> enqueue some jobs that needs to be run after all vmstates loaded.  Then
>>> after one VFIO device fully loads its data, it enqueues the task and post()
>>> to one migration sem saying that "there's one post-process task, please run
>>> it in migration thread".  There can be a total number of tasks registered
>>> so that migration thread knows not to continue until these number of tasks
>>> processed.  That counter can be part of vmstate handler, maybe, reporting
>>> that "this vmstate handler has one post-process task".
>>>
>>> Maybe you have other ideas, but please no, let's avoid this load_finish()
>>> thing..
>>
>> I can certainly implement the task-queuing approach instead of the
>> load_finish() handler API if you like such approach more.
> 
> I have an even simpler solution now.  I think you can reuse precopy
> notifiers.
> 
> You can add one new PRECOPY_NOTIFY_INCOMING_COMPLETE event, invoke it after
> vmstate load all done.
> 
> As long as VFIO devices exist, VFIO can register with that event, then it
> can do whatever it wants in the main loader thread with BQL held.
> 
> You can hide that sem post() / wait() all there, then it's completely VFIO
> internal.  Then we leave vmstate handler alone; it just doesn't sound
> suitable when the hooks need to be called out of order.

I can certainly implement this functionality via a new
precopy_notify(PRECOPY_NOTIFY_INCOMING_COMPLETE) notifier, for example
by having a single notify handler registered by the VFIO driver, which
handler will be common to all VFIO devices.

This handler on the VFIO driver side will then take care of proper operation
ordering between the existing VFIO devices.

>>>>
>>>>>      - How qemu_loadvm_load_finish_ready_broadcast() interacts with all
>>>>>        above..
>>>>>
>>>>> So if you really think it matters to load whatever VFIO device who's
>>>>> iterable data is ready first, then let's try come up with some better
>>>>> interface..  I can try to think about it too, but please answer me
>>>>> questions above so I can understand what I am missing on why that's
>>>>> important.  Numbers could help, even if 4 VF and I wonder how much diff
>>>>> there can be.  Mostly, I don't know why it's slow right now if it is; I
>>>>> thought it should be pretty fast, at least not a concern in VFIO migration
>>>>> world (which can take seconds of downtime or more..).
>>>>>
>>>>> IOW, it sounds more reasonable to me that no matter whether vfio will
>>>>> support multifd, it'll be nice we stick with vfio_load_state() /
>>>>> vfio_save_state() for config space, and hopefully it's also easier it
>>>>> always go via the main channel to everyone.  In these two hooks, VFIO can
>>>>> do whatever it wants to sync with other things (on src, sync with
>>>>> concurrent thread pool saving iterable data and dumping things to multifd
>>>>> channels; on dst, sync with multifd concurrent loads). I think it can
>>>>> remove the requirement on the load_finish() interface completely.  Yes,
>>>>> this can only load VFIO's pci config space one by one, but I think this is
>>>>> much simpler, and I hope it's also not that slow, but I'm not sure.
>>>>
>>>> To be clear, I made a following diagram describing how the patch set
>>>> is supposed to work right now, including changing per-device
>>>> VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE into a common MIG_CMD_SWITCHOVER.
>>>>
>>>> Time flows on it left to right (->).
>>>>
>>>> ----------- DIAGRAM START -----------
>>>> Source overall flow:
>>>> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
>>>> Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
>>>>
>>>> Target overall flow:
>>>> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable -> config data load operations
>>>> Multifd channels:                                       \ multifd device state (1) -> multifd config data read (1)
>>>>
>>>> Target config data load operations flow:
>>>> multifd config data read (1) -> config data load (2)
>>>>
>>>> Notes:
>>>> (1): per device threads running in parallel
>>>
>>> Here I raised this question before, but I'll ask again: do you think we can
>>> avoid using a separate thread on dest qemu, but reuse multifd recv threads?
>>>
>>> Src probably needs its own threads because multifd sender threads takes
>>> request, so it can't block on its own.
>>>
>>> However dest qemu isn't like that, it's packet driven so I think maybe it's
>>> ok VFIO directly loads the data in the multifd threads.  We may want to
>>> have enough multifd threads to make sure IO still don't block much on the
>>> NIC, but I think tuning the num of multifd threads should work in this
>>> case.
>>
>> We need to have the receiving threads decoupled from the VFIO device state
>> loading threads at least because otherwise:
>> 1) You can have a deadlock if device state for multiple devices arrives
>> out of order, like here:
>>
>> Time flows left to right (->).
>> Multifd channel 1: (VFIO device 1 buffer 2) (VFIO device 2 buffer 1)
>> Multifd channel 2: (VFIO device 2 buffer 2) (VFIO device 1 buffer 1)
>>
>> Both channel receive/load threads would be stuck forever in this case,
>> since they can't load buffer 2 for devices 1 and 2 until they load
>> buffer 1 for each of these devices.
>>
>> 2) If devices are loading buffers at different speeds you don't want
>> to block the faster device from receiving new buffer just because
>> the slower one hasn't finished its loading yet.
> 
> I don't see why it can't be avoided.  Let me draw this in columns.
> 
> How I picture this is:
> 
>     multifd recv thread 1                     multifd recv thread 2
>     ---------------------                     ---------------------
>     recv VFIO device 1 buffer 2             recv VFIO device 2 buffer 2
>      -> found that (dev1, buf1) missing,      -> found that (dev2, buf1) missing,
>         skip load                                skip load
>     recv VFIO device 2 buffer 1             recv VFIO device 1 buffer 1
>      -> found that (dev2, buf1+buf2) ready,   -> found that (dev1, buf1+buf2) ready,
>         load buf1+2 for dev2 here                load buf1+2 for dev1 here
>                                                 
> Here right after one multifd thread recvs a buffer, it needs to be injected
> into the cache array (with proper locking), so that whoever receives a full
> series of those buffers will do the load (again, with proper locking..).
> 
> Would this not work?
> 

For sure but that's definitely more complicated logic than just having
a simple device loading thread that naturally loads incoming buffers
for that device in-order.
That thread isn't even in the purview of the migration code since
it's a VFIO driver internal implementation detail.

And we'd still lose parallelism if it happens that two buffers that
are to be loaded next for two devices happen to arrive in the same
multifd channel:
Multifd channel 1: (VFIO device 1 buffer 1) (VFIO device 2 buffer 1)
Multifd channel 2: (VFIO device 2 buffer 2) (VFIO device 1 buffer 2)

Now device 2 buffer 1 has to wait until loading device 1 buffer 1
finishes even though with the decoupled loading thread implementation
from this patch set these would be loaded in parallel.

> 
> Thanks.
> 

Thanks,
Maciej
Peter Xu Oct. 2, 2024, 9:25 p.m. UTC | #16
On Wed, Oct 02, 2024 at 10:11:33PM +0200, Maciej S. Szmigiero wrote:
> On 1.10.2024 23:30, Peter Xu wrote:
> > On Tue, Oct 01, 2024 at 10:41:14PM +0200, Maciej S. Szmigiero wrote:
> > > On 30.09.2024 23:57, Peter Xu wrote:
> > > > On Mon, Sep 30, 2024 at 09:25:54PM +0200, Maciej S. Szmigiero wrote:
> > > > > On 27.09.2024 02:53, Peter Xu wrote:
> > > > > > On Fri, Sep 27, 2024 at 12:34:31AM +0200, Maciej S. Szmigiero wrote:
> > > > > > > On 20.09.2024 18:45, Peter Xu wrote:
> > > > > > > > On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > > > On 19.09.2024 23:11, Peter Xu wrote:
> > > > > > > > > > On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > > > > > On 9.09.2024 22:03, Peter Xu wrote:
> > > > > > > > > > > > On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > > > > > > > From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> > > > > > > > > > > > > 
> > > > > > > > > > > > > load_finish SaveVMHandler allows migration code to poll whether
> > > > > > > > > > > > > a device-specific asynchronous device state loading operation had finished.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > In order to avoid calling this handler needlessly the device is supposed
> > > > > > > > > > > > > to notify the migration code of its possible readiness via a call to
> > > > > > > > > > > > > qemu_loadvm_load_finish_ready_broadcast() while holding
> > > > > > > > > > > > > qemu_loadvm_load_finish_ready_lock.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> > > > > > > > > > > > > ---
> > > > > > > > > > > > >        include/migration/register.h | 21 +++++++++++++++
> > > > > > > > > > > > >        migration/migration.c        |  6 +++++
> > > > > > > > > > > > >        migration/migration.h        |  3 +++
> > > > > > > > > > > > >        migration/savevm.c           | 52 ++++++++++++++++++++++++++++++++++++
> > > > > > > > > > > > >        migration/savevm.h           |  4 +++
> > > > > > > > > > > > >        5 files changed, 86 insertions(+)
> > > > > > > > > > > > > 
> > > > > > > > > > > > > diff --git a/include/migration/register.h b/include/migration/register.h
> > > > > > > > > > > > > index 4a578f140713..44d8cf5192ae 100644
> > > > > > > > > > > > > --- a/include/migration/register.h
> > > > > > > > > > > > > +++ b/include/migration/register.h
> > > > > > > > > > > > > @@ -278,6 +278,27 @@ typedef struct SaveVMHandlers {
> > > > > > > > > > > > >            int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
> > > > > > > > > > > > >                                     Error **errp);
> > > > > > > > > > > > > +    /**
> > > > > > > > > > > > > +     * @load_finish
> > > > > > > > > > > > > +     *
> > > > > > > > > > > > > +     * Poll whether all asynchronous device state loading had finished.
> > > > > > > > > > > > > +     * Not called on the load failure path.
> > > > > > > > > > > > > +     *
> > > > > > > > > > > > > +     * Called while holding the qemu_loadvm_load_finish_ready_lock.
> > > > > > > > > > > > > +     *
> > > > > > > > > > > > > +     * If this method signals "not ready" then it might not be called
> > > > > > > > > > > > > +     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
> > > > > > > > > > > > > +     * while holding qemu_loadvm_load_finish_ready_lock.
> > > > > > > > > > > > 
> > > > > > > > > > > > [1]
> > > > > > > > > > > > 
> > > > > > > > > > > > > +     *
> > > > > > > > > > > > > +     * @opaque: data pointer passed to register_savevm_live()
> > > > > > > > > > > > > +     * @is_finished: whether the loading had finished (output parameter)
> > > > > > > > > > > > > +     * @errp: pointer to Error*, to store an error if it happens.
> > > > > > > > > > > > > +     *
> > > > > > > > > > > > > +     * Returns zero to indicate success and negative for error
> > > > > > > > > > > > > +     * It's not an error that the loading still hasn't finished.
> > > > > > > > > > > > > +     */
> > > > > > > > > > > > > +    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
> > > > > > > > > > > > 
> > > > > > > > > > > > The load_finish() semantics is a bit weird, especially above [1] on "only
> > > > > > > > > > > > allowed to be called once if ..." and also on the locks.
> > > > > > > > > > > 
> > > > > > > > > > > The point of this remark is that a driver needs to call
> > > > > > > > > > > qemu_loadvm_load_finish_ready_broadcast() if it wants for the migration
> > > > > > > > > > > core to call its load_finish handler again.
> > > > > > > > > > > 
> > > > > > > > > > > > It looks to me vfio_load_finish() also does the final load of the device.
> > > > > > > > > > > > 
> > > > > > > > > > > > I wonder whether that final load can be done in the threads,
> > > > > > > > > > > 
> > > > > > > > > > > Here, the problem is that current VFIO VMState has to be loaded from the main
> > > > > > > > > > > migration thread as it internally calls QEMU core address space modification
> > > > > > > > > > > methods which explode if called from another thread(s).
> > > > > > > > > > 
> > > > > > > > > > Ahh, I see.  I'm trying to make dest qemu loadvm in a thread too and yield
> > > > > > > > > > BQL if possible, when that's ready then in your case here IIUC you can
> > > > > > > > > > simply take BQL in whichever thread that loads it.. but yeah it's not ready
> > > > > > > > > > at least..
> > > > > > > > > 
> > > > > > > > > Yeah, long term we might want to work on making these QEMU core address space
> > > > > > > > > modification methods somehow callable from multiple threads but that's
> > > > > > > > > definitely not something for the initial patch set.
> > > > > > > > > 
> > > > > > > > > > Would it be possible vfio_save_complete_precopy_async_thread_config_state()
> > > > > > > > > > be done in VFIO's save_live_complete_precopy() through the main channel
> > > > > > > > > > somehow?  IOW, does it rely on iterative data to be fetched first from
> > > > > > > > > > kernel, or completely separate states?
> > > > > > > > > 
> > > > > > > > > The device state data needs to be fully loaded first before "activating"
> > > > > > > > > the device by loading its config state.
> > > > > > > > > 
> > > > > > > > > > And just curious: how large is it
> > > > > > > > > > normally (and I suppose this decides whether it's applicable to be sent via
> > > > > > > > > > the main channel at all..)?
> > > > > > > > > 
> > > > > > > > > Config data is *much* smaller than device state data - as far as I remember
> > > > > > > > > it was on order of kilobytes.
> > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > > then after
> > > > > > > > > > > > everything loaded the device post a semaphore telling the main thread to
> > > > > > > > > > > > continue.  See e.g.:
> > > > > > > > > > > > 
> > > > > > > > > > > >           if (migrate_switchover_ack()) {
> > > > > > > > > > > >               qemu_loadvm_state_switchover_ack_needed(mis);
> > > > > > > > > > > >           }
> > > > > > > > > > > > 
> > > > > > > > > > > > IIUC, VFIO can register load_complete_ack similarly so it only sem_post()
> > > > > > > > > > > > when all things are loaded?  We can then get rid of this slightly awkward
> > > > > > > > > > > > interface.  I had a feeling that things can be simplified (e.g., if the
> > > > > > > > > > > > thread will take care of loading the final vmstate then the mutex is also
> > > > > > > > > > > > not needed? etc.).
> > > > > > > > > > > 
> > > > > > > > > > > With just a single call to switchover_ack_needed per VFIO device it would
> > > > > > > > > > > need to do a blocking wait for the device buffers and config state load
> > > > > > > > > > > to finish, therefore blocking other VFIO devices from potentially loading
> > > > > > > > > > > their config state if they are ready to begin this operation earlier.
> > > > > > > > > > 
> > > > > > > > > > I am not sure I get you here, loading VFIO device states (I mean, the
> > > > > > > > > > non-iterable part) will need to be done sequentially IIUC due to what you
> > > > > > > > > > said and should rely on BQL, so I don't know how that could happen
> > > > > > > > > > concurrently for now.  But I think indeed BQL is a problem.
> > > > > > > > > Consider that we have two VFIO devices (A and B), with the following order
> > > > > > > > > of switchover_ack_needed handler calls for them: first A get this call,
> > > > > > > > > once the call for A finishes then B gets this call.
> > > > > > > > > 
> > > > > > > > > Now consider what happens if B had loaded all its buffers (in the loading
> > > > > > > > > thread) and it is ready for its config load before A finished loading its
> > > > > > > > > buffers.
> > > > > > > > > 
> > > > > > > > > B has to wait idle in this situation (even though it could have been already
> > > > > > > > > loading its config) since the switchover_ack_needed handler for A won't
> > > > > > > > > return until A is fully done.
> > > > > > > > 
> > > > > > > > This sounds like a performance concern, and I wonder how much this impacts
> > > > > > > > the real workload (that you run a test and measure, with/without such
> > > > > > > > concurrency) when we can save two devices in parallel anyway; I would
> > > > > > > > expect the real diff is small due to the fact I mentioned that we save >1
> > > > > > > > VFIO devices concurrently via multifd.
> > > > > > > > 
> > > > > > > > Do you think we can start with a simpler approach?
> > > > > > > 
> > > > > > > I don't think introducing a performance/scalability issue like that is
> > > > > > > a good thing, especially that we already have a design that avoids it.
> > > > > > > 
> > > > > > > Unfortunately, my current setup does not allow live migrating VMs with
> > > > > > > more than 4 VFs so I can't benchmark that.
> > > > > > 
> > > > > > /me wonders why benchmarking it requires more than 4 VFs.
> > > > > 
> > > > > My point here was that the scalability problem will most likely get more
> > > > > pronounced with more VFs.
> > > > > 
> > > > > > > 
> > > > > > > > But I am almost certain that with more VFs the situation with devices being
> > > > > > > ready out-of-order will get even more likely.
> > > > > > 
> > > > > > If the config space is small, why loading it in sequence would be a
> > > > > > problem?
> > > > > > 
> > > > > > Have you measured how much time it needs to load one VF's config space that
> > > > > > you're using?  I suppose that's vfio_load_device_config_state() alone?
> > > > > 
> > > > > > It's not the amount of data to load that matters here but that these address
> > > > > space operations are slow.
> > > > > 
> > > > > The whole config load takes ~70 ms per device - that's time equivalent
> > > > > of transferring 875 MiB of device state via a 100 GBit/s link.
> > > > 
> > > > What's the downtime of migration with 1/2/4 VFs?  I remember I saw some
> > > > data somewhere but it's not in the cover letter.  It'll be good to mention
> > > > these results in the cover letter when repost.
> > > 
> > > Downtimes with the device state transfer being disabled / enabled:
> > >              4 VFs   2 VFs    1 VF
> > > Disabled: 1783 ms  614 ms  283 ms
> > > Enabled:  1068 ms  434 ms  274 ms
> > > 
> > > Will add these numbers to the cover letter of the next patch set version.
> > 
> > Thanks.
> > 
> > > 
> > > > I'm guessing 70ms isn't a huge deal here, if your NIC has 128GB internal
> > > > device state to migrate.. but maybe I'm wrong.
> > > 
> > > It's ~100 MiB of device state per VF here.
> > 
> > Ouch..
> > 
> > I watched your kvm forum talk recording, I remember that's where I get that
> > 128 number but probably get the unit wrong.. ok that makes sense.
> > 
> > > 
> > > And it's 70ms of downtime *per device*:
> > > so with 4 VF it's ~280ms of downtime taken by the config loads.
> > > That's a lot - with perfect parallelization this downtime should
> > > *reduce by* 210ms.
> > 
> > Yes, in this case it's a lot.  I wonder why it won't scale as good even
> > with your patchset.
> > 
> > Did you profile why?  I highly doubt in your case network is an issue, as
> > there's only 100MB per-dev data, so even on 10gbps it takes 100ms only to
> > transfer for each, while now assuming it can run concurrently.  I think you
> > mentioned you were using 100gbps, right?
> 
> Right, these 2 test machines are connected via a 100 Gbps network.
> 
> > Logically when with multiple threads, VFIO read()s should happen at least
> > concurrently per-device.  Have you checked that there's no kernel-side
> > global VFIO lock etc. that serializes portions of the threads read()s /
> > write()s on the VFIO fds?
> 
> For these devices the kernel side has been significantly improved a year ago:
> https://lore.kernel.org/kvm/20230911093856.81910-1-yishaih@nvidia.com/
> 
> In the mlx5 driver the in-kernel device reading task (work) is separated
> from the userspace (QEMU) read()ing task via a double/multi buffering scheme.
> 
> If there was indeed some global lock serializing all device accesses we
> wouldn't be seeing that much improvement from this patch set as we are
> seeing - especially that the improvement seems to *increase* with the
> increased VF count in a single PF.
> 
> > It's just a pity that you went this far, added all these logics, but
> > without making it fully concurrent at least per device.
> 
> AFAIK NVIDIA/Mellanox are continuously working on improving the mlx5 driver,
> but to benefit from the driver parallelism we need parallelism in QEMU
> too so the userspace won't become the serialization point/bottleneck.
> 
> In other words, it's kind of a chicken and egg problem.
> 
> That's why I want to preserve as much parallelism in this patch set as
> possible to avoid accidental serialization which (even if not a problem
> right now) may become the bottleneck at some point.
> 
> > I'm OK if you want this in without that figured out, but if I were you I'll
> > probably try to dig a bit to at least know why.
> > 
> > > 
> > > > I also wonder whether you profiled a bit on how that 70ms contributes to
> > > > what is slow.
> > > 
> > > I think that's something we can do after we have parallel config loads
> > > and it turns out their downtime for some reason still scales strongly
> > > linearly with the number of VFIO devices (rather than taking roughly
> > > constant time regardless of the count of these devices if running perfectly
> > > in parallel).
> > 
> > Similarly, I wonder whether the config space load() can involve something
> > globally shared.  I'd also dig a bit here, but I'll leave that to you to
> > decide.
> 
> Making config loads thread-safe/parallelizable is definitely on my future
> TODO list.
> 
> Just wanted to keep the amount of changes in the first version of this
> patch set within reasonable bounds - one has to draw a line somewhere
> otherwise we'll keep working on this patch set forever, with the
> QEMU code being a moving target meanwhile.
> 
> > > 
> > > > > 
> > > > > > > 
> > > > > > > > So what I'm thinking could be very clean is, we just discussed about
> > > > > > > > MIG_CMD_SWITCHOVER and looks like you also think it's an OK approach.  I
> > > > > > > > wonder when with it why not we move one step further to have
> > > > > > > > MIG_CMD_SEND_NON_ITERABE just to mark that "iterable devices all done,
> > > > > > > > ready to send non-iterable".  It can be controlled by the same migration
> > > > > > > > property so we only send these two flags in 9.2+ machine types.
> > > > > > > > 
> > > > > > > > Then IIUC VFIO can send config data through main wire (just like most of
> > > > > > > > other pci devices! which is IMHO a good fit..) and on destination VFIO
> > > > > > > > holds off loading them until passing the MIG_CMD_SEND_NON_ITERABE phase.
> > > > > > > 
> > > > > > > Starting the config load only on MIG_CMD_SEND_NON_ITERABE would (in addition
> > > > > > > to the considerations above) also delay starting the config load until all
> > > > > > > iterable devices were read/transferred/loaded and also would complicate
> > > > > > > future efforts at loading that config data in parallel.
> > > > > > 
> > > > > > However I wonder whether we can keep it simple in that VFIO's config space
> > > > > > is still always saved in vfio_save_state().  I still think it's easier we
> > > > > > stick with the main channel whenever possible.  For this specific case, if
> > > > > > the config space is small I think it's tricky you bypass this with:
> > > > > > 
> > > > > >        if (migration->multifd_transfer) {
> > > > > >            /* Emit dummy NOP data */
> > > > > >            qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
> > > > > >            return;
> > > > > >        }
> > > > > > 
> > > > > > Then squash this as the tail of the iterable data.
> > > > > > 
> > > > > > On the src, I think it could use a per-device semaphore, so that iterable
> > > > > > save() thread will post() only if it finishes dumping all the data, then
> > > > > > that orders VFIO iterable data v.s. config space save().
> > > > > 
> > > > > In the future we want to not only transfer but also load the config data
> > > > > in parallel.
> > > > 
> > > > How feasible do you think this idea is?  E.g. does it involve BQL so far
> > > > (e.g. memory updates, others)?  What's still missing to make it concurrent?
> > > 
> > > My gut feeling is that is feasible overall but it's too much of a rabbit
> > > hole for the first version of this device state transfer feature.
> > > 
> > > I think it will need some deeper QEMU core address space management changes,
> > > which need to be researched/developed/tested/reviewed/etc. on their own.
> > > 
> > > If it was an easy task I would have gladly included such support in this
> > > patch set version already for extra downtime reduction :)
> > 
> > Yes I understand.
> > 
> > Note that it doesn't need to be implemented and resolved in one shot, but I
> > wonder if it'll still be good to debug the issue and know where is not
> > scaling.
> > 
> > Considering that your design is fully concurrent as of now on iterable data
> > from QEMU side, it's less persuasive to provide perf numbers that still
> > doesn't scale that much; 1.78s -> 1.06s is a good improvement, but it
> > doesn't seem to solve the scalability issue that this whole series wanted
> > to address in general.
> > 
> > An extreme (bad) example is if VFIO has all ioctl()/read()/write() take a
> > global lock, then any work in QEMU trying to run things in parallel will be
> > in vain.  Such patchset cannot be accepted because the other issue needs to
> > be resolved first.
> > 
> > Now it's in the middle of best/worst condition, where it did improve but it
> > still doesn't scale that well.  I think it can be accepted, but still I
> > feel like we're ignoring some of the real issues.  We can choose to ignore
> > the kernel saying that "it's too much to do together", but IMHO the issues
> > should be tackled in the other way round.. the normal case is one should
> > work out the kernel scalability issues, then QEMU should be on top.. Simply
> > because any kernel change that might scale >1 device save()/load() can
> > affect future QEMU change and design, not vice versa.
> > 
> > Again, I know you wished we make some progress, so I don't have a strong
> > opinion.  Just FYI.
> > 
> 
> As I wrote above, the kernel side of things is being taken care of by
> the mlx5 driver maintainers.
> 
> And these performance numbers suggest that there isn't some global lock
> serializing all device accesses as otherwise it would quickly become
> the bottleneck and we would be seeing diminishing improvement from
> increased VF count instead of increased improvement.

Personally I am not satisfied with scaling with these numbers..

  1VF       2VFs      4VFs
  274 ms -> 434 ms -> 1068 ms

The lock doesn't need to be as stupid as a global lock that all ioctl()s
take and it might not be as obvious that we can easily see.  It can hide
internally, it can be not in the form of a lock at all.

1068 is almost 4x of 274 here, that's really not scalable at all even if it
is improvement for sure..  I still feel like something is off.  If you
think kernel isn't the bottleneck, I am actually more curious on why,
especially if that could be relevant to the qemu design.

> 
> (..)
> > > > > 
> > > > > >        that I feel like perhaps can be replaced by a sem (then to drop the
> > > > > >        condvar)?
> > > > > 
> > > > > Once we have ability to load device config state outside main migration
> > > > > thread replacing "load_finish" handler with a semaphore should indeed be
> > > > > possible (that's internal migration API so there should be no issue
> > > > > removing it as not necessary anymore at this point).
> > > > > 
> > > > > But for now, the devices need to have ability to run their config load
> > > > > code on the main migration thread, and for that they need to be called
> > > > > from this handler "load_finish".
> > > > 
> > > > A sem seems a must here to notify the iterable data finished loading, but
> > > > that doesn't need to hook to the vmstate handler, but some post-process
> > > > tasks, like what we do around cpu_synchronize_all_post_init() time.
> > > > 
> > > > If per-device vmstate handler hook version of load_finish() is destined to
> > > > look as weird in this case, I'd rather consider a totally separate way to
> > > > enqueue some jobs that needs to be run after all vmstates loaded.  Then
> > > > after one VFIO device fully loads its data, it enqueues the task and post()
> > > > to one migration sem saying that "there's one post-process task, please run
> > > > it in migration thread".  There can be a total number of tasks registered
> > > > so that migration thread knows not to continue until these number of tasks
> > > > processed.  That counter can be part of vmstate handler, maybe, reporting
> > > > that "this vmstate handler has one post-process task".
> > > > 
> > > > Maybe you have other ideas, but please no, let's avoid this load_finish()
> > > > thing..
> > > 
> > > I can certainly implement the task-queuing approach instead of the
> > > load_finish() handler API if you like such approach more.
> > 
> > I have an even simpler solution now.  I think you can reuse precopy
> > notifiers.
> > 
> > You can add one new PRECOPY_NOTIFY_INCOMING_COMPLETE event, invoke it after
> > vmstate load all done.
> > 
> > As long as VFIO devices exist, VFIO can register with that event, then it
> > can do whatever it wants in the main loader thread with BQL held.
> > 
> > You can hide that sem post() / wait() all there, then it's completely VFIO
> > internal.  Then we leave vmstate handler alone; it just doesn't sound
> > suitable when the hooks need to be called out of order.
> 
> I can certainly implement this functionality via a new
> precopy_notify(PRECOPY_NOTIFY_INCOMING_COMPLETE) notifier, for example
> by having a single notify handler registered by the VFIO driver, which
> handler will be common to all VFIO devices.
> 
> This handler on the VFIO driver side will then take care of proper operation
> ordering between the existing VFIO devices.

Great!

> 
> > > > > 
> > > > > >      - How qemu_loadvm_load_finish_ready_broadcast() interacts with all
> > > > > >        above..
> > > > > > 
> > > > > > > So if you really think it matters to load whatever VFIO device whose
> > > > > > iterable data is ready first, then let's try come up with some better
> > > > > > interface..  I can try to think about it too, but please answer me
> > > > > > questions above so I can understand what I am missing on why that's
> > > > > > important.  Numbers could help, even if 4 VF and I wonder how much diff
> > > > > > there can be.  Mostly, I don't know why it's slow right now if it is; I
> > > > > > thought it should be pretty fast, at least not a concern in VFIO migration
> > > > > > world (which can take seconds of downtime or more..).
> > > > > > 
> > > > > > > IOW, it sounds more reasonable to me that no matter whether vfio will
> > > > > > support multifd, it'll be nice we stick with vfio_load_state() /
> > > > > > vfio_save_state() for config space, and hopefully it's also easier it
> > > > > > always go via the main channel to everyone.  In these two hooks, VFIO can
> > > > > > do whatever it wants to sync with other things (on src, sync with
> > > > > > concurrent thread pool saving iterable data and dumping things to multifd
> > > > > > channels; on dst, sync with multifd concurrent loads). I think it can
> > > > > > remove the requirement on the load_finish() interface completely.  Yes,
> > > > > > this can only load VFIO's pci config space one by one, but I think this is
> > > > > > much simpler, and I hope it's also not that slow, but I'm not sure.
> > > > > 
> > > > > To be clear, I made a following diagram describing how the patch set
> > > > > is supposed to work right now, including changing per-device
> > > > > VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE into a common MIG_CMD_SWITCHOVER.
> > > > > 
> > > > > Time flows on it left to right (->).
> > > > > 
> > > > > ----------- DIAGRAM START -----------
> > > > > Source overall flow:
> > > > > Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
> > > > > Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
> > > > > 
> > > > > Target overall flow:
> > > > > Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable -> config data load operations
> > > > > Multifd channels:                                       \ multifd device state (1) -> multifd config data read (1)
> > > > > 
> > > > > Target config data load operations flow:
> > > > > multifd config data read (1) -> config data load (2)
> > > > > 
> > > > > Notes:
> > > > > (1): per device threads running in parallel
> > > > 
> > > > Here I raised this question before, but I'll ask again: do you think we can
> > > > avoid using a separate thread on dest qemu, but reuse multifd recv threads?
> > > > 
> > > > Src probably needs its own threads because multifd sender threads takes
> > > > request, so it can't block on its own.
> > > > 
> > > > However dest qemu isn't like that, it's packet driven so I think maybe it's
> > > > ok VFIO directly loads the data in the multifd threads.  We may want to
> > > > have enough multifd threads to make sure IO still don't block much on the
> > > > NIC, but I think tuning the num of multifd threads should work in this
> > > > case.
> > > 
> > > We need to have the receiving threads decoupled from the VFIO device state
> > > loading threads at least because otherwise:
> > > 1) You can have a deadlock if device state for multiple devices arrives
> > > out of order, like here:
> > > 
> > > Time flows left to right (->).
> > > Multifd channel 1: (VFIO device 1 buffer 2) (VFIO device 2 buffer 1)
> > > Multifd channel 2: (VFIO device 2 buffer 2) (VFIO device 1 buffer 1)
> > > 
> > > Both channel receive/load threads would be stuck forever in this case,
> > > since they can't load buffer 2 for devices 1 and 2 until they load
> > > buffer 1 for each of these devices.
> > > 
> > > 2) If devices are loading buffers at different speeds you don't want
> > > to block the faster device from receiving new buffer just because
> > > the slower one hasn't finished its loading yet.
> > 
> > I don't see why it can't be avoided.  Let me draw this in columns.
> > 
> > How I picture this is:
> > 
> >     multifd recv thread 1                     multifd recv thread 2
> >     ---------------------                     ---------------------
> >     recv VFIO device 1 buffer 2             recv VFIO device 2 buffer 2
> >      -> found that (dev1, buf1) missing,      -> found that (dev2, buf1) missing,
> >         skip load                                skip load
> >     recv VFIO device 2 buffer 1             recv VFIO device 1 buffer 1
> >      -> found that (dev2, buf1+buf2) ready,   -> found that (dev1, buf1+buf2) ready,
> >         load buf1+2 for dev2 here                load buf1+2 for dev1 here
> > Here right after one multifd thread recvs a buffer, it needs to be injected
> > into the cache array (with proper locking), so that whoever receives a full
> > series of those buffers will do the load (again, with proper locking..).
> > 
> > Would this not work?
> > 
> 
> For sure but that's definitely more complicated logic than just having
> a simple device loading thread that naturally loads incoming buffers
> for that device in-order.

I thought it was mostly your logic that was implemented, but yeah I didn't
check too much details on VFIO side.

> That thread isn't even in the purview of the migration code since
> it's a VFIO driver internal implementation detail.
> 
> And we'd still lose parallelism if it happens that two buffers that
> are to be loaded next for two devices happen to arrive in the same
> multifd channel:
> Multifd channel 1: (VFIO device 1 buffer 1) (VFIO device 2 buffer 1)
> Multifd channel 2: (VFIO device 2 buffer 2) (VFIO device 1 buffer 2)
> 
> Now device 2 buffer 1 has to wait until loading device 1 buffer 1
> finishes even though with the decoupled loading thread implementation
> from this patch set these would be loaded in parallel.

Well it's possible indeed, but with normally 8 or more threads being there,
possibility of having such dependency is low.

Cedric has similar comment on starting from simple on the thread model.
I'd still suggest if ever possible we try reuse multifd recv threads; I do
expect the results should be similar.

I am sorry to ask for this, Fabiano already blames me for this, but..
logically it'll be best we use no new thread in the series, then one patch
on top with your new thread solution to justify its performance benefits
and worthwhile to having those threads at all.

PS: I'd suggest if you really need those threads it should still be managed
by migration framework like the src thread pool.  Sorry I'm pretty stubborn
on this, especially after I notice we have query-migrationthreads API just
recently.. even if now I'm not sure whether we should remove that API.  I
assume that shouldn't need much change, even if necessary.

Thanks,
Maciej S. Szmigiero Oct. 3, 2024, 8:34 p.m. UTC | #17
On 2.10.2024 23:25, Peter Xu wrote:
> On Wed, Oct 02, 2024 at 10:11:33PM +0200, Maciej S. Szmigiero wrote:
>> On 1.10.2024 23:30, Peter Xu wrote:
>>> On Tue, Oct 01, 2024 at 10:41:14PM +0200, Maciej S. Szmigiero wrote:
>>>> On 30.09.2024 23:57, Peter Xu wrote:
>>>>> On Mon, Sep 30, 2024 at 09:25:54PM +0200, Maciej S. Szmigiero wrote:
>>>>>> On 27.09.2024 02:53, Peter Xu wrote:
>>>>>>> On Fri, Sep 27, 2024 at 12:34:31AM +0200, Maciej S. Szmigiero wrote:
>>>>>>>> On 20.09.2024 18:45, Peter Xu wrote:
>>>>>>>>> On Fri, Sep 20, 2024 at 05:23:08PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>>>> On 19.09.2024 23:11, Peter Xu wrote:
>>>>>>>>>>> On Thu, Sep 19, 2024 at 09:49:10PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>>>>>> On 9.09.2024 22:03, Peter Xu wrote:
>>>>>>>>>>>>> On Tue, Aug 27, 2024 at 07:54:27PM +0200, Maciej S. Szmigiero wrote:
>>>>>>>>>>>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> load_finish SaveVMHandler allows migration code to poll whether
>>>>>>>>>>>>>> a device-specific asynchronous device state loading operation had finished.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> In order to avoid calling this handler needlessly the device is supposed
>>>>>>>>>>>>>> to notify the migration code of its possible readiness via a call to
>>>>>>>>>>>>>> qemu_loadvm_load_finish_ready_broadcast() while holding
>>>>>>>>>>>>>> qemu_loadvm_load_finish_ready_lock.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>>>>>>>>>>>> ---
(..)
>> As I wrote above, the kernel side of things is being taken care of by
>> the mlx5 driver maintainers.
>>
>> And these performance numbers suggest that there isn't some global lock
>> serializing all device accesses as otherwise it would quickly become
>> the bottleneck and we would be seeing diminishing improvement from
>> increased VF count instead of increased improvement.
> 
> Personally I am not satisfied with scaling with these numbers..
> 
>    1VF       2VFs      4VFs
>    274 ms -> 434 ms -> 1068 ms
> 
> The lock doesn't need to be as stupid as a global lock that all ioctl()s
> take and it might not be as obvious that we can easily see.  It can hide
> internally, it can be not in the form of a lock at all.
> 
> 1068 is almost 4x of 274 here, that's really not scalable at all even if it
> is improvement for sure..  I still feel like something is off.  If you
> think kernel isn't the bottleneck, I am actually more curious on why,
> especially if that could be relevant to the qemu design.
> 

These are 4 VFs of a single PF NIC, so it's not only kernel driver
involved here but also the whole physical device itself.

Without the userspace/QEMU side being parallelized it was hard to even
measure the driver/device-side bottlenecks.

However, even with the current state of things we still get a nice 67%
improvement in downtime.

As I wrote yesterday, AFAIK it is a WIP also on the mlx5/device side of
things.

(..)
>>>>>>
>>>>>>>       - How qemu_loadvm_load_finish_ready_broadcast() interacts with all
>>>>>>>         above..
>>>>>>>
>>>>>>> So if you really think it matters to load whatever VFIO device whose
>>>>>>> iterable data is ready first, then let's try come up with some better
>>>>>>> interface..  I can try to think about it too, but please answer me
>>>>>>> questions above so I can understand what I am missing on why that's
>>>>>>> important.  Numbers could help, even if 4 VF and I wonder how much diff
>>>>>>> there can be.  Mostly, I don't know why it's slow right now if it is; I
>>>>>>> thought it should be pretty fast, at least not a concern in VFIO migration
>>>>>>> world (which can take seconds of downtime or more..).
>>>>>>>
>>>>>>> IOW, it sounds more reasonable to me that no matter whether vfio will
>>>>>>> support multifd, it'll be nice we stick with vfio_load_state() /
>>>>>>> vfio_save_state() for config space, and hopefully it's also easier it
>>>>>>> always go via the main channel to everyone.  In these two hooks, VFIO can
>>>>>>> do whatever it wants to sync with other things (on src, sync with
>>>>>>> concurrent thread pool saving iterable data and dumping things to multifd
>>>>>>> channels; on dst, sync with multifd concurrent loads). I think it can
>>>>>>> remove the requirement on the load_finish() interface completely.  Yes,
>>>>>>> this can only load VFIO's pci config space one by one, but I think this is
>>>>>>> much simpler, and I hope it's also not that slow, but I'm not sure.
>>>>>>
>>>>>> To be clear, I made a following diagram describing how the patch set
>>>>>> is supposed to work right now, including changing per-device
>>>>>> VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE into a common MIG_CMD_SWITCHOVER.
>>>>>>
>>>>>> Time flows on it left to right (->).
>>>>>>
>>>>>> ----------- DIAGRAM START -----------
>>>>>> Source overall flow:
>>>>>> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable                                                                          -> non iterable
>>>>>> Multifd channels:                                       \ multifd device state read and queue (1) -> multifd config data read and queue (1) /
>>>>>>
>>>>>> Target overall flow:
>>>>>> Main channel: live VM phase data -> MIG_CMD_SWITCHOVER -> iterable -> non iterable -> config data load operations
>>>>>> Multifd channels:                                       \ multifd device state (1) -> multifd config data read (1)
>>>>>>
>>>>>> Target config data load operations flow:
>>>>>> multifd config data read (1) -> config data load (2)
>>>>>>
>>>>>> Notes:
>>>>>> (1): per device threads running in parallel
>>>>>
>>>>> Here I raised this question before, but I'll ask again: do you think we can
>>>>> avoid using a separate thread on dest qemu, but reuse multifd recv threads?
>>>>>
>>>>> Src probably needs its own threads because multifd sender threads takes
>>>>> request, so it can't block on its own.
>>>>>
>>>>> However dest qemu isn't like that, it's packet driven so I think maybe it's
>>>>> ok VFIO directly loads the data in the multifd threads.  We may want to
>>>>> have enough multifd threads to make sure IO still don't block much on the
>>>>> NIC, but I think tuning the num of multifd threads should work in this
>>>>> case.
>>>>
>>>> We need to have the receiving threads decoupled from the VFIO device state
>>>> loading threads at least because otherwise:
>>>> 1) You can have a deadlock if device state for multiple devices arrives
>>>> out of order, like here:
>>>>
>>>> Time flows left to right (->).
>>>> Multifd channel 1: (VFIO device 1 buffer 2) (VFIO device 2 buffer 1)
>>>> Multifd channel 2: (VFIO device 2 buffer 2) (VFIO device 1 buffer 1)
>>>>
>>>> Both channel receive/load threads would be stuck forever in this case,
>>>> since they can't load buffer 2 for devices 1 and 2 until they load
>>>> buffer 1 for each of these devices.
>>>>
>>>> 2) If devices are loading buffers at different speeds you don't want
>>>> to block the faster device from receiving new buffer just because
>>>> the slower one hasn't finished its loading yet.
>>>
>>> I don't see why it can't be avoided.  Let me draw this in columns.
>>>
>>> How I picture this is:
>>>
>>>      multifd recv thread 1                     multifd recv thread 2
>>>      ---------------------                     ---------------------
>>>      recv VFIO device 1 buffer 2             recv VFIO device 2 buffer 2
>>>       -> found that (dev1, buf1) missing,      -> found that (dev2, buf1) missing,
>>>          skip load                                skip load
>>>      recv VFIO device 2 buffer 1             recv VFIO device 1 buffer 1
>>>       -> found that (dev2, buf1+buf2) ready,   -> found that (dev1, buf1+buf2) ready,
>>>          load buf1+2 for dev2 here                load buf1+2 for dev1 here
>>> Here right after one multifd thread recvs a buffer, it needs to be injected
>>> into the cache array (with proper locking), so that whoever receives a full
>>> series of those buffers will do the load (again, with proper locking..).
>>>
>>> Would this not work?
>>>
>>
>> For sure but that's definitely more complicated logic than just having
>> a simple device loading thread that naturally loads incoming buffers
>> for that device in-order.
> 
> I thought it was mostly your logic that was implemented, but yeah I didn't
> check too much details on VFIO side.
> 
>> That thread isn't even in the purview of the migration code since
>> it's a VFIO driver internal implementation detail.
>>
>> And we'd still lose parallelism if it happens that two buffers that
>> are to be loaded next for two devices happen to arrive in the same
>> multifd channel:
>> Multifd channel 1: (VFIO device 1 buffer 1) (VFIO device 2 buffer 1)
>> Multifd channel 2: (VFIO device 2 buffer 2) (VFIO device 1 buffer 2)
>>
>> Now device 2 buffer 1 has to wait until loading device 1 buffer 1
>> finishes even though with the decoupled loading thread implementation
>> from this patch set these would be loaded in parallel.
> 
> Well it's possible indeed, but with normally 8 or more threads being there,
> possibility of having such dependency is low.
> 
> Cedric has similar comment on starting from simple on the thread model.
> I'd still suggest if ever possible we try reuse multifd recv threads; I do
> expect the results should be similar.
> 
> I am sorry to ask for this, Fabiano already blames me for this, but..
> logically it'll be best we use no new thread in the series, then one patch
> on top with your new thread solution to justify its performance benefits
> and worthwhile to having those threads at all.

To be clear, these loading threads are mostly blocking I/O threads, NOT
compute threads.
This means that the usual "rule of thumb" that the count of threads should
not exceed the total number of logical CPUs does NOT apply to them.

They are similar to what glibc uses under the hood to simulate POSIX AIO
(aio_read(), aio_write()), to implement an async DNS resolver (getaddrinfo_a())
and what Glib's GIO uses to simulate its own async file operations.
Using helper threads for turning blocking I/O into "AIO" is a pretty common
thing.

To show that these loading threads mostly spend their time sleeping (waiting
for I/O) I made a quick patch at [1] tracing how much time they spend waiting
for incoming buffers and how much time they spend waiting for these buffers
to be loaded into the device.

The results (without patch [2] described later) are like this:
> 5919@1727974993.403280:vfio_load_state_device_buffer_start  (0000:af:00.2)
> 5921@1727974993.407932:vfio_load_state_device_buffer_start  (0000:af:00.4)
> 5922@1727974993.407964:vfio_load_state_device_buffer_start  (0000:af:00.5)
> 5920@1727974993.408480:vfio_load_state_device_buffer_start  (0000:af:00.3)
> 5920@1727974993.666843:vfio_load_state_device_buffer_end  (0000:af:00.3) wait 43 ms load 217 ms
> 5921@1727974993.686005:vfio_load_state_device_buffer_end  (0000:af:00.4) wait 75 ms load 206 ms
> 5919@1727974993.686054:vfio_load_state_device_buffer_end  (0000:af:00.2) wait 69 ms load 210 ms
> 5922@1727974993.689919:vfio_load_state_device_buffer_end  (0000:af:00.5) wait 79 ms load 204 ms

Summing up:
0000:af:00.2 total loading time 283 ms, wait 69 ms load 210 ms
0000:af:00.3 total loading time 258 ms, wait 43 ms load 217 ms
0000:af:00.4 total loading time 278 ms, wait 75 ms load 206 ms
0000:af:00.5 total loading time 282 ms, wait 79 ms load 204 ms

In other words, these threads spend ~100% of their total runtime waiting
for I/O, 70%-75% of that time waiting for buffers to get loaded into their
target device.

So having more threads here won't negatively affect the host CPU
consumption since these threads barely use the host CPU at all.
Also, their count is capped at the number of VFIO devices in the VM.

I also did a quick test with the same config as usual: 4 VFs, 6 multifd
channels, but with patch at [2] simulating forced coupling of loading
threads to multifd receive channel threads.

With this patch load_state_buffer() handler will return to the multifd
channel thread only when the loading thread finishes loading available
buffers and is about to wait for the next buffers to arrive - just as
loading buffers directly from these channel threads would do.

The resulting lowest downtime from 115 live migration runs was 1295ms -
that's 21% worse than 1068ms of downtime with these loading threads running
on their own.

I expect this performance penalty to get even worse with more VFs
than 4.

So no, we can't load buffers directly from multifd channel receive threads.

> PS: I'd suggest if you really need those threads it should still be managed
> by migration framework like the src thread pool.  Sorry I'm pretty stubborn
> on this, especially after I notice we have query-migrationthreads API just
> recently.. even if now I'm not sure whether we should remove that API.  I
> assume that shouldn't need much change, even if necessary.

I can certainly make these loading threads managed in a thread pool if that's
easier for you.

> Thanks,
> 

Thanks,
Maciej

[1]: https://github.com/maciejsszmigiero/qemu/commit/b0833053359715c604070f64fc058f90ec61d180
[2]: https://github.com/maciejsszmigiero/qemu/commit/0c9b4072eaebf8e7bd9560dd27a14cd048097565
Peter Xu Oct. 3, 2024, 9:17 p.m. UTC | #18
On Thu, Oct 03, 2024 at 10:34:28PM +0200, Maciej S. Szmigiero wrote:
> To be clear, these loading threads are mostly blocking I/O threads, NOT
> compute threads.
> This means that the usual "rule of thumb" that the count of threads should
> not exceed the total number of logical CPUs does NOT apply to them.
> 
> They are similar to what glibc uses under the hood to simulate POSIX AIO
> (aio_read(), aio_write()), to implement an async DNS resolver (getaddrinfo_a())
> and what Glib's GIO uses to simulate its own async file operations.
> Using helper threads for turning blocking I/O into "AIO" is a pretty common
> thing.

Fair enough.  Yes I could be over-cautious due to the previous experience
on managing all kinds of migration threads.

> 
> To show that these loading threads mostly spend their time sleeping (waiting
> for I/O) I made a quick patch at [1] tracing how much time they spend waiting
> for incoming buffers and how much time they spend waiting for these buffers
> to be loaded into the device.
> 
> The results (without patch [2] described later) are like this:
> > 5919@1727974993.403280:vfio_load_state_device_buffer_start  (0000:af:00.2)
> > 5921@1727974993.407932:vfio_load_state_device_buffer_start  (0000:af:00.4)
> > 5922@1727974993.407964:vfio_load_state_device_buffer_start  (0000:af:00.5)
> > 5920@1727974993.408480:vfio_load_state_device_buffer_start  (0000:af:00.3)
> > 5920@1727974993.666843:vfio_load_state_device_buffer_end  (0000:af:00.3) wait 43 ms load 217 ms
> > 5921@1727974993.686005:vfio_load_state_device_buffer_end  (0000:af:00.4) wait 75 ms load 206 ms
> > 5919@1727974993.686054:vfio_load_state_device_buffer_end  (0000:af:00.2) wait 69 ms load 210 ms
> > 5922@1727974993.689919:vfio_load_state_device_buffer_end  (0000:af:00.5) wait 79 ms load 204 ms
> 
> Summing up:
> 0000:af:00.2 total loading time 283 ms, wait 69 ms load 210 ms
> 0000:af:00.3 total loading time 258 ms, wait 43 ms load 217 ms
> 0000:af:00.4 total loading time 278 ms, wait 75 ms load 206 ms
> 0000:af:00.5 total loading time 282 ms, wait 79 ms load 204 ms
> 
> In other words, these threads spend ~100% of their total runtime waiting
> for I/O, 70%-75% of that time waiting for buffers to get loaded into their
> target device.
> 
> So having more threads here won't negatively affect the host CPU
> consumption since these threads barely use the host CPU at all.
> Also, their count is capped at the number of VFIO devices in the VM.
> 
> I also did a quick test with the same config as usual: 4 VFs, 6 multifd
> channels, but with patch at [2] simulating forced coupling of loading
> threads to multifd receive channel threads.
> 
> With this patch load_state_buffer() handler will return to the multifd
> channel thread only when the loading thread finishes loading available
> buffers and is about to wait for the next buffers to arrive - just as
> loading buffers directly from these channel threads would do.
> 
> The resulting lowest downtime from 115 live migration runs was 1295ms -
> that's 21% worse than 1068ms of downtime with these loading threads running
> on their own.
> 
> I expect this performance penalty to get even worse with more VFs
> than 4.
> 
> So no, we can't load buffers directly from multifd channel receive threads.

6 channels may be a bit too few in this test case with 4 VFs, but indeed
adding such dependency on number of multifd threads isn't as good either, I
agree.  I'm ok as long as VFIO reviewers are fine.

> 
> > PS: I'd suggest if you really need those threads it should still be managed
> > by migration framework like the src thread pool.  Sorry I'm pretty stubborn
> > on this, especially after I notice we have query-migrationthreads API just
> > recently.. even if now I'm not sure whether we should remove that API.  I
> > assume that shouldn't need much change, even if necessary.
> 
> I can certainly make these loading threads managed in a thread pool if that's
> easier for you.

Yes, if you want to use separate thread it'll be great to match on the src
thread model with similar pool.  I hope the pool interface you have is
easily applicable on both sides.

Thanks,
diff mbox series

Patch

diff --git a/include/migration/register.h b/include/migration/register.h
index 4a578f140713..44d8cf5192ae 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -278,6 +278,27 @@  typedef struct SaveVMHandlers {
     int (*load_state_buffer)(void *opaque, char *data, size_t data_size,
                              Error **errp);
 
+    /**
+     * @load_finish
+     *
+     * Poll whether all asynchronous device state loading had finished.
+     * Not called on the load failure path.
+     *
+     * Called while holding the qemu_loadvm_load_finish_ready_lock.
+     *
+     * If this method signals "not ready" then it might not be called
+     * again until qemu_loadvm_load_finish_ready_broadcast() is invoked
+     * while holding qemu_loadvm_load_finish_ready_lock.
+     *
+     * @opaque: data pointer passed to register_savevm_live()
+     * @is_finished: whether the loading had finished (output parameter)
+     * @errp: pointer to Error*, to store an error if it happens.
+     *
+     * Returns zero to indicate success and negative for error
+     * It's not an error that the loading still hasn't finished.
+     */
+    int (*load_finish)(void *opaque, bool *is_finished, Error **errp);
+
     /**
      * @load_setup
      *
diff --git a/migration/migration.c b/migration/migration.c
index 3dea06d57732..d61e7b055e07 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -259,6 +259,9 @@  void migration_object_init(void)
 
     current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
 
+    qemu_mutex_init(&current_incoming->load_finish_ready_mutex);
+    qemu_cond_init(&current_incoming->load_finish_ready_cond);
+
     migration_object_check(current_migration, &error_fatal);
 
     ram_mig_init();
@@ -410,6 +413,9 @@  void migration_incoming_state_destroy(void)
         mis->postcopy_qemufile_dst = NULL;
     }
 
+    qemu_mutex_destroy(&mis->load_finish_ready_mutex);
+    qemu_cond_destroy(&mis->load_finish_ready_cond);
+
     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
 }
 
diff --git a/migration/migration.h b/migration/migration.h
index 38aa1402d516..4e2443e6c8ec 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -230,6 +230,9 @@  struct MigrationIncomingState {
 
     /* Do exit on incoming migration failure */
     bool exit_on_error;
+
+    QemuCond load_finish_ready_cond;
+    QemuMutex load_finish_ready_mutex;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
diff --git a/migration/savevm.c b/migration/savevm.c
index 3fde5ca8c26b..33c9200d1e78 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -3022,6 +3022,37 @@  int qemu_loadvm_state(QEMUFile *f)
         return ret;
     }
 
+    qemu_loadvm_load_finish_ready_lock();
+    while (!ret) { /* Don't call load_finish() handlers on the load failure path */
+        bool all_ready = true;
+        SaveStateEntry *se = NULL;
+
+        QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+            bool this_ready;
+
+            if (!se->ops || !se->ops->load_finish) {
+                continue;
+            }
+
+            ret = se->ops->load_finish(se->opaque, &this_ready, &local_err);
+            if (ret) {
+                error_report_err(local_err);
+
+                qemu_loadvm_load_finish_ready_unlock();
+                return -EINVAL;
+            } else if (!this_ready) {
+                all_ready = false;
+            }
+        }
+
+        if (all_ready) {
+            break;
+        }
+
+        qemu_cond_wait(&mis->load_finish_ready_cond, &mis->load_finish_ready_mutex);
+    }
+    qemu_loadvm_load_finish_ready_unlock();
+
     if (ret == 0) {
         ret = qemu_file_get_error(f);
     }
@@ -3126,6 +3157,27 @@  int qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
     return 0;
 }
 
+void qemu_loadvm_load_finish_ready_lock(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    qemu_mutex_lock(&mis->load_finish_ready_mutex);
+}
+
+void qemu_loadvm_load_finish_ready_unlock(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    qemu_mutex_unlock(&mis->load_finish_ready_mutex);
+}
+
+void qemu_loadvm_load_finish_ready_broadcast(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    qemu_cond_broadcast(&mis->load_finish_ready_cond);
+}
+
 bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
                   bool has_devices, strList *devices, Error **errp)
 {
diff --git a/migration/savevm.h b/migration/savevm.h
index d388f1bfca98..69ae22cded7a 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -73,4 +73,8 @@  int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
 int qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
                                   char *buf, size_t len, Error **errp);
 
+void qemu_loadvm_load_finish_ready_lock(void);
+void qemu_loadvm_load_finish_ready_unlock(void);
+void qemu_loadvm_load_finish_ready_broadcast(void);
+
 #endif