diff mbox series

[v3,8/8] migration: Protect updates to current_migration with a mutex

Message ID 20241024213056.1395400-9-peterx@redhat.com (mailing list archive)
State New
Headers show
Series Migration: Make misc.h helpers available for whole VM lifecycle | expand

Commit Message

Peter Xu Oct. 24, 2024, 9:30 p.m. UTC
Introduce migration_mutex, protecting concurrent updates to
current_migration.

In reality, most of the exported migration functions are safe to access
migration objects on capabilities, etc., e.g. many of the code is invoked
within migration thread via different hooks (e.g., precopy notifier,
vmstate handler hooks, etc.).

So literally the mutex so far only makes sure below two APIs that are prone
to accessing freed current_migration:

        migration_is_running()
        migration_file_set_error()

Then we'll need to take the mutex too when init/free the migration object.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 migration/migration.h |  3 +++
 migration/migration.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

Comments

Cédric Le Goater Oct. 25, 2024, 12:50 p.m. UTC | #1
On 10/24/24 23:30, Peter Xu wrote:
> Introduce migration_mutex, protecting concurrent updates to
> current_migration.
> 
> In reality, most of the exported migration functions are safe to access
> migration objects on capabilities, etc., e.g. many of the code is invoked
> within migration thread via different hooks (e.g., precopy notifier,
> vmstate handler hooks, etc.).
> 
> So literally the mutex so far only makes sure below two APIs that are prone
> to accessing freed current_migration:
> 
>          migration_is_running()
>          migration_file_set_error()
> 
> Then we'll need to take the mutex too when init/free the migration object.

we should also drop :

   static void vfio_set_migration_error(int ret)
   {
       if (migration_is_running()) {
           migration_file_set_error(ret, NULL);
       }
   }

and use directly migration_file_set_error().

> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   migration/migration.h |  3 +++
>   migration/migration.c | 43 +++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 46 insertions(+)
> 
> diff --git a/migration/migration.h b/migration/migration.h
> index 9fa26ab06a..05edcf0c49 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -473,6 +473,9 @@ struct MigrationState {
>       bool rdma_migration;
>   };
>   
> +extern QemuMutex migration_mutex;
> +#define  QEMU_MIGRATION_LOCK_GUARD()  QEMU_LOCK_GUARD(&migration_mutex)
> +

Why are these definitions exported ?

Thanks,

C.



>   void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
>                          MigrationStatus new_state);
>   
> diff --git a/migration/migration.c b/migration/migration.c
> index 127b01734d..ef044968df 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -97,6 +97,14 @@ enum mig_rp_message_type {
>      migrations at once.  For now we don't need to add
>      dynamic creation of migration */
>   
> +/*
> + * Protects current_migration below.  Must be hold when using migration
> + * exported functions unless the caller knows it won't get freed.  For
> + * example, when in the context of migration_thread() it's safe to access
> + * current_migration without the mutex, because the thread holds one extra
> + * refcount of the object, so it literally pins the object in-memory.
> + */
> +QemuMutex migration_mutex;
>   static MigrationState *current_migration;
>   static MigrationIncomingState *current_incoming;
>   
> @@ -110,6 +118,17 @@ static void migrate_fd_cancel(MigrationState *s);
>   static bool close_return_path_on_source(MigrationState *s);
>   static void migration_completion_end(MigrationState *s);
>   
> +/*
> + * This is explicitly done without migration_object_init(), because it may
> + * start to use this lock already when instance_init() of the object.  The
> + * mutex is alive for the whole lifecycle of QEMU, so it's always usable
> + * and never destroyed.
> + */
> +static void __attribute__((constructor)) migration_mutex_init(void)
> +{
> +    qemu_mutex_init(&migration_mutex);
> +}
> +
>   static void migration_downtime_start(MigrationState *s)
>   {
>       trace_vmstate_downtime_checkpoint("src-downtime-start");
> @@ -336,6 +355,14 @@ void migration_shutdown(void)
>        * stop the migration using this structure
>        */
>       migration_cancel(NULL);
> +    /*
> +     * Release the refcount from the main thread.  It means it can be freed
> +     * here if migration thread is not running.
> +     *
> +     * NOTE: we don't need QEMU_MIGRATION_LOCK_GUARD() on this access
> +     * because we're sure there's one refcount.  The lock will be taken in
> +     * finalize() if it triggers, so we can't take it here anyway.
> +     */
>       object_unref(OBJECT(current_migration));
>   
>       /*
> @@ -1118,8 +1145,14 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
>   
>   bool migration_is_running(void)
>   {
> +    QEMU_MIGRATION_LOCK_GUARD();
> +
>       MigrationState *s = current_migration;
>   
> +    if (!s) {
> +        return false;
> +    }
> +
>       switch (s->state) {
>       case MIGRATION_STATUS_ACTIVE:
>       case MIGRATION_STATUS_POSTCOPY_ACTIVE:
> @@ -3029,8 +3062,14 @@ static MigThrError postcopy_pause(MigrationState *s)
>   
>   void migration_file_set_error(int ret, Error *err)
>   {
> +    QEMU_MIGRATION_LOCK_GUARD();
> +
>       MigrationState *s = current_migration;
>   
> +    if (!s) {
> +        return;
> +    }
> +
>       WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
>           if (s->to_dst_file) {
>               qemu_file_set_error_obj(s->to_dst_file, ret, err);
> @@ -3835,6 +3874,8 @@ static void migration_instance_finalize(Object *obj)
>   {
>       MigrationState *ms = MIGRATION_OBJ(obj);
>   
> +    QEMU_MIGRATION_LOCK_GUARD();
> +
>       qemu_mutex_destroy(&ms->error_mutex);
>       qemu_mutex_destroy(&ms->qemu_file_lock);
>       qemu_sem_destroy(&ms->wait_unplug_sem);
> @@ -3858,6 +3899,8 @@ static void migration_instance_init(Object *obj)
>   {
>       MigrationState *ms = MIGRATION_OBJ(obj);
>   
> +    QEMU_MIGRATION_LOCK_GUARD();
> +
>       /*
>        * There can only be one migration object globally. Keep a record of
>        * the pointer in current_migration, which will be reset after the
Peter Xu Oct. 28, 2024, 3:50 p.m. UTC | #2
On Fri, Oct 25, 2024 at 02:50:36PM +0200, Cédric Le Goater wrote:
> On 10/24/24 23:30, Peter Xu wrote:
> > Introduce migration_mutex, protecting concurrent updates to
> > current_migration.
> > 
> > In reality, most of the exported migration functions are safe to access
> > migration objects on capabilities, etc., e.g. many of the code is invoked
> > within migration thread via different hooks (e.g., precopy notifier,
> > vmstate handler hooks, etc.).
> > 
> > So literally the mutex so far only makes sure below two APIs that are prone
> > to accessing freed current_migration:
> > 
> >          migration_is_running()
> >          migration_file_set_error()
> > 
> > Then we'll need to take the mutex too when init/free the migration object.
> 
> we should also drop :
> 
>   static void vfio_set_migration_error(int ret)
>   {
>       if (migration_is_running()) {
>           migration_file_set_error(ret, NULL);
>       }
>   }
> 
> and use directly migration_file_set_error().

We'll need to export migration_is_running() anyway, though.  So maybe we
can do that as a VFIO's follow up?

> 
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >   migration/migration.h |  3 +++
> >   migration/migration.c | 43 +++++++++++++++++++++++++++++++++++++++++++
> >   2 files changed, 46 insertions(+)
> > 
> > diff --git a/migration/migration.h b/migration/migration.h
> > index 9fa26ab06a..05edcf0c49 100644
> > --- a/migration/migration.h
> > +++ b/migration/migration.h
> > @@ -473,6 +473,9 @@ struct MigrationState {
> >       bool rdma_migration;
> >   };
> > +extern QemuMutex migration_mutex;
> > +#define  QEMU_MIGRATION_LOCK_GUARD()  QEMU_LOCK_GUARD(&migration_mutex)
> > +
> 
> Why are these definitions exported ?

This is still only used in migration/ so it's not exported to QEMU.  I was
planning this can be available anywhere in migration/ when a function needs
to be exported.

However I think you're right.. this so far is even only used in
migration.c, so we don't need to export it, at least until some other
migration/*.c will export anything.  Will remove it when repost.

Thanks,

> 
> Thanks,
> 
> C.
> 
> 
> 
> >   void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
> >                          MigrationStatus new_state);
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 127b01734d..ef044968df 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -97,6 +97,14 @@ enum mig_rp_message_type {
> >      migrations at once.  For now we don't need to add
> >      dynamic creation of migration */
> > +/*
> > + * Protects current_migration below.  Must be hold when using migration
> > + * exported functions unless the caller knows it won't get freed.  For
> > + * example, when in the context of migration_thread() it's safe to access
> > + * current_migration without the mutex, because the thread holds one extra
> > + * refcount of the object, so it literally pins the object in-memory.
> > + */
> > +QemuMutex migration_mutex;
> >   static MigrationState *current_migration;
> >   static MigrationIncomingState *current_incoming;
> > @@ -110,6 +118,17 @@ static void migrate_fd_cancel(MigrationState *s);
> >   static bool close_return_path_on_source(MigrationState *s);
> >   static void migration_completion_end(MigrationState *s);
> > +/*
> > + * This is explicitly done without migration_object_init(), because it may
> > + * start to use this lock already when instance_init() of the object.  The
> > + * mutex is alive for the whole lifecycle of QEMU, so it's always usable
> > + * and never destroyed.
> > + */
> > +static void __attribute__((constructor)) migration_mutex_init(void)
> > +{
> > +    qemu_mutex_init(&migration_mutex);
> > +}
> > +
> >   static void migration_downtime_start(MigrationState *s)
> >   {
> >       trace_vmstate_downtime_checkpoint("src-downtime-start");
> > @@ -336,6 +355,14 @@ void migration_shutdown(void)
> >        * stop the migration using this structure
> >        */
> >       migration_cancel(NULL);
> > +    /*
> > +     * Release the refcount from the main thread.  It means it can be freed
> > +     * here if migration thread is not running.
> > +     *
> > +     * NOTE: we don't need QEMU_MIGRATION_LOCK_GUARD() on this access
> > +     * because we're sure there's one refcount.  The lock will be taken in
> > +     * finalize() if it triggers, so we can't take it here anyway.
> > +     */
> >       object_unref(OBJECT(current_migration));
> >       /*
> > @@ -1118,8 +1145,14 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
> >   bool migration_is_running(void)
> >   {
> > +    QEMU_MIGRATION_LOCK_GUARD();
> > +
> >       MigrationState *s = current_migration;
> > +    if (!s) {
> > +        return false;
> > +    }
> > +
> >       switch (s->state) {
> >       case MIGRATION_STATUS_ACTIVE:
> >       case MIGRATION_STATUS_POSTCOPY_ACTIVE:
> > @@ -3029,8 +3062,14 @@ static MigThrError postcopy_pause(MigrationState *s)
> >   void migration_file_set_error(int ret, Error *err)
> >   {
> > +    QEMU_MIGRATION_LOCK_GUARD();
> > +
> >       MigrationState *s = current_migration;
> > +    if (!s) {
> > +        return;
> > +    }
> > +
> >       WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
> >           if (s->to_dst_file) {
> >               qemu_file_set_error_obj(s->to_dst_file, ret, err);
> > @@ -3835,6 +3874,8 @@ static void migration_instance_finalize(Object *obj)
> >   {
> >       MigrationState *ms = MIGRATION_OBJ(obj);
> > +    QEMU_MIGRATION_LOCK_GUARD();
> > +
> >       qemu_mutex_destroy(&ms->error_mutex);
> >       qemu_mutex_destroy(&ms->qemu_file_lock);
> >       qemu_sem_destroy(&ms->wait_unplug_sem);
> > @@ -3858,6 +3899,8 @@ static void migration_instance_init(Object *obj)
> >   {
> >       MigrationState *ms = MIGRATION_OBJ(obj);
> > +    QEMU_MIGRATION_LOCK_GUARD();
> > +
> >       /*
> >        * There can only be one migration object globally. Keep a record of
> >        * the pointer in current_migration, which will be reset after the
>
Cédric Le Goater Nov. 4, 2024, 3:26 p.m. UTC | #3
On 10/28/24 16:50, Peter Xu wrote:
> On Fri, Oct 25, 2024 at 02:50:36PM +0200, Cédric Le Goater wrote:
>> On 10/24/24 23:30, Peter Xu wrote:
>>> Introduce migration_mutex, protecting concurrent updates to
>>> current_migration.
>>>
>>> In reality, most of the exported migration functions are safe to access
>>> migration objects on capabilities, etc., e.g. many of the code is invoked
>>> within migration thread via different hooks (e.g., precopy notifier,
>>> vmstate handler hooks, etc.).
>>>
>>> So literally the mutex so far only makes sure below two APIs that are prone
>>> to accessing freed current_migration:
>>>
>>>           migration_is_running()
>>>           migration_file_set_error()
>>>
>>> Then we'll need to take the mutex too when init/free the migration object.
>>
>> we should also drop :
>>
>>    static void vfio_set_migration_error(int ret)
>>    {
>>        if (migration_is_running()) {
>>            migration_file_set_error(ret, NULL);
>>        }
>>    }
>>
>> and use directly migration_file_set_error().
> 
> We'll need to export migration_is_running() anyway, though.  So maybe we
> can do that as a VFIO's follow up?

sure. Let's add that to the list, along with the 'log_enabled' bool that
you discussed with Avihai in the previous patch :

  [PATCH v3 7/8] migration: Unexport migration_is_active()

These changes would be good to have in QEMU 9.2.

Avihai, I have access to HW supporting HW dirty tracking.

Thanks,

C.
diff mbox series

Patch

diff --git a/migration/migration.h b/migration/migration.h
index 9fa26ab06a..05edcf0c49 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -473,6 +473,9 @@  struct MigrationState {
     bool rdma_migration;
 };
 
+extern QemuMutex migration_mutex;
+#define  QEMU_MIGRATION_LOCK_GUARD()  QEMU_LOCK_GUARD(&migration_mutex)
+
 void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
                        MigrationStatus new_state);
 
diff --git a/migration/migration.c b/migration/migration.c
index 127b01734d..ef044968df 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -97,6 +97,14 @@  enum mig_rp_message_type {
    migrations at once.  For now we don't need to add
    dynamic creation of migration */
 
+/*
+ * Protects current_migration below.  Must be hold when using migration
+ * exported functions unless the caller knows it won't get freed.  For
+ * example, when in the context of migration_thread() it's safe to access
+ * current_migration without the mutex, because the thread holds one extra
+ * refcount of the object, so it literally pins the object in-memory.
+ */
+QemuMutex migration_mutex;
 static MigrationState *current_migration;
 static MigrationIncomingState *current_incoming;
 
@@ -110,6 +118,17 @@  static void migrate_fd_cancel(MigrationState *s);
 static bool close_return_path_on_source(MigrationState *s);
 static void migration_completion_end(MigrationState *s);
 
+/*
+ * This is explicitly done without migration_object_init(), because it may
+ * start to use this lock already when instance_init() of the object.  The
+ * mutex is alive for the whole lifecycle of QEMU, so it's always usable
+ * and never destroyed.
+ */
+static void __attribute__((constructor)) migration_mutex_init(void)
+{
+    qemu_mutex_init(&migration_mutex);
+}
+
 static void migration_downtime_start(MigrationState *s)
 {
     trace_vmstate_downtime_checkpoint("src-downtime-start");
@@ -336,6 +355,14 @@  void migration_shutdown(void)
      * stop the migration using this structure
      */
     migration_cancel(NULL);
+    /*
+     * Release the refcount from the main thread.  It means it can be freed
+     * here if migration thread is not running.
+     *
+     * NOTE: we don't need QEMU_MIGRATION_LOCK_GUARD() on this access
+     * because we're sure there's one refcount.  The lock will be taken in
+     * finalize() if it triggers, so we can't take it here anyway.
+     */
     object_unref(OBJECT(current_migration));
 
     /*
@@ -1118,8 +1145,14 @@  void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
 
 bool migration_is_running(void)
 {
+    QEMU_MIGRATION_LOCK_GUARD();
+
     MigrationState *s = current_migration;
 
+    if (!s) {
+        return false;
+    }
+
     switch (s->state) {
     case MIGRATION_STATUS_ACTIVE:
     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
@@ -3029,8 +3062,14 @@  static MigThrError postcopy_pause(MigrationState *s)
 
 void migration_file_set_error(int ret, Error *err)
 {
+    QEMU_MIGRATION_LOCK_GUARD();
+
     MigrationState *s = current_migration;
 
+    if (!s) {
+        return;
+    }
+
     WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
         if (s->to_dst_file) {
             qemu_file_set_error_obj(s->to_dst_file, ret, err);
@@ -3835,6 +3874,8 @@  static void migration_instance_finalize(Object *obj)
 {
     MigrationState *ms = MIGRATION_OBJ(obj);
 
+    QEMU_MIGRATION_LOCK_GUARD();
+
     qemu_mutex_destroy(&ms->error_mutex);
     qemu_mutex_destroy(&ms->qemu_file_lock);
     qemu_sem_destroy(&ms->wait_unplug_sem);
@@ -3858,6 +3899,8 @@  static void migration_instance_init(Object *obj)
 {
     MigrationState *ms = MIGRATION_OBJ(obj);
 
+    QEMU_MIGRATION_LOCK_GUARD();
+
     /*
      * There can only be one migration object globally. Keep a record of
      * the pointer in current_migration, which will be reset after the