diff mbox series

[v5,3/5] migration: enable multifd and postcopy together

Message ID 20250205122712.229151-4-ppandit@redhat.com (mailing list archive)
State New
Headers show
Series Allow to enable multifd and postcopy migration together | expand

Commit Message

Prasad Pandit Feb. 5, 2025, 12:27 p.m. UTC
From: Prasad Pandit <pjp@fedoraproject.org>

Enable Multifd and Postcopy migration together.
The migration_ioc_process_incoming() routine
checks magic value sent on each channel and
helps to properly setup multifd and postcopy
channels.

The Precopy and Multifd threads work during the
initial guest RAM transfer. When migration moves
to the Postcopy phase, the multifd threads are
restrained and Postcopy threads on the destination
request/pull data from the source side.

Signed-off-by: Prasad Pandit <pjp@fedoraproject.org>
---
 migration/migration.c      | 106 +++++++++++++++++++++++--------------
 migration/multifd-nocomp.c |   3 +-
 migration/options.c        |   5 --
 migration/ram.c            |   4 +-
 4 files changed, 70 insertions(+), 48 deletions(-)

v4: no change
- https://lore.kernel.org/qemu-devel/20250127120823.144949-1-ppandit@redhat.com/T/#t

Comments

Peter Xu Feb. 6, 2025, 11:16 p.m. UTC | #1
On Wed, Feb 05, 2025 at 05:57:10PM +0530, Prasad Pandit wrote:
> From: Prasad Pandit <pjp@fedoraproject.org>
> 
> Enable Multifd and Postcopy migration together.
> The migration_ioc_process_incoming() routine
> checks magic value sent on each channel and
> helps to properly setup multifd and postcopy
> channels.
> 
> The Precopy and Multifd threads work during the
> initial guest RAM transfer. When migration moves
> to the Postcopy phase, the multifd threads are
> restrained and Postcopy threads on the destination
> request/pull data from the source side.
> 
> Signed-off-by: Prasad Pandit <pjp@fedoraproject.org>
> ---
>  migration/migration.c      | 106 +++++++++++++++++++++++--------------
>  migration/multifd-nocomp.c |   3 +-
>  migration/options.c        |   5 --
>  migration/ram.c            |   4 +-
>  4 files changed, 70 insertions(+), 48 deletions(-)
> 
> v4: no change
> - https://lore.kernel.org/qemu-devel/20250127120823.144949-1-ppandit@redhat.com/T/#t
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 2d1da917c7..a280722e9e 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -92,6 +92,9 @@ enum mig_rp_message_type {
>      MIG_RP_MSG_MAX
>  };
>  
> +/* Migration channel types */
> +enum { CH_DEFAULT, CH_MULTIFD, CH_POSTCOPY };

Maybe s/DEFAULT/MAIN/?

> +
>  /* When we add fault tolerance, we could have several
>     migrations at once.  For now we don't need to add
>     dynamic creation of migration */
> @@ -929,26 +932,33 @@ void migration_fd_process_incoming(QEMUFile *f)
>  /*
>   * Returns true when we want to start a new incoming migration process,
>   * false otherwise.
> + *
> + * All the required channels must be in place before a new incoming
> + * migration process starts.
> + *  - Multifd enabled:
> + *    The main channel and the multifd channels are required.
> + *  - Multifd/Postcopy disabled:
> + *    The main channel is required.
> + *  - Postcopy enabled:
> + *    We don't want to start a new incoming migration when
> + *    the postcopy channel is created. Because it is created
> + *    towards the end of the precopy migration.
> + *
>   */
> -static bool migration_should_start_incoming(bool main_channel)
> +static bool migration_should_start_incoming(uint8_t channel)
>  {
> -    /* Multifd doesn't start unless all channels are established */
> -    if (migrate_multifd()) {
> -        return migration_has_all_channels();
> -    }
> +    bool ret = false;
> +
> +    if (channel != CH_POSTCOPY) {
> +        MigrationIncomingState *mis = migration_incoming_get_current();
> +        ret = mis->from_src_file ? true : false;
>  
> -    /* Preempt channel only starts when the main channel is created */
> -    if (migrate_postcopy_preempt()) {
> -        return main_channel;
> +        if (ret && migrate_multifd()) {
> +            ret = multifd_recv_all_channels_created();
> +        }
>      }
>  
> -    /*
> -     * For all the rest types of migration, we should only reach here when
> -     * it's the main channel that's being created, and we should always
> -     * proceed with this channel.
> -     */
> -    assert(main_channel);
> -    return true;
> +    return ret;
>  }
>  
>  void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
> @@ -956,13 +966,12 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>      MigrationIncomingState *mis = migration_incoming_get_current();
>      Error *local_err = NULL;
>      QEMUFile *f;
> -    bool default_channel = true;
>      uint32_t channel_magic = 0;
> +    uint8_t channel = CH_DEFAULT;
>      int ret = 0;
>  
> -    if (migrate_multifd() && !migrate_mapped_ram() &&
> -        !migrate_postcopy_ram() &&
> -        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
> +    if (!migration_should_start_incoming(channel)) {

This says "if we assume this is the main channel, and if we shouldn't start
incoming migration, then we should peek at the buffers".

Could you help explain?

> +        if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
>          /*
>           * With multiple channels, it is possible that we receive channels
>           * out of order on destination side, causing incorrect mapping of
> @@ -973,42 +982,58 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>           * tls handshake while initializing main channel so with tls this
>           * issue is not possible.
>           */
> -        ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
> -                                          sizeof(channel_magic), errp);
> +            ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
> +                                              sizeof(channel_magic), errp);
> +            if (ret != 0) {
> +                return;
> +            }
>  
> -        if (ret != 0) {
> -            return;
> +            if (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)) {
> +                channel = CH_DEFAULT;
> +            } else if (channel_magic == cpu_to_be32(MULTIFD_MAGIC)) {
> +                channel = CH_MULTIFD;
> +            } else if (!mis->from_src_file
> +                        && mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
> +                /* reconnect default channel for postcopy recovery */
> +                channel = CH_DEFAULT;

This is still in the big "peek buffer" if condition.

IMHO we can skip peeking buffer when postcopy paused, because in this stage
the channel must be (1) main channel first, then (2) preempt channel next.

> +            } else {
> +                error_report("%s: could not identify channel, unknown magic: %u",
> +                                __func__, channel_magic);
> +                return;
> +            }
> +        } else if (mis->from_src_file
> +            && (!strcmp(ioc->name, "migration-tls-incoming")
> +                || !strcmp(ioc->name, "migration-file-incoming"))) {
> +            channel = CH_MULTIFD;

Confused here too.  Why do we need to check ioc name? Shouldn't multifd has
the headers?

>          }
> -
> -        default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
> -    } else {
> -        default_channel = !mis->from_src_file;
> +    } else if (mis->from_src_file) { // && migrate_postcopy_preempt()
> +        channel = CH_POSTCOPY;
>      }
>  
>      if (multifd_recv_setup(errp) != 0) {
>          return;
>      }
>  
> -    if (default_channel) {
> +    if (channel == CH_DEFAULT) {
>          f = qemu_file_new_input(ioc);
>          migration_incoming_setup(f);
> -    } else {
> +    } else if (channel == CH_MULTIFD) {
>          /* Multiple connections */
> -        assert(migration_needs_multiple_sockets());

Could I ask why removal?

>          if (migrate_multifd()) {
>              multifd_recv_new_channel(ioc, &local_err);
> -        } else {
> -            assert(migrate_postcopy_preempt());
> -            f = qemu_file_new_input(ioc);
> -            postcopy_preempt_new_channel(mis, f);
>          }
>          if (local_err) {
>              error_propagate(errp, local_err);
>              return;
>          }
> +    } else if (channel == CH_POSTCOPY) {
> +        assert(migrate_postcopy_preempt());
> +        assert(!mis->postcopy_qemufile_dst);
> +        f = qemu_file_new_input(ioc);
> +        postcopy_preempt_new_channel(mis, f);
>      }
>  
> -    if (migration_should_start_incoming(default_channel)) {
> +    if (migration_should_start_incoming(channel)) {
>          /* If it's a recovery, we're done */
>          if (postcopy_try_recover()) {
>              return;
> @@ -1025,21 +1050,22 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>   */
>  bool migration_has_all_channels(void)
>  {
> +    bool ret = false;
>      MigrationIncomingState *mis = migration_incoming_get_current();
>  
>      if (!mis->from_src_file) {
> -        return false;
> +        return ret;
>      }
>  
>      if (migrate_multifd()) {
> -        return multifd_recv_all_channels_created();
> +        ret = multifd_recv_all_channels_created();
>      }
>  
> -    if (migrate_postcopy_preempt()) {
> -        return mis->postcopy_qemufile_dst != NULL;
> +    if (ret && migrate_postcopy_preempt()) {

It might be better to avoid such "ret && XXX" nested check.  E.g. do you
think below easier to read?

diff --git a/migration/migration.c b/migration/migration.c
index 74c50cc72c..9eb2f3fdeb 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1064,12 +1064,14 @@ bool migration_has_all_channels(void)
         return false;
     }
 
-    if (migrate_multifd()) {
-        return multifd_recv_all_channels_created();
+    if (migrate_multifd() &&
+        !multifd_recv_all_channels_created()) {
+        return false;
     }
 
-    if (migrate_postcopy_preempt()) {
-        return mis->postcopy_qemufile_dst != NULL;
+    if (migrate_postcopy_preempt() &&
+        mis->postcopy_qemufile_dst == NULL) {
+        return false;
     }
 
     return true;

> +        ret = mis->postcopy_qemufile_dst != NULL;
>      }
>  
> -    return true;
> +    return ret;
>  }
>  
>  int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
> diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> index 1325dba97c..d0edec7cd1 100644
> --- a/migration/multifd-nocomp.c
> +++ b/migration/multifd-nocomp.c
> @@ -16,6 +16,7 @@
>  #include "file.h"
>  #include "multifd.h"
>  #include "options.h"
> +#include "migration.h"
>  #include "qapi/error.h"
>  #include "qemu/cutils.h"
>  #include "qemu/error-report.h"
> @@ -391,7 +392,7 @@ int multifd_ram_flush_and_sync(QEMUFile *f)
>      MultiFDSyncReq req;
>      int ret;
>  
> -    if (!migrate_multifd()) {
> +    if (!migrate_multifd() || migration_in_postcopy()) {
>          return 0;
>      }

[1]

>  
> diff --git a/migration/options.c b/migration/options.c
> index b8d5300326..8c878dea49 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -479,11 +479,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
>              error_setg(errp, "Postcopy is not compatible with ignore-shared");
>              return false;
>          }
> -
> -        if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) {
> -            error_setg(errp, "Postcopy is not yet compatible with multifd");
> -            return false;
> -        }
>      }
>  
>      if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
> diff --git a/migration/ram.c b/migration/ram.c
> index f2326788de..bdba7abe73 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1295,7 +1295,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
>          pss->page = 0;
>          pss->block = QLIST_NEXT_RCU(pss->block, next);
>          if (!pss->block) {
> -            if (multifd_ram_sync_per_round()) {
> +            if (multifd_ram_sync_per_round() && !migration_in_postcopy()) {

If you have above[1], why need this?

>                  QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
>                  int ret = multifd_ram_flush_and_sync(f);
>                  if (ret < 0) {
> @@ -1969,7 +1969,7 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
>          }
>      }
>  
> -    if (migrate_multifd()) {
> +    if (migrate_multifd() && !migration_in_postcopy()) {
>          RAMBlock *block = pss->block;
>          return ram_save_multifd_page(block, offset);
>      }
> -- 
> 2.48.1
> 

This patch still did nothing for multifd in postcopy_start().  I'm not sure
it's safe.

What happens if some multifd pages were sent, then we start postcopy, dest
vcpu threads running, then during postcopy some multifd pages finally
arrived and modifying the guest pages during vcpus running?

Thanks,
Prasad Pandit Feb. 7, 2025, 10:32 a.m. UTC | #2
Hi,

On Fri, 7 Feb 2025 at 04:46, Peter Xu <peterx@redhat.com> wrote:
> > +/* Migration channel types */
> > +enum { CH_DEFAULT, CH_MULTIFD, CH_POSTCOPY };
>
> Maybe s/DEFAULT/MAIN/?

* Okay.

> > -    if (migrate_multifd() && !migrate_mapped_ram() &&
> > -        !migrate_postcopy_ram() &&
> > -        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
> > +    if (!migration_should_start_incoming(channel)) {
>
> This says "if we assume this is the main channel, and if we shouldn't start
> incoming migration, then we should peek at the buffers".
> Could you help explain?

* New migration starts only when the main channel and if 'multifd' is
enabled all multifd channels are established. So, if 'main' and
'multifd' channels are _not_ established then migration should _not_
start. And in that case, incoming connection is likely for one of
those channels and so we should peek at the buffers, because both
'main' and 'multifd' channels send magic values.

* migration_should_start_incoming() function returns 'true' only when
'main' and 'multifd' channels are being established. For 'postcopy'
channel it returns false.


> > +            } else if (!mis->from_src_file
> > +                        && mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
> > +                /* reconnect default channel for postcopy recovery */
> > +                channel = CH_DEFAULT;
>
> This is still in the big "peek buffer" if condition.
> IMHO we can skip peeking buffer when postcopy paused, because in this stage
> the channel must be (1) main channel first, then (2) preempt channel next.

* It is in the big 'peek buffer' condition because the 'main' channel
(= CH_DEFAULT) is being established here. Ideally, all channels should
send magic values to be consistent. The 'main' channel sends magic
value when it is established before starting migration, but the same
'main' channel does not send magic value when it is established during
postcopy recovery, that is an inconsistency (a bug) here. Ideal fix is
to send a magic value every time the 'main' channel is established,
irrespective of when it is established.

* Adding conditionals to check if it is _POSTCOPY_PAUSED state then
don't peek will only lead to complicated 'if' conditionals. This
channel handling code is already complex and non-intuitive enough.

> > +        } else if (mis->from_src_file
> > +            && (!strcmp(ioc->name, "migration-tls-incoming")
> > +                || !strcmp(ioc->name, "migration-file-incoming"))) {
> > +            channel = CH_MULTIFD;
>
> Confused here too.  Why do we need to check ioc name? Shouldn't multifd has
> the headers?

* Because they are not 'multifd' channels, tls/file channels don't
send magic values, but are still handled by
'multifd_recv_new_channel()' function.
===
    ...
    if (default_channel) {
        migration_incoming_setup(f);
    } else {
        if (migrate_multifd()) {
            multifd_recv_new_channel(ioc, &local_err);
        } else {
            postcopy_preempt_new_channel(mis, f);
        }
===
In the code above, if 'default_channel==false' and multifd() is
enabled, all incoming connections are handled by
'multifd_recv_new_channel()', irrespective of whether it is a
'multifd' channel or not. While creating multifd channels, there is no
check for channel type like: if(channel == CH_MULTIFD).

* IMHO, if we make all channels behave with consistency, ie. either
they all send magic value or none sends magic value, that'll simplify
this code a lot.

> > -        assert(migration_needs_multiple_sockets());
> Could I ask why removal?

* Because that function returns migrate_multifd() =>
migrate_multifd() || migrate_postcopy_preempt();
* And the very following check is also migrate_multifd(), as below:

> >          if (migrate_multifd()) {
> >              multifd_recv_new_channel(ioc, &local_err);


> It might be better to avoid such "ret && XXX" nested check.  E.g. do you
> think below easier to read?
>
> diff --git a/migration/migration.c b/migration/migration.c
> index 74c50cc72c..9eb2f3fdeb 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1064,12 +1064,14 @@ bool migration_has_all_channels(void)
>          return false;
>      }
>
> -    if (migrate_multifd()) {
> -        return multifd_recv_all_channels_created();
> +    if (migrate_multifd() &&
> +        !multifd_recv_all_channels_created()) {
> +        return false;
>      }
>
> -    if (migrate_postcopy_preempt()) {
> -        return mis->postcopy_qemufile_dst != NULL;
> +    if (migrate_postcopy_preempt() &&
> +        mis->postcopy_qemufile_dst == NULL) {
> +        return false;
>      }
>
>      return true;

* Will try it.

> > -    if (!migrate_multifd()) {
> > +    if (!migrate_multifd() || migration_in_postcopy()) {
> >          return 0;
> >      }
>
> [1]
>
> >
> >      if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
> > diff --git a/migration/ram.c b/migration/ram.c
> > index f2326788de..bdba7abe73 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -1295,7 +1295,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
> >          pss->page = 0;
> >          pss->block = QLIST_NEXT_RCU(pss->block, next);
> >          if (!pss->block) {
> > -            if (multifd_ram_sync_per_round()) {
> > +            if (multifd_ram_sync_per_round() && !migration_in_postcopy()) {
>
> If you have above[1], why need this?

* True, I tried with just [1] above first, but it was failing for some
reason. Will try again.

> This patch still did nothing for multifd in postcopy_start().  I'm not sure
> it's safe.
>
> What happens if some multifd pages were sent, then we start postcopy, dest
> vcpu threads running, then during postcopy some multifd pages finally
> arrived and modifying the guest pages during vcpus running?

* ram_save_target_page() function saves multifd pages only when
(..!migration_in_postcopy()) not in postcopy mode. Case of 'multifd'
page arriving late on destination and 'postcopy' starting before that
is strange, because if multifd page is getting late, that network
latency should affect 'postcopy' channel too, no? But still if it is
possible, do we want to call - multifd_ram_flush_and_sync() before
postcopy_start()? Will that help?  I'll check if/how it works.

Thank you.
---
  - Prasad
Peter Xu Feb. 7, 2025, 3:45 p.m. UTC | #3
On Fri, Feb 07, 2025 at 04:02:44PM +0530, Prasad Pandit wrote:
> Hi,
> 
> On Fri, 7 Feb 2025 at 04:46, Peter Xu <peterx@redhat.com> wrote:
> > > +/* Migration channel types */
> > > +enum { CH_DEFAULT, CH_MULTIFD, CH_POSTCOPY };
> >
> > Maybe s/DEFAULT/MAIN/?
> 
> * Okay.
> 
> > > -    if (migrate_multifd() && !migrate_mapped_ram() &&

[b]

> > > -        !migrate_postcopy_ram() &&
> > > -        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
> > > +    if (!migration_should_start_incoming(channel)) {
> >
> > This says "if we assume this is the main channel, and if we shouldn't start
> > incoming migration, then we should peek at the buffers".
> > Could you help explain?
> 
> * New migration starts only when the main channel and if 'multifd' is
> enabled all multifd channels are established. So, if 'main' and
> 'multifd' channels are _not_ established then migration should _not_
> start. And in that case, incoming connection is likely for one of
> those channels and so we should peek at the buffers, because both
> 'main' and 'multifd' channels send magic values.
> 
> * migration_should_start_incoming() function returns 'true' only when
> 'main' and 'multifd' channels are being established. For 'postcopy'
> channel it returns false.

This is not easy to follow neither with the current name, nor that you
"assumed this is main channel" and test it.  I think you may want to split
migration_has_all_channels() into migration_has_essential_channels() which
only covers main and multifd cases.  Then you can check if (!has_esential)
here.  You'd better also add a comment that all "essential channels" can be
peeked.

You may also want to bypass a few things, e.g. "postcopy paused stage" here
rather than inside, because postcopy-recover only happens:

  - First with a main channel, that is not peekable as no header when resume
  - Then with preempt channel, that is also not peekable

[a]

You may also need to keep the mapped-ram check.  They also don't support
peek.

> 
> 
> > > +            } else if (!mis->from_src_file
> > > +                        && mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
> > > +                /* reconnect default channel for postcopy recovery */
> > > +                channel = CH_DEFAULT;
> >
> > This is still in the big "peek buffer" if condition.
> > IMHO we can skip peeking buffer when postcopy paused, because in this stage
> > the channel must be (1) main channel first, then (2) preempt channel next.
> 
> * It is in the big 'peek buffer' condition because the 'main' channel
> (= CH_DEFAULT) is being established here. Ideally, all channels should
> send magic values to be consistent. The 'main' channel sends magic
> value when it is established before starting migration, but the same
> 'main' channel does not send magic value when it is established during
> postcopy recovery, that is an inconsistency (a bug) here. Ideal fix is

For a reconnection we could do better to define a header format indeed for
such extensions.  I can't say it's a bug.

> to send a magic value every time the 'main' channel is established,
> irrespective of when it is established.
> 
> * Adding conditionals to check if it is _POSTCOPY_PAUSED state then
> don't peek will only lead to complicated 'if' conditionals. This
> channel handling code is already complex and non-intuitive enough.

Please see above [a].

> 
> > > +        } else if (mis->from_src_file
> > > +            && (!strcmp(ioc->name, "migration-tls-incoming")
> > > +                || !strcmp(ioc->name, "migration-file-incoming"))) {
> > > +            channel = CH_MULTIFD;
> >
> > Confused here too.  Why do we need to check ioc name? Shouldn't multifd has
> > the headers?
> 
> * Because they are not 'multifd' channels, tls/file channels don't
> send magic values, but are still handled by

It might be because you have a bug where you removed mapped-ram check at
[b] above.  I think we need to keep it.

Why TLS channels don't send magic?

> 'multifd_recv_new_channel()' function.
> ===
>     ...
>     if (default_channel) {
>         migration_incoming_setup(f);
>     } else {
>         if (migrate_multifd()) {
>             multifd_recv_new_channel(ioc, &local_err);
>         } else {
>             postcopy_preempt_new_channel(mis, f);
>         }
> ===
> In the code above, if 'default_channel==false' and multifd() is
> enabled, all incoming connections are handled by
> 'multifd_recv_new_channel()', irrespective of whether it is a
> 'multifd' channel or not. While creating multifd channels, there is no
> check for channel type like: if(channel == CH_MULTIFD).
> 
> * IMHO, if we make all channels behave with consistency, ie. either
> they all send magic value or none sends magic value, that'll simplify
> this code a lot.
> 
> > > -        assert(migration_needs_multiple_sockets());
> > Could I ask why removal?
> 
> * Because that function returns migrate_multifd() =>
> migrate_multifd() || migrate_postcopy_preempt();
> * And the very following check is also migrate_multifd(), as below:
> 
> > >          if (migrate_multifd()) {
> > >              multifd_recv_new_channel(ioc, &local_err);
> 
> 
> > It might be better to avoid such "ret && XXX" nested check.  E.g. do you
> > think below easier to read?
> >
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 74c50cc72c..9eb2f3fdeb 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -1064,12 +1064,14 @@ bool migration_has_all_channels(void)
> >          return false;
> >      }
> >
> > -    if (migrate_multifd()) {
> > -        return multifd_recv_all_channels_created();
> > +    if (migrate_multifd() &&
> > +        !multifd_recv_all_channels_created()) {
> > +        return false;
> >      }
> >
> > -    if (migrate_postcopy_preempt()) {
> > -        return mis->postcopy_qemufile_dst != NULL;
> > +    if (migrate_postcopy_preempt() &&
> > +        mis->postcopy_qemufile_dst == NULL) {
> > +        return false;
> >      }
> >
> >      return true;
> 
> * Will try it.
> 
> > > -    if (!migrate_multifd()) {
> > > +    if (!migrate_multifd() || migration_in_postcopy()) {
> > >          return 0;
> > >      }
> >
> > [1]
> >
> > >
> > >      if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
> > > diff --git a/migration/ram.c b/migration/ram.c
> > > index f2326788de..bdba7abe73 100644
> > > --- a/migration/ram.c
> > > +++ b/migration/ram.c
> > > @@ -1295,7 +1295,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
> > >          pss->page = 0;
> > >          pss->block = QLIST_NEXT_RCU(pss->block, next);
> > >          if (!pss->block) {
> > > -            if (multifd_ram_sync_per_round()) {
> > > +            if (multifd_ram_sync_per_round() && !migration_in_postcopy()) {
> >
> > If you have above[1], why need this?
> 
> * True, I tried with just [1] above first, but it was failing for some
> reason. Will try again.

Another approach (cleaner at least to me..) is we check in_postcopy in
multifd_ram_sync_per_*() functions.

> 
> > This patch still did nothing for multifd in postcopy_start().  I'm not sure
> > it's safe.
> >
> > What happens if some multifd pages were sent, then we start postcopy, dest
> > vcpu threads running, then during postcopy some multifd pages finally
> > arrived and modifying the guest pages during vcpus running?
> 
> * ram_save_target_page() function saves multifd pages only when
> (..!migration_in_postcopy()) not in postcopy mode. Case of 'multifd'
> page arriving late on destination and 'postcopy' starting before that
> is strange, because if multifd page is getting late, that network
> latency should affect 'postcopy' channel too, no? But still if it is

I don't think so.  postcopy doesn't use any multifd channels.

> possible, do we want to call - multifd_ram_flush_and_sync() before
> postcopy_start()? Will that help?  I'll check if/how it works.

Note that all things flushed may or may not be enough, because IIUC the
flush only makes sure all threads are synced.  It may not make sure the
order of things to happen in multifd threads and postcopy thread.  The
latter is what we need - we need to make sure no page land in postcopy
threads.

That's why I was requesting to add an assert() in multifd recv thread to
make sure we will never receive a page during postcopy.

This part is the most important change of the whole series, please take
your time to understand the workflow and let's make sure it won't happen.

Thanks,
Prasad Pandit Feb. 8, 2025, 10:36 a.m. UTC | #4
Hello Peter,

On Fri, 7 Feb 2025 at 21:16, Peter Xu <peterx@redhat.com> wrote:
> This is not easy to follow neither with the current name, nor that you
> "assumed this is main channel" and test it.

* It is not my doing, nor is there any assumption, but that is how
current implementation works.
===
static bool migration_should_start_incoming(bool main_channel)
{
    /* Multifd doesn't start unless all channels are established */
    if (migrate_multifd()) {
        return migration_has_all_channels();
    }

    /* Preempt channel only starts when the main channel is created */
    if (migrate_postcopy_preempt()) {
        return main_channel;
    }

    /*
     * For all the rest types of migration, we should only reach here when
     * it's the main channel that's being created, and we should always
     * proceed with this channel.
     */
    assert(main_channel);
    return true;
}
===
* Above code returns 'true' for 'multifd' and 'main_channel' cases.
When migrate_postcopy_preempt() is true, main_channel is 'false', so
it returns false. All I have done is reused the
migration_should_start_incoming() function to simplify the 'if'
conditional at the top.

> I think you may want to split
> migration_has_all_channels() into migration_has_essential_channels() which
> only covers main and multifd cases.  Then you can check if (!has_esential)
> here.  You'd better also add a comment that all "essential channels" can be
> peeked.
>
> You may also want to bypass a few things, e.g. "postcopy paused stage" here
> rather than inside, because postcopy-recover only happens:
>
>   - First with a main channel, that is not peekable as no header when resume
>   - Then with preempt channel, that is also not peekable
>
> [a]
>
> You may also need to keep the mapped-ram check.  They also don't support
> peek.

* Instead of adding specific conditions and splitting functions, my
request is, let's work towards consistent channel behaviour that will
automatically simplify these conditions and channel handling. Maybe we
can do that in a subsequent series.

> >
> > > > +        } else if (mis->from_src_file
> > > > +            && (!strcmp(ioc->name, "migration-tls-incoming")
> > > > +                || !strcmp(ioc->name, "migration-file-incoming"))) {
> > > > +            channel = CH_MULTIFD;
> > >
> > > Confused here too.  Why do we need to check ioc name? Shouldn't multifd has
> > > the headers?
> >
> > * Because they are not 'multifd' channels, tls/file channels don't
> > send magic values, but are still handled by
>
> It might be because you have a bug where you removed mapped-ram check at
> [b] above.  I think we need to keep it.

* ie. Because I removed the mapped-ram check, so tls/file channels are
handled by multifd_recv_new_channel()? No, that's not the case.
Rather, that is how it works currently. I have not changed anything,
only made it more explicit to see that when it is tls/file channel,
handle it as a CH_MULTIFD type. Looking at the current code, one can
not see clearly how tls/file channels are handled.

> Why TLS channels don't send magic?

* Probably because they do TLS hand-shake while establishing a connection?

> > because if multifd page is getting late, that network
> > latency should affect 'postcopy' channel too, no? But still if it is
>
> I don't think so.  postcopy doesn't use any multifd channels.

* Yes, but it uses the same wire/network.

> > possible, do we want to call - multifd_ram_flush_and_sync() before
> > postcopy_start()? Will that help?  I'll check if/how it works.
>
> Note that all things flushed may or may not be enough, because IIUC the
> flush only makes sure all threads are synced.

* We are again using 'flush' and 'sync' interchangeably. What does -
flush only makes sure all threads are synced - mean really? Is it not
writing all socket data onto the wire/channel?

* Flush should write all socket data onto the network wire/channel.
The _order_ in which threads flush/write their socket data onto the
wire/channel is to synchronise them, maintaining/controlling that
_order_ is not flush.

> It may not make sure the order of things to happen in multifd threads and postcopy thread.  The
> latter is what we need - we need to make sure no page land in postcopy threads.

* Let's see:
   1) When migration is in Postcopy mode, ram_save_multifd_page() is
_not_ called on the source side. ie. no multifd data gets enqueued on
the multifd queue.
       1.1) multifd_queue_page() function also calls multifd_send() if
the queue is full, before enqueueing new pages.
   2) If a multifd page reaches the destination during Postcopy mode,
it must have been sent/written on the multifd channel before Postcopy
mode started, right?
   3) In this case, writing/flushing all multifd socket data onto the
wire/channel, before calling postcopy_start() function should help
IIUC.
       3.1) ie. calling multifd_send() before postcopy_start() should
help to clear the multifd queue, before Postcopy begins.
       3.2) Same can be done by - multifd_ram_flush_and_sync() ->
multifd_send() - sequence.

* If all multifd pages are sent/written/flushed onto the multifd
channels before postcopy_start() is called, then multifd pages should
not arrive at the destination after postcopy starts IIUC.  If that is
happening, we need a reproducer for such a case. Do we have such a
reproducer?

> That's why I was requesting to add an assert() in multifd recv thread to
> make sure we will never receive a page during postcopy.

* ie. Add  assert(!migrate_in_postcopy())  in multifd_recv_thread()
function?  Okay.


Thank you.
---
  - Prasad
Peter Xu Feb. 10, 2025, 4:59 p.m. UTC | #5
On Sat, Feb 08, 2025 at 04:06:56PM +0530, Prasad Pandit wrote:
> Hello Peter,
> 
> On Fri, 7 Feb 2025 at 21:16, Peter Xu <peterx@redhat.com> wrote:
> > This is not easy to follow neither with the current name, nor that you
> > "assumed this is main channel" and test it.
> 
> * It is not my doing, nor is there any assumption, but that is how
> current implementation works.
> ===
> static bool migration_should_start_incoming(bool main_channel)
> {
>     /* Multifd doesn't start unless all channels are established */
>     if (migrate_multifd()) {
>         return migration_has_all_channels();
>     }
> 
>     /* Preempt channel only starts when the main channel is created */
>     if (migrate_postcopy_preempt()) {
>         return main_channel;
>     }
> 
>     /*
>      * For all the rest types of migration, we should only reach here when
>      * it's the main channel that's being created, and we should always
>      * proceed with this channel.
>      */
>     assert(main_channel);
>     return true;
> }
> ===
> * Above code returns 'true' for 'multifd' and 'main_channel' cases.
> When migrate_postcopy_preempt() is true, main_channel is 'false', so
> it returns false. All I have done is reused the
> migration_should_start_incoming() function to simplify the 'if'
> conditional at the top.

Yes, and I suggest a rename or introduce a new helper, per previous reply.

> 
> > I think you may want to split
> > migration_has_all_channels() into migration_has_essential_channels() which
> > only covers main and multifd cases.  Then you can check if (!has_esential)
> > here.  You'd better also add a comment that all "essential channels" can be
> > peeked.
> >
> > You may also want to bypass a few things, e.g. "postcopy paused stage" here
> > rather than inside, because postcopy-recover only happens:
> >
> >   - First with a main channel, that is not peekable as no header when resume
> >   - Then with preempt channel, that is also not peekable
> >
> > [a]
> >
> > You may also need to keep the mapped-ram check.  They also don't support
> > peek.
> 
> * Instead of adding specific conditions and splitting functions, my
> request is, let's work towards consistent channel behaviour that will
> automatically simplify these conditions and channel handling. Maybe we
> can do that in a subsequent series.

I didn't follow, sorry - do you mean this patch is correct on dropping the
mapped-ram check? I don't yet understand how it can work if without.

> 
> > >
> > > > > +        } else if (mis->from_src_file
> > > > > +            && (!strcmp(ioc->name, "migration-tls-incoming")
> > > > > +                || !strcmp(ioc->name, "migration-file-incoming"))) {
> > > > > +            channel = CH_MULTIFD;
> > > >
> > > > Confused here too.  Why do we need to check ioc name? Shouldn't multifd has
> > > > the headers?
> > >
> > > * Because they are not 'multifd' channels, tls/file channels don't
> > > send magic values, but are still handled by
> >
> > It might be because you have a bug where you removed mapped-ram check at
> > [b] above.  I think we need to keep it.
> 
> * ie. Because I removed the mapped-ram check, so tls/file channels are
> handled by multifd_recv_new_channel()? No, that's not the case.
> Rather, that is how it works currently. I have not changed anything,
> only made it more explicit to see that when it is tls/file channel,
> handle it as a CH_MULTIFD type. Looking at the current code, one can
> not see clearly how tls/file channels are handled.
> 
> > Why TLS channels don't send magic?
> 
> * Probably because they do TLS hand-shake while establishing a connection?

I meant tls channels should have these magics too.  Do you mean they're
not?

> 
> > > because if multifd page is getting late, that network
> > > latency should affect 'postcopy' channel too, no? But still if it is
> >
> > I don't think so.  postcopy doesn't use any multifd channels.
> 
> * Yes, but it uses the same wire/network.
> 
> > > possible, do we want to call - multifd_ram_flush_and_sync() before
> > > postcopy_start()? Will that help?  I'll check if/how it works.
> >
> > Note that all things flushed may or may not be enough, because IIUC the
> > flush only makes sure all threads are synced.
> 
> * We are again using 'flush' and 'sync' interchangeably. What does -
> flush only makes sure all threads are synced - mean really? Is it not
> writing all socket data onto the wire/channel?
> 
> * Flush should write all socket data onto the network wire/channel.
> The _order_ in which threads flush/write their socket data onto the
> wire/channel is to synchronise them, maintaining/controlling that
> _order_ is not flush.
> 
> > It may not make sure the order of things to happen in multifd threads and postcopy thread.  The
> > latter is what we need - we need to make sure no page land in postcopy threads.
> 
> * Let's see:
>    1) When migration is in Postcopy mode, ram_save_multifd_page() is
> _not_ called on the source side. ie. no multifd data gets enqueued on
> the multifd queue.
>        1.1) multifd_queue_page() function also calls multifd_send() if
> the queue is full, before enqueueing new pages.
>    2) If a multifd page reaches the destination during Postcopy mode,
> it must have been sent/written on the multifd channel before Postcopy
> mode started, right?

Yes.

>    3) In this case, writing/flushing all multifd socket data onto the
> wire/channel, before calling postcopy_start() function should help
> IIUC.
>        3.1) ie. calling multifd_send() before postcopy_start() should
> help to clear the multifd queue, before Postcopy begins.
>        3.2) Same can be done by - multifd_ram_flush_and_sync() ->
> multifd_send() - sequence.

No I don't think so.

Flushing sending side makes sure send buffer is empty.  It doesn't
guarantee recv buffer is empty on the other side.

> 
> * If all multifd pages are sent/written/flushed onto the multifd
> channels before postcopy_start() is called, then multifd pages should
> not arrive at the destination after postcopy starts IIUC.  If that is
> happening, we need a reproducer for such a case. Do we have such a
> reproducer?

With or without a reproducer, we need to at least justify it in theory.  If
it doesn't even work in theory, it's a problem.

> 
> > That's why I was requesting to add an assert() in multifd recv thread to
> > make sure we will never receive a page during postcopy.
> 
> * ie. Add  assert(!migrate_in_postcopy())  in multifd_recv_thread()
> function?  Okay.

Yes, probably before multifd_recv_state->ops->recv().

Thanks,
Prasad Pandit Feb. 11, 2025, 9:04 a.m. UTC | #6
On Mon, 10 Feb 2025 at 22:29, Peter Xu <peterx@redhat.com> wrote:
> Yes, and I suggest a rename or introduce a new helper, per previous reply.

* Okay, will try it.

> I didn't follow, sorry - do you mean this patch is correct on dropping the
> mapped-ram check? I don't yet understand how it can work if without.

* It goes for channel peek if '!migrate_mapped_ram', ie. when
mapped_ram is not set. When it is enabled, likely it just falls into
the multifd channel, like other tls/file channels. I'll see if we have
to add a check for mapped_ram stream, like tls/file one.

> I meant tls channels should have these magics too.  Do you mean they're not?

* Yes. AFAIU, tls/file channels don't send magic values.

> No I don't think so.
> Flushing sending side makes sure send buffer is empty.  It doesn't
> guarantee recv buffer is empty on the other side.

* A simple 'flush' operation is not supposed to guarantee reception on
the destination side. It is just a 'flush' operation. If we want to
_confirm_ whether the pages sent to the destination are received or
not, then the destination side should send an 'Acknowledgement' to the
source side. Is there such a mechanism in place currently?

> >
> > * If all multifd pages are sent/written/flushed onto the multifd
> > channels before postcopy_start() is called, then multifd pages should
> > not arrive at the destination after postcopy starts IIUC.  If that is
> > happening, we need a reproducer for such a case. Do we have such a
> > reproducer?
>
> With or without a reproducer, we need to at least justify it in theory.  If
> it doesn't even work in theory, it's a problem.

* The theory that both multifd and postcopy channels use the same
underlying network wire; And in that multifd pages get delayed, but
postcopy pages don't, is not understandable. There must be something
else happening in such a case, which a reproducer could help with.

Thank you.
---
  - Prasad
Peter Xu Feb. 11, 2025, 3:20 p.m. UTC | #7
On Tue, Feb 11, 2025 at 02:34:07PM +0530, Prasad Pandit wrote:
> On Mon, 10 Feb 2025 at 22:29, Peter Xu <peterx@redhat.com> wrote:
> > Yes, and I suggest a rename or introduce a new helper, per previous reply.
> 
> * Okay, will try it.
> 
> > I didn't follow, sorry - do you mean this patch is correct on dropping the
> > mapped-ram check? I don't yet understand how it can work if without.
> 
> * It goes for channel peek if '!migrate_mapped_ram', ie. when
> mapped_ram is not set. When it is enabled, likely it just falls into
> the multifd channel, like other tls/file channels. I'll see if we have
> to add a check for mapped_ram stream, like tls/file one.
> 
> > I meant tls channels should have these magics too.  Do you mean they're not?
> 
> * Yes. AFAIU, tls/file channels don't send magic values.

Please double check whether TLS will send magics.  AFAICT, they should.

> 
> > No I don't think so.
> > Flushing sending side makes sure send buffer is empty.  It doesn't
> > guarantee recv buffer is empty on the other side.
> 
> * A simple 'flush' operation is not supposed to guarantee reception on
> the destination side. It is just a 'flush' operation. If we want to
> _confirm_ whether the pages sent to the destination are received or
> not, then the destination side should send an 'Acknowledgement' to the
> source side. Is there such a mechanism in place currently?

No.  We need to figure out a way to do that properly, and that's exactly
what I mentioned as one of the core changes we need in this series, which
is still missing.  We may or may not need an ACK message.  Please think
about it.

> 
> > >
> > > * If all multifd pages are sent/written/flushed onto the multifd
> > > channels before postcopy_start() is called, then multifd pages should
> > > not arrive at the destination after postcopy starts IIUC.  If that is
> > > happening, we need a reproducer for such a case. Do we have such a
> > > reproducer?
> >
> > With or without a reproducer, we need to at least justify it in theory.  If
> > it doesn't even work in theory, it's a problem.
> 
> * The theory that both multifd and postcopy channels use the same
> underlying network wire; And in that multifd pages get delayed, but
> postcopy pages don't, is not understandable. There must be something
> else happening in such a case, which a reproducer could help with.

Please consider the case where multifd recv threads may get scheduled out
on dest host during precopy phase, not getting chance to be scheduled until
postcopy already started running on dst, then the recv thread can stumble
upon a page that was sent during precopy.  As long as that can be always
avoided, I think we should be good.

Thanks,
Prasad Pandit Feb. 12, 2025, 1:27 p.m. UTC | #8
Hi,

On Tue, 11 Feb 2025 at 20:50, Peter Xu <peterx@redhat.com> wrote:
> > * Yes. AFAIU, tls/file channels don't send magic values.
> Please double check whether TLS will send magics.  AFAICT, they should.
===
  * ... Also tls live migration already does
  * tls handshake while initializing main channel so with tls this
  * issue is not possible.
  */
  if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
  } else if (mis->from_src_file
           && (!strcmp(ioc->name, "migration-tls-incoming")
            || !strcmp(ioc->name, "migration-file-incoming"))) {
        channel = CH_MULTIFD;
  }
===
* From the comment and condition above, both 'tls' and 'file' channels
are not peekable, ie. no magic value to peek. The
'migration-file-incoming' check also helps to cover the
migrate_mapped_ram() case IIUC.

> No.  We need to figure out a way to do that properly, and that's exactly
> what I mentioned as one of the core changes we need in this series, which
> is still missing.  We may or may not need an ACK message.  Please think
> about it.

* First we tried to call 'multifd_send_shutdown()' to close multifd
channels before calling postcopy_start(). That's the best case
scenario wherein multifd channels are closed before postcopy starts.
So that there's no confusion and/or jumbling of different data
packets. It did not work, as qemu would crash during
multifd_shutdown().

* Second is we push/flush all multifd pages before calling
postcopy_start() and let the multifd channels exist/stay till the
migration ends, after that they are duly shutdown. It is working well
so far, passing all migration tests too.

* Third, if we want to confirm that multifd pages are received on the
destination before calling postcopy_start(), then the best way is for
the destination to send an acknowledgement to the source side that it
has received and processed all multifd pages and nothing is pending on
the multifd channels.

* Another could be to define a multifd_recv_flush() function, which
could process and clear the receive side multifd queue, so that no
multifd pages are pending there. Not sure how best to do this yet.
Also considering it lacks proper communication and synchronisation
between source and destination sides, it does not seem like the best
solution.

* Do you have any other option/approach in mind?

> Please consider the case where multifd recv threads may get scheduled out
> on dest host during precopy phase, not getting chance to be scheduled until
> postcopy already started running on dst, then the recv thread can stumble
> upon a page that was sent during precopy.  As long as that can be always
> avoided, I think we should be good.

* TBH, this sounds like a remote corner case.

* I'm testing the revised patch series and will send it shortly.

Thank you.
---
  - Prasad
Peter Xu Feb. 12, 2025, 2:37 p.m. UTC | #9
On Wed, Feb 12, 2025 at 06:57:48PM +0530, Prasad Pandit wrote:
> Hi,
> 
> On Tue, 11 Feb 2025 at 20:50, Peter Xu <peterx@redhat.com> wrote:
> > > * Yes. AFAIU, tls/file channels don't send magic values.
> > Please double check whether TLS will send magics.  AFAICT, they should.
> ===
>   * ... Also tls live migration already does
>   * tls handshake while initializing main channel so with tls this
>   * issue is not possible.
>   */
>   if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
>   } else if (mis->from_src_file
>            && (!strcmp(ioc->name, "migration-tls-incoming")
>             || !strcmp(ioc->name, "migration-file-incoming"))) {
>         channel = CH_MULTIFD;
>   }
> ===
> * From the comment and condition above, both 'tls' and 'file' channels
> are not peekable, ie. no magic value to peek. The
> 'migration-file-incoming' check also helps to cover the
> migrate_mapped_ram() case IIUC.

I think it's not because TLS channels don't send magic, but TLS channels
are not prone to ordering issues.

In general, I'm not convinced we need to check names of iochannels.

> 
> > No.  We need to figure out a way to do that properly, and that's exactly
> > what I mentioned as one of the core changes we need in this series, which
> > is still missing.  We may or may not need an ACK message.  Please think
> > about it.
> 
> * First we tried to call 'multifd_send_shutdown()' to close multifd
> channels before calling postcopy_start(). That's the best case
> scenario wherein multifd channels are closed before postcopy starts.
> So that there's no confusion and/or jumbling of different data
> packets. It did not work, as qemu would crash during
> multifd_shutdown().

Have we debugged the crash? I'm not saying we should go with this, but
crash isn't a reason to not choose a design.

> 
> * Second is we push/flush all multifd pages before calling
> postcopy_start() and let the multifd channels exist/stay till the
> migration ends, after that they are duly shutdown. It is working well
> so far, passing all migration tests too.
> 
> * Third, if we want to confirm that multifd pages are received on the
> destination before calling postcopy_start(), then the best way is for
> the destination to send an acknowledgement to the source side that it
> has received and processed all multifd pages and nothing is pending on
> the multifd channels.
> 
> * Another could be to define a multifd_recv_flush() function, which
> could process and clear the receive side multifd queue, so that no
> multifd pages are pending there. Not sure how best to do this yet.
> Also considering it lacks proper communication and synchronisation
> between source and destination sides, it does not seem like the best
> solution.
> 
> * Do you have any other option/approach in mind?
> 
> > Please consider the case where multifd recv threads may get scheduled out
> > on dest host during precopy phase, not getting chance to be scheduled until
> > postcopy already started running on dst, then the recv thread can stumble
> > upon a page that was sent during precopy.  As long as that can be always
> > avoided, I think we should be good.
> 
> * TBH, this sounds like a remote corner case.

No this is not.  As I mentioned tons of times.. copying page in socket
buffers directly into guest page during vcpus running / postcopy is a very
hard to debug issue.  If it's possible to happen, the design is flawed.

> 
> * I'm testing the revised patch series and will send it shortly.

I don't think passing the unit tests prove the series is correct and should
be merged. We need to understand how it work, or we can't merge it.

I feel very frustrated multiple times that you seem to want to ignore what
I comment.  I don't know why you rush to repost things.

After a 2nd thought, I think maybe multifd flush and sync could work on src
side indeed, because when flush and sync there'll be a message
(MULTIFD_FLUSH) on main channel and that should order against the rest
postcopy messages that will also be sent on the main channel (if we do
multifd flush before all the postcopy processes).  Then it should guarantee
when postcopy starts on dest, dest multifd recv threads flushed all
messages, and no multifd message will arrive anymore.

But again we should guard it with an assert() in recv threads if you want
to postpone recycling of multifd threads, just to double check no outliers
we overlooked.

When proposing the patches you need to justify the design first before
anything.  Please evaluate above, these things are critical to appear in
either code comments or commit messages. Please digest everything before
reposting.

Thanks,
Prasad Pandit Feb. 12, 2025, 5:36 p.m. UTC | #10
Hi,

On Wed, 12 Feb 2025 at 20:08, Peter Xu <peterx@redhat.com> wrote:
> I think it's not because TLS channels don't send magic, but TLS channels
> are not prone to ordering issues.
> In general, I'm not convinced we need to check names of iochannels.

* If the channel does not set '_READ_MSG_SEEK' flag, which magic value
are we going to read? As for checking names of the channels, it tells
the reader how an incoming channel is processed.

>> It did not work, as qemu would crash during multifd_shutdown().
>
> Have we debugged the crash? I'm not saying we should go with this, but
> crash isn't a reason to not choose a design.

* Yes, I did try to debug it but couldn't get to a conclusion in time.

> No this is not.  As I mentioned tons of times.. copying page in socket
> buffers directly into guest page during vcpus running / postcopy is a very
> hard to debug issue. If it's possible to happen, the design is flawed.

* Sure, agreed. So far in my testing it does not seem to happen. The
theory of multifd_recv_threads getting scheduled out and causing guest
page over-write seems remote to me, but I get the
possibility/probability. One possible solution is to have the
destination side send an 'acknowledgement' to the source side.

> I don't think passing the unit tests prove the series is correct and should
> be merged. We need to understand how it work, or we can't merge it.

* Well, passing unit tests should confirm that it does not break
existing functionality. Is there any metric/limit to such
understanding? Everybody understands/sees things differently.
Understanding is an ever evolving thing. Saying that merge should
happen based on understanding sounds weird to me.

> I feel very frustrated multiple times that you seem to want to ignore what
> I comment.  I don't know why you rush to repost things.

* I guess we are seeing/understanding things differently here.

> After a 2nd thought, I think maybe multifd flush and sync could work on src
> side indeed, because when flush and sync there'll be a message
> (MULTIFD_FLUSH) on main channel and that should order against the rest
> postcopy messages that will also be sent on the main channel (if we do
> multifd flush before all the postcopy processes).  Then it should guarantee
> when postcopy starts on dest, dest multifd recv threads flushed all
> messages, and no multifd message will arrive anymore.

* After a 2nd thought - that's evolving understanding right there. :)

* To mention here, to flush multifd channels with
'multifd_ram_flush_and_sync()' I tried calling

       migration_iteration_run -> multifd_ram_flush_and_sync(s->to_dst_file)

It does not work, it crashes. Is 's->to_dst_file' a right parameter there?

But calling a function below works
===
/* Send enqueued data pages onto next available multifd channel */
int multifd_send_flush(void)
{
    if (!multifd_payload_empty(multifd_ram_send)) {
        if (!multifd_send(&multifd_ram_send)) {
            error_report("%s: multifd_send fail", __func__);
            return -1;
        }
    }

    return 0;
}
===
    migration_iteration_run -> multifd_send_flush()

Works, but it is not sending the 'RAM_SAVE_FLAG_MULTIFD_FLUSH' message
as done by 'multifd_ram_flush_and_sync()' function.

> But again we should guard it with an assert() in recv threads if you want
> to postpone recycling of multifd threads, just to double check no outliers
> we overlooked.

* Yes, the revised patch is working with the
assert(!migrate_in_postcopy) in multifd_recv_thread.

* I was going to send a revised series with these changes, but will
wait on that for now.

Thank you.
---
  - Prasad
Peter Xu Feb. 12, 2025, 5:52 p.m. UTC | #11
On Wed, Feb 12, 2025 at 11:06:00PM +0530, Prasad Pandit wrote:
> * I was going to send a revised series with these changes, but will
> wait on that for now.

You were going to send some changes that you don't yet fully understand how
it works?  How would you do the flush in the new revision if your existing
code crashes?  Or did you do that in the revised series at all?

Sorry, no - I don't think this is how it should or could ever work.  I've
repeated myself multiple times, I'll stop.
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index 2d1da917c7..a280722e9e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -92,6 +92,9 @@  enum mig_rp_message_type {
     MIG_RP_MSG_MAX
 };
 
+/* Migration channel types */
+enum { CH_DEFAULT, CH_MULTIFD, CH_POSTCOPY };
+
 /* When we add fault tolerance, we could have several
    migrations at once.  For now we don't need to add
    dynamic creation of migration */
@@ -929,26 +932,33 @@  void migration_fd_process_incoming(QEMUFile *f)
 /*
  * Returns true when we want to start a new incoming migration process,
  * false otherwise.
+ *
+ * All the required channels must be in place before a new incoming
+ * migration process starts.
+ *  - Multifd enabled:
+ *    The main channel and the multifd channels are required.
+ *  - Multifd/Postcopy disabled:
+ *    The main channel is required.
+ *  - Postcopy enabled:
+ *    We don't want to start a new incoming migration when
+ *    the postcopy channel is created. Because it is created
+ *    towards the end of the precopy migration.
+ *
  */
-static bool migration_should_start_incoming(bool main_channel)
+static bool migration_should_start_incoming(uint8_t channel)
 {
-    /* Multifd doesn't start unless all channels are established */
-    if (migrate_multifd()) {
-        return migration_has_all_channels();
-    }
+    bool ret = false;
+
+    if (channel != CH_POSTCOPY) {
+        MigrationIncomingState *mis = migration_incoming_get_current();
+        ret = mis->from_src_file ? true : false;
 
-    /* Preempt channel only starts when the main channel is created */
-    if (migrate_postcopy_preempt()) {
-        return main_channel;
+        if (ret && migrate_multifd()) {
+            ret = multifd_recv_all_channels_created();
+        }
     }
 
-    /*
-     * For all the rest types of migration, we should only reach here when
-     * it's the main channel that's being created, and we should always
-     * proceed with this channel.
-     */
-    assert(main_channel);
-    return true;
+    return ret;
 }
 
 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
@@ -956,13 +966,12 @@  void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
     MigrationIncomingState *mis = migration_incoming_get_current();
     Error *local_err = NULL;
     QEMUFile *f;
-    bool default_channel = true;
     uint32_t channel_magic = 0;
+    uint8_t channel = CH_DEFAULT;
     int ret = 0;
 
-    if (migrate_multifd() && !migrate_mapped_ram() &&
-        !migrate_postcopy_ram() &&
-        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+    if (!migration_should_start_incoming(channel)) {
+        if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
         /*
          * With multiple channels, it is possible that we receive channels
          * out of order on destination side, causing incorrect mapping of
@@ -973,42 +982,58 @@  void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
          * tls handshake while initializing main channel so with tls this
          * issue is not possible.
          */
-        ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
-                                          sizeof(channel_magic), errp);
+            ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
+                                              sizeof(channel_magic), errp);
+            if (ret != 0) {
+                return;
+            }
 
-        if (ret != 0) {
-            return;
+            if (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)) {
+                channel = CH_DEFAULT;
+            } else if (channel_magic == cpu_to_be32(MULTIFD_MAGIC)) {
+                channel = CH_MULTIFD;
+            } else if (!mis->from_src_file
+                        && mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
+                /* reconnect default channel for postcopy recovery */
+                channel = CH_DEFAULT;
+            } else {
+                error_report("%s: could not identify channel, unknown magic: %u",
+                                __func__, channel_magic);
+                return;
+            }
+        } else if (mis->from_src_file
+            && (!strcmp(ioc->name, "migration-tls-incoming")
+                || !strcmp(ioc->name, "migration-file-incoming"))) {
+            channel = CH_MULTIFD;
         }
-
-        default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
-    } else {
-        default_channel = !mis->from_src_file;
+    } else if (mis->from_src_file) { // && migrate_postcopy_preempt()
+        channel = CH_POSTCOPY;
     }
 
     if (multifd_recv_setup(errp) != 0) {
         return;
     }
 
-    if (default_channel) {
+    if (channel == CH_DEFAULT) {
         f = qemu_file_new_input(ioc);
         migration_incoming_setup(f);
-    } else {
+    } else if (channel == CH_MULTIFD) {
         /* Multiple connections */
-        assert(migration_needs_multiple_sockets());
         if (migrate_multifd()) {
             multifd_recv_new_channel(ioc, &local_err);
-        } else {
-            assert(migrate_postcopy_preempt());
-            f = qemu_file_new_input(ioc);
-            postcopy_preempt_new_channel(mis, f);
         }
         if (local_err) {
             error_propagate(errp, local_err);
             return;
         }
+    } else if (channel == CH_POSTCOPY) {
+        assert(migrate_postcopy_preempt());
+        assert(!mis->postcopy_qemufile_dst);
+        f = qemu_file_new_input(ioc);
+        postcopy_preempt_new_channel(mis, f);
     }
 
-    if (migration_should_start_incoming(default_channel)) {
+    if (migration_should_start_incoming(channel)) {
         /* If it's a recovery, we're done */
         if (postcopy_try_recover()) {
             return;
@@ -1025,21 +1050,22 @@  void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
  */
 bool migration_has_all_channels(void)
 {
+    bool ret = false;
     MigrationIncomingState *mis = migration_incoming_get_current();
 
     if (!mis->from_src_file) {
-        return false;
+        return ret;
     }
 
     if (migrate_multifd()) {
-        return multifd_recv_all_channels_created();
+        ret = multifd_recv_all_channels_created();
     }
 
-    if (migrate_postcopy_preempt()) {
-        return mis->postcopy_qemufile_dst != NULL;
+    if (ret && migrate_postcopy_preempt()) {
+        ret = mis->postcopy_qemufile_dst != NULL;
     }
 
-    return true;
+    return ret;
 }
 
 int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
index 1325dba97c..d0edec7cd1 100644
--- a/migration/multifd-nocomp.c
+++ b/migration/multifd-nocomp.c
@@ -16,6 +16,7 @@ 
 #include "file.h"
 #include "multifd.h"
 #include "options.h"
+#include "migration.h"
 #include "qapi/error.h"
 #include "qemu/cutils.h"
 #include "qemu/error-report.h"
@@ -391,7 +392,7 @@  int multifd_ram_flush_and_sync(QEMUFile *f)
     MultiFDSyncReq req;
     int ret;
 
-    if (!migrate_multifd()) {
+    if (!migrate_multifd() || migration_in_postcopy()) {
         return 0;
     }
 
diff --git a/migration/options.c b/migration/options.c
index b8d5300326..8c878dea49 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -479,11 +479,6 @@  bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
             error_setg(errp, "Postcopy is not compatible with ignore-shared");
             return false;
         }
-
-        if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) {
-            error_setg(errp, "Postcopy is not yet compatible with multifd");
-            return false;
-        }
     }
 
     if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
diff --git a/migration/ram.c b/migration/ram.c
index f2326788de..bdba7abe73 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1295,7 +1295,7 @@  static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
         pss->page = 0;
         pss->block = QLIST_NEXT_RCU(pss->block, next);
         if (!pss->block) {
-            if (multifd_ram_sync_per_round()) {
+            if (multifd_ram_sync_per_round() && !migration_in_postcopy()) {
                 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                 int ret = multifd_ram_flush_and_sync(f);
                 if (ret < 0) {
@@ -1969,7 +1969,7 @@  static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
         }
     }
 
-    if (migrate_multifd()) {
+    if (migrate_multifd() && !migration_in_postcopy()) {
         RAMBlock *block = pss->block;
         return ram_save_multifd_page(block, offset);
     }