Message ID | 20230731223148.1002258-5-yuri.benditovich@daynix.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | virtio-net: add USO feature (UDP segmentation offload) | expand |
On 2023/08/01 7:31, Yuri Benditovich wrote: > USO features of virtio-net device depend on kernel ability > to support them, for backward compatibility by default the > features are disabled on 8.0 and earlier. > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > --- > hw/core/machine.c | 4 ++++ > hw/net/virtio-net.c | 31 +++++++++++++++++++++++++++++-- > 2 files changed, 33 insertions(+), 2 deletions(-) > > diff --git a/hw/core/machine.c b/hw/core/machine.c > index f0d35c6401..a725e76738 100644 > --- a/hw/core/machine.c > +++ b/hw/core/machine.c > @@ -38,10 +38,14 @@ > #include "exec/confidential-guest-support.h" > #include "hw/virtio/virtio.h" > #include "hw/virtio/virtio-pci.h" > +#include "hw/virtio/virtio-net.h" > > GlobalProperty hw_compat_8_0[] = { > { "migration", "multifd-flush-after-each-section", "on"}, > { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" }, > + { TYPE_VIRTIO_NET, "host_uso", "off"}, > + { TYPE_VIRTIO_NET, "guest_uso4", "off"}, > + { TYPE_VIRTIO_NET, "guest_uso6", "off"}, Nitpick: Add a whitespace before closing brackets '}'. 
> }; > const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0); > > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c > index d2311e7d6e..bd0ead94fe 100644 > --- a/hw/net/virtio-net.c > +++ b/hw/net/virtio-net.c > @@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n) > return n->has_ufo; > } > > +static int peer_has_uso(VirtIONet *n) > +{ > + if (!peer_has_vnet_hdr(n)) { > + return 0; > + } > + > + return qemu_has_uso(qemu_get_queue(n->nic)->peer); > +} > + > static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, > int version_1, int hash_report) > { > @@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, > virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6); > virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN); > > + virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); > + > virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); > } > > @@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, > virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO); > } > > + if (!peer_has_uso(n)) { > + virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); > + } > + > if (!get_vhost_net(nc->peer)) { > return features; > } > @@ -864,14 +883,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n) > !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6))); > } > > -static uint64_t virtio_net_guest_offloads_by_features(uint32_t features) > +static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) > { > static const uint64_t guest_offloads_mask = > (1ULL << VIRTIO_NET_F_GUEST_CSUM) | > (1ULL << VIRTIO_NET_F_GUEST_TSO4) | > (1ULL << VIRTIO_NET_F_GUEST_TSO6) | > (1ULL << 
VIRTIO_NET_F_GUEST_ECN) | > - (1ULL << VIRTIO_NET_F_GUEST_UFO); > + (1ULL << VIRTIO_NET_F_GUEST_UFO) | > + (1ULL << VIRTIO_NET_F_GUEST_USO4) | > + (1ULL << VIRTIO_NET_F_GUEST_USO6); > > return guest_offloads_mask & features; > } > @@ -3924,6 +3945,12 @@ static Property virtio_net_properties[] = { > DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN), > DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str), > DEFINE_PROP_BOOL("failover", VirtIONet, failover, false), > + DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features, > + VIRTIO_NET_F_GUEST_USO4, true), > + DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features, > + VIRTIO_NET_F_GUEST_USO6, true), > + DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features, > + VIRTIO_NET_F_HOST_USO, true), > DEFINE_PROP_END_OF_LIST(), > }; >
On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > USO features of virtio-net device depend on kernel ability > to support them, for backward compatibility by default the > features are disabled on 8.0 and earlier. > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> Looks like this patch broke migration when the VM starts on a host that has USO supported, to another host that doesn't.. Yuri, would it be possible we always keep all the USO* features off by default (so this feature bit never affects migration ABI), but then: - only enable them when the user specified ON - meanwhile, if detecting host feature doesn't support USO*, it could fail qemu from boot, rather than silently turning it from ON->OFF ? Silently flipping the bit may cause migration issues like this. Or any suggestion on how to fix migration? Thanks, > --- > hw/core/machine.c | 4 ++++ > hw/net/virtio-net.c | 31 +++++++++++++++++++++++++++++-- > 2 files changed, 33 insertions(+), 2 deletions(-) > > diff --git a/hw/core/machine.c b/hw/core/machine.c > index f0d35c6401..a725e76738 100644 > --- a/hw/core/machine.c > +++ b/hw/core/machine.c > @@ -38,10 +38,14 @@ > #include "exec/confidential-guest-support.h" > #include "hw/virtio/virtio.h" > #include "hw/virtio/virtio-pci.h" > +#include "hw/virtio/virtio-net.h" > > GlobalProperty hw_compat_8_0[] = { > { "migration", "multifd-flush-after-each-section", "on"}, > { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" }, > + { TYPE_VIRTIO_NET, "host_uso", "off"}, > + { TYPE_VIRTIO_NET, "guest_uso4", "off"}, > + { TYPE_VIRTIO_NET, "guest_uso6", "off"}, > }; > const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0); > > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c > index d2311e7d6e..bd0ead94fe 100644 > --- a/hw/net/virtio-net.c > +++ b/hw/net/virtio-net.c > @@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n) > return n->has_ufo; > } > > +static int 
peer_has_uso(VirtIONet *n) > +{ > + if (!peer_has_vnet_hdr(n)) { > + return 0; > + } > + > + return qemu_has_uso(qemu_get_queue(n->nic)->peer); > +} > + > static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, > int version_1, int hash_report) > { > @@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, > virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6); > virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN); > > + virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); > + > virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); > } > > @@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, > virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO); > } > > + if (!peer_has_uso(n)) { > + virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); > + } > + > if (!get_vhost_net(nc->peer)) { > return features; > } > @@ -864,14 +883,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n) > !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6))); > } > > -static uint64_t virtio_net_guest_offloads_by_features(uint32_t features) > +static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) > { > static const uint64_t guest_offloads_mask = > (1ULL << VIRTIO_NET_F_GUEST_CSUM) | > (1ULL << VIRTIO_NET_F_GUEST_TSO4) | > (1ULL << VIRTIO_NET_F_GUEST_TSO6) | > (1ULL << VIRTIO_NET_F_GUEST_ECN) | > - (1ULL << VIRTIO_NET_F_GUEST_UFO); > + (1ULL << VIRTIO_NET_F_GUEST_UFO) | > + (1ULL << VIRTIO_NET_F_GUEST_USO4) | > + (1ULL << VIRTIO_NET_F_GUEST_USO6); > > return guest_offloads_mask & features; > } > @@ -3924,6 +3945,12 @@ static Property virtio_net_properties[] = { > 
DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN), > DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str), > DEFINE_PROP_BOOL("failover", VirtIONet, failover, false), > + DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features, > + VIRTIO_NET_F_GUEST_USO4, true), > + DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features, > + VIRTIO_NET_F_GUEST_USO6, true), > + DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features, > + VIRTIO_NET_F_HOST_USO, true), > DEFINE_PROP_END_OF_LIST(), > }; > > -- > 2.34.3 > >
On Fri, Jul 26, 2024 at 6:19 AM Peter Xu <peterx@redhat.com> wrote: > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > USO features of virtio-net device depend on kernel ability > > to support them, for backward compatibility by default the > > features are disabled on 8.0 and earlier. > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > Looks like this patch broke migration when the VM starts on a host that has > USO supported, to another host that doesn't.. > > Yuri, would it be possible we always keep all the USO* features off by > default (so this feature bit never affects migration ABI), but then: > > - only enable them when the user specified ON > > - meanwhile, if detecting host feature doesn't support USO*, it could > fail qemu from boot, rather than silently turning it from ON->OFF > > ? I agree, I have raised the same issue several times in the past. > > Silently flipping the bit may cause migration issues like this. Looking at virtio_net_get_features(), it silently clears a lot of features... Thanks > > Or any suggestion on how to fix migration? 
> > Thanks, > > > --- > > hw/core/machine.c | 4 ++++ > > hw/net/virtio-net.c | 31 +++++++++++++++++++++++++++++-- > > 2 files changed, 33 insertions(+), 2 deletions(-) > > > > diff --git a/hw/core/machine.c b/hw/core/machine.c > > index f0d35c6401..a725e76738 100644 > > --- a/hw/core/machine.c > > +++ b/hw/core/machine.c > > @@ -38,10 +38,14 @@ > > #include "exec/confidential-guest-support.h" > > #include "hw/virtio/virtio.h" > > #include "hw/virtio/virtio-pci.h" > > +#include "hw/virtio/virtio-net.h" > > > > GlobalProperty hw_compat_8_0[] = { > > { "migration", "multifd-flush-after-each-section", "on"}, > > { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" }, > > + { TYPE_VIRTIO_NET, "host_uso", "off"}, > > + { TYPE_VIRTIO_NET, "guest_uso4", "off"}, > > + { TYPE_VIRTIO_NET, "guest_uso6", "off"}, > > }; > > const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0); > > > > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c > > index d2311e7d6e..bd0ead94fe 100644 > > --- a/hw/net/virtio-net.c > > +++ b/hw/net/virtio-net.c > > @@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n) > > return n->has_ufo; > > } > > > > +static int peer_has_uso(VirtIONet *n) > > +{ > > + if (!peer_has_vnet_hdr(n)) { > > + return 0; > > + } > > + > > + return qemu_has_uso(qemu_get_queue(n->nic)->peer); > > +} > > + > > static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, > > int version_1, int hash_report) > > { > > @@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, > > virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6); > > virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN); > > > > + virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); > > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); > > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); > > + > > virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); > > } > > > > @@ -804,6 +817,12 @@ static uint64_t 
virtio_net_get_features(VirtIODevice *vdev, uint64_t features, > > virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO); > > } > > > > + if (!peer_has_uso(n)) { > > + virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); > > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); > > + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); > > + } > > + > > if (!get_vhost_net(nc->peer)) { > > return features; > > } > > @@ -864,14 +883,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n) > > !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6))); > > } > > > > -static uint64_t virtio_net_guest_offloads_by_features(uint32_t features) > > +static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) > > { > > static const uint64_t guest_offloads_mask = > > (1ULL << VIRTIO_NET_F_GUEST_CSUM) | > > (1ULL << VIRTIO_NET_F_GUEST_TSO4) | > > (1ULL << VIRTIO_NET_F_GUEST_TSO6) | > > (1ULL << VIRTIO_NET_F_GUEST_ECN) | > > - (1ULL << VIRTIO_NET_F_GUEST_UFO); > > + (1ULL << VIRTIO_NET_F_GUEST_UFO) | > > + (1ULL << VIRTIO_NET_F_GUEST_USO4) | > > + (1ULL << VIRTIO_NET_F_GUEST_USO6); > > > > return guest_offloads_mask & features; > > } > > @@ -3924,6 +3945,12 @@ static Property virtio_net_properties[] = { > > DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN), > > DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str), > > DEFINE_PROP_BOOL("failover", VirtIONet, failover, false), > > + DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features, > > + VIRTIO_NET_F_GUEST_USO4, true), > > + DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features, > > + VIRTIO_NET_F_GUEST_USO6, true), > > + DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features, > > + VIRTIO_NET_F_HOST_USO, true), > > DEFINE_PROP_END_OF_LIST(), > > }; > > > > -- > > 2.34.3 > > > > > > -- > Peter Xu >
On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > USO features of virtio-net device depend on kernel ability > > to support them, for backward compatibility by default the > > features are disabled on 8.0 and earlier. > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > Looks like this patch broke migration when the VM starts on a host that has > USO supported, to another host that doesn't.. This was always the case with all offloads. The answer at the moment is, don't do this. Long term, we need to start exposing management APIs to discover this, and management has to disable unsupported features.
On 26/07/2024 08.08, Michael S. Tsirkin wrote: > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>> USO features of virtio-net device depend on kernel ability >>> to support them, for backward compatibility by default the >>> features are disabled on 8.0 and earlier. >>> >>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >> >> Looks like this patch broke migration when the VM starts on a host that has >> USO supported, to another host that doesn't.. > > This was always the case with all offloads. The answer at the moment is, > don't do this. May I ask for my understanding: "don't do this" = don't automatically enable/disable virtio features in QEMU depending on host kernel features, or "don't do this" = don't try to migrate between machines that have different host kernel features? > Long term, we need to start exposing management APIs > to discover this, and management has to disable unsupported features. Ack, this likely needs some treatments from the libvirt side, too. Thomas
On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > On 26/07/2024 08.08, Michael S. Tsirkin wrote: > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > > USO features of virtio-net device depend on kernel ability > > > > to support them, for backward compatibility by default the > > > > features are disabled on 8.0 and earlier. > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > > > Looks like this patch broke migration when the VM starts on a host that has > > > USO supported, to another host that doesn't.. > > > > This was always the case with all offloads. The answer at the moment is, > > don't do this. > > May I ask for my understanding: > "don't do this" = don't automatically enable/disable virtio features in QEMU > depending on host kernel features, or "don't do this" = don't try to migrate > between machines that have different host kernel features? The latter. > > Long term, we need to start exposing management APIs > > to discover this, and management has to disable unsupported features. > > Ack, this likely needs some treatments from the libvirt side, too. > > Thomas
On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > On 26/07/2024 08.08, Michael S. Tsirkin wrote: > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > > USO features of virtio-net device depend on kernel ability > > > > to support them, for backward compatibility by default the > > > > features are disabled on 8.0 and earlier. > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > > > Looks like this patch broke migration when the VM starts on a host that has > > > USO supported, to another host that doesn't.. > > > > This was always the case with all offloads. The answer at the moment is, > > don't do this. > > May I ask for my understanding: > "don't do this" = don't automatically enable/disable virtio features in QEMU > depending on host kernel features, or "don't do this" = don't try to migrate > between machines that have different host kernel features? > > > Long term, we need to start exposing management APIs > > to discover this, and management has to disable unsupported features. > > Ack, this likely needs some treatments from the libvirt side, too. When QEMU automatically toggles machine type features based on host kernel, relying on libvirt to then disable them again is impractical, as we cannot assume that the libvirt that people are using knows about newly introduced features. Even if libvirt is updated to know about it, people can easily be using a previous libvirt release. QEMU itself needs to make the machine types do what they are there to do, which is to define a stable machine ABI. What QEMU is missing here is a "platform ABI" concept, to encode sets of features which are tied to specific platform generations. 
As long as we don't have that we'll keep having these broken migration problems from machine types dynamically changing instead of providing a stable guest ABI. With regards, Daniel
On Fri, Jul 26, 2024 at 03:25:31AM -0400, Michael S. Tsirkin wrote: > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > > On 26/07/2024 08.08, Michael S. Tsirkin wrote: > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > > > USO features of virtio-net device depend on kernel ability > > > > > to support them, for backward compatibility by default the > > > > > features are disabled on 8.0 and earlier. > > > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > > > > > Looks like this patch broke migration when the VM starts on a host that has > > > > USO supported, to another host that doesn't.. > > > > > > This was always the case with all offloads. The answer at the moment is, > > > don't do this. > > > > May I ask for my understanding: > > "don't do this" = don't automatically enable/disable virtio features in QEMU > > depending on host kernel features, or "don't do this" = don't try to migrate > > between machines that have different host kernel features? > > The later. The question is how should a user know a migration is not supported? The user can be using exactly the same QEMU binary on two hosts, while there can be a tiny slight difference in host kernel version, then migration can fail between them mysteriously. There're too many kernel features that can be on/off when kernels are different, even if slightly. Then I don't see how someone can even identify such issue, unless one uses exactly the same host kernels on both sides..
On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > > On 26/07/2024 08.08, Michael S. Tsirkin wrote: > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > > > USO features of virtio-net device depend on kernel ability > > > > > to support them, for backward compatibility by default the > > > > > features are disabled on 8.0 and earlier. > > > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > > > > > Looks like this patch broke migration when the VM starts on a host that has > > > > USO supported, to another host that doesn't.. > > > > > > This was always the case with all offloads. The answer at the moment is, > > > don't do this. > > > > May I ask for my understanding: > > "don't do this" = don't automatically enable/disable virtio features in QEMU > > depending on host kernel features, or "don't do this" = don't try to migrate > > between machines that have different host kernel features? > > > > > Long term, we need to start exposing management APIs > > > to discover this, and management has to disable unsupported features. > > > > Ack, this likely needs some treatments from the libvirt side, too. > > When QEMU automatically toggles machine type featuers based on host > kernel, relying on libvirt to then disable them again is impractical, > as we cannot assume that the libvirt people are using knows about > newly introduced features. Even if libvirt is updated to know about > it, people can easily be using a previous libvirt release. > > QEMU itself needs to make the machine types do that they are there > todo, which is to define a stable machine ABI. > > What QEMU is missing here is a "platform ABI" concept, to encode > sets of features which are tied to specific platform generations. 
> As long as we don't have that we'll keep having these broken > migration problems from machine types dynamically changing instead > of providing a stable guest ABI. Any more elaboration on this idea? Would it be easily feasible in implementation? I'd second any sane solution so that we can avoid similar breakages happening in the future. I also wonder what else might be easily affected like this too when migration can break with changed kernel or changed HW. I suppose the CPU model is well covered by Libvirt so we're fine at least on x86 etc. While IIUC KVM always has such thoughts in mind, so that KVM will make sure to not break a userspace in such way or it'll simply be a KVM bug and fixed. Thanks,
On Fri, Jul 26, 2024 at 10:12:31AM +0800, Jason Wang wrote: > On Fri, Jul 26, 2024 at 6:19 AM Peter Xu <peterx@redhat.com> wrote: > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > USO features of virtio-net device depend on kernel ability > > > to support them, for backward compatibility by default the > > > features are disabled on 8.0 and earlier. > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > Looks like this patch broke migration when the VM starts on a host that has > > USO supported, to another host that doesn't.. > > > > Yuri, would it be possible we always keep all the USO* features off by > > default (so this feature bit never affects migration ABI), but then: > > > > - only enable them when the user specified ON > > > > - meanwhile, if detecting host feature doesn't support USO*, it could > > fail qemu from boot, rather than silently turning it from ON->OFF > > > > ? > > I agree, I have raised the same issue several times in the past. > > > > > Silently flipping the bit may cause migration issues like this. > > Looking at virtio_net_get_features(), it silently clears a lot of features... Yes.. :-( I saw that too when looking at this. Is it because most of those features are supported on most of the kernels, so we're good until now by chance? While it looks like e.g. TUN_F_USO4 was supported only since ~1.5 years ago so it looks relatively new. Thanks,
On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > > > On 26/07/2024 08.08, Michael S. Tsirkin wrote: > > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > > > > USO features of virtio-net device depend on kernel ability > > > > > > to support them, for backward compatibility by default the > > > > > > features are disabled on 8.0 and earlier. > > > > > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > > > > > > > Looks like this patch broke migration when the VM starts on a host that has > > > > > USO supported, to another host that doesn't.. > > > > > > > > This was always the case with all offloads. The answer at the moment is, > > > > don't do this. > > > > > > May I ask for my understanding: > > > "don't do this" = don't automatically enable/disable virtio features in QEMU > > > depending on host kernel features, or "don't do this" = don't try to migrate > > > between machines that have different host kernel features? > > > > > > > Long term, we need to start exposing management APIs > > > > to discover this, and management has to disable unsupported features. > > > > > > Ack, this likely needs some treatments from the libvirt side, too. > > > > When QEMU automatically toggles machine type featuers based on host > > kernel, relying on libvirt to then disable them again is impractical, > > as we cannot assume that the libvirt people are using knows about > > newly introduced features. Even if libvirt is updated to know about > > it, people can easily be using a previous libvirt release. > > > > QEMU itself needs to make the machine types do that they are there > > todo, which is to define a stable machine ABI. 
> > > > What QEMU is missing here is a "platform ABI" concept, to encode > > sets of features which are tied to specific platform generations. > > As long as we don't have that we'll keep having these broken > > migration problems from machine types dynamically changing instead > > of providing a stable guest ABI. > > Any more elaboration on this idea? Would it be easily feasible in > implementation? In terms of launching QEMU I'd imagine: $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... Any virtual machine HW features which are tied to host kernel features would have their defaults set based on the requested -platform. The -machine will be fully invariant wrt the host kernel. You would have -platform help to list available platforms, and corresponding QMP "query-platforms" command to list what platforms are supported on a given host OS. Downstream distros can provide their own platform definitions (eg "linux-rhel-9.5") if they have kernels whose feature set diverges from upstream due to backports. Mgmt apps won't need to be taught about every single little QEMU setting whose default is derived from the kernel. Individual defaults are opaque and controlled by the requested platform. Live migration has clearly defined semantics, and mgmt app can use query-platforms to validate two hosts are compatible. Omitting -platform should pick the very latest platform that is compatible with the current host (not necessarily the latest platform built-in to QEMU). With regards, Daniel
On 26/07/2024 09.25, Michael S. Tsirkin wrote: > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: >> On 26/07/2024 08.08, Michael S. Tsirkin wrote: >>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>>>> USO features of virtio-net device depend on kernel ability >>>>> to support them, for backward compatibility by default the >>>>> features are disabled on 8.0 and earlier. >>>>> >>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >>>> >>>> Looks like this patch broke migration when the VM starts on a host that has >>>> USO supported, to another host that doesn't.. >>> >>> This was always the case with all offloads. The answer at the moment is, >>> don't do this. >> >> May I ask for my understanding: >> "don't do this" = don't automatically enable/disable virtio features in QEMU >> depending on host kernel features, or "don't do this" = don't try to migrate >> between machines that have different host kernel features? > > The later. From my experience, it should rather be the former. We've seen similar issues with the s390x machine in the past when trying to automatically enable features depending on the availability of a kernel features. While it looks nicer at a very first glance ("hey, a new feature is available, we enable that for you, dear user!"), you end up in migration hell pretty quickly. Maybe we could elevate the "--nodefaults" command line switch to avoid enabling such features automatically? Anyway, while we're discussing solutions: We are in softfreeze already. Should we disable the UFO bits in the new 9.1 machine type for the time being to avoid that more people are running into this problem? Thomas
On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > > On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > > > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > > > > On 26/07/2024 08.08, Michael S. Tsirkin wrote: > > > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > > > > > USO features of virtio-net device depend on kernel ability > > > > > > > to support them, for backward compatibility by default the > > > > > > > features are disabled on 8.0 and earlier. > > > > > > > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > > > > > > > > > Looks like this patch broke migration when the VM starts on a host that has > > > > > > USO supported, to another host that doesn't.. > > > > > > > > > > This was always the case with all offloads. The answer at the moment is, > > > > > don't do this. > > > > > > > > May I ask for my understanding: > > > > "don't do this" = don't automatically enable/disable virtio features in QEMU > > > > depending on host kernel features, or "don't do this" = don't try to migrate > > > > between machines that have different host kernel features? > > > > > > > > > Long term, we need to start exposing management APIs > > > > > to discover this, and management has to disable unsupported features. > > > > > > > > Ack, this likely needs some treatments from the libvirt side, too. > > > > > > When QEMU automatically toggles machine type featuers based on host > > > kernel, relying on libvirt to then disable them again is impractical, > > > as we cannot assume that the libvirt people are using knows about > > > newly introduced features. Even if libvirt is updated to know about > > > it, people can easily be using a previous libvirt release. 
> > > > > > QEMU itself needs to make the machine types do that they are there > > > todo, which is to define a stable machine ABI. > > > > > > What QEMU is missing here is a "platform ABI" concept, to encode > > > sets of features which are tied to specific platform generations. > > > As long as we don't have that we'll keep having these broken > > > migration problems from machine types dynamically changing instead > > > of providing a stable guest ABI. > > > > Any more elaboration on this idea? Would it be easily feasible in > > implementation? > > In terms of launching QEMU I'd imagine: > > $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > > Any virtual machine HW features which are tied to host kernel features > would have their defaults set based on the requested -platform. The > -machine will be fully invariant wrt the host kernel. > > You would have -platform hlep to list available platforms, and > corresonding QMP "query-platforms" command to list what platforms > are supported on a given host OS. > > Downstream distros can provide their own platforms definitions > (eg "linux-rhel-9.5") if they have kernels whose feature set > diverges from upstream due to backports. > > Mgmt apps won't need to be taught about every single little QEMU > setting whose default is derived from the kernel. Individual > defaults are opaque and controlled by the requested platform. > > Live migration has clearly defined semantics, and mgmt app can > use query-platforms to validate two hosts are compatible. > > Omitting -platform should pick the very latest platform that is > cmpatible with the current host (not neccessarily the latest > platform built-in to QEMU). This seems to add one more layer to maintain, and so far I don't know whether it's a must. To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I thought it was mostly the case already, except some extremely rare outliers. 
When we have one host that boots up a VM using: $QEMU1 $cmdline Then another host boots up: $QEMU2 $cmdline -incoming XXX Then migration should succeed if $cmdline is exactly the same, and the VM can boot up all fine without errors on both sides. AFAICT this has nothing to do with what kernel is underneath, even not Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it didn't, I thought the ABI should be guaranteed. That's why I think this is a migration violation, as 99.99% of other device properties should be following this rule. The issue here is, we have the same virtio-net-pci cmdline on both sides in this case, but the ABI got broken. That's also why I was suggesting if the property contributes to the guest ABI, then AFAIU QEMU needs to: - Firstly, never quietly flipping any bit that affects the ABI... - Have a default value of off, then QEMU will always allow the VM to boot by default, while advanced users can opt-in on new features. We can't make this ON by default otherwise some VMs can already fail to boot, - If the host doesn't support the feature while the cmdline enabled it, it needs to fail QEMU boot rather than flipping, so that it says "hey, this host does not support running such VM specified, due to XXX feature missing". That's the only way a user could understand what happened, and IMHO that's a clean way that we stick with QEMU cmdline on defining the guest ABI, while in which the machine type is the foundation of such definition, as the machine type can decide many of the rest compat properties. And that's the whole point of the compat properties too (to make sure the guest ABI is stable). If kernel breaks it easily, all compat property things that we maintain can already stop making sense in general, because it didn't define the whole guest ABI.. So AFAIU that's really what we used for years, I hope I didn't overlook something. And maybe we don't yet need the "-platform" layer if we can keep up with this rule? Thanks,
On Fri, Jul 26, 2024 at 07:39:46PM +0200, Thomas Huth wrote: > On 26/07/2024 09.25, Michael S. Tsirkin wrote: > > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > > > On 26/07/2024 08.08, Michael S. Tsirkin wrote: > > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > > > > USO features of virtio-net device depend on kernel ability > > > > > > to support them, for backward compatibility by default the > > > > > > features are disabled on 8.0 and earlier. > > > > > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > > > > > > > Looks like this patch broke migration when the VM starts on a host that has > > > > > USO supported, to another host that doesn't.. > > > > > > > > This was always the case with all offloads. The answer at the moment is, > > > > don't do this. > > > > > > May I ask for my understanding: > > > "don't do this" = don't automatically enable/disable virtio features in QEMU > > > depending on host kernel features, or "don't do this" = don't try to migrate > > > between machines that have different host kernel features? > > > > The later. > > From my experience, it should rather be the former. We've seen similar > issues with the s390x machine in the past when trying to automatically > enable features depending on the availability of a kernel features. While it > looks nicer at a very first glance ("hey, a new feature is available, we > enable that for you, dear user!"), you end up in migration hell pretty > quickly. > > Maybe we could elevate the "--nodefaults" command line switch to avoid > enabling such features automatically? > > Anyway, while we're discussing solutions: We are in softfreeze already. > Should we disable the UFO bits in the new 9.1 machine type for the time > being to avoid that more people are running into this problem? 
Probably too late for this one; this patch was merged in 8.2. Unfortunately CIs won't even cover a test across two host kernels, and even so it would need to be unlucky enough that one host has USO and the other does not.. But I do agree with Thomas here. I think the only feature that can be auto-enabled is the ones that do not affect guest ABI. When affected, the only right way to me to enable them should be exporting -device interface so that Libvirt can opt-in on enabling them when the host support is detected. For QEMU users, that means user needs to explicitly enable them or they're off. Or, there's also another option that we turn default to ON for such feature, but only when most of the kernels should support it. With that, we can set OFF in compat property for old machines, and we should fail the new machine from boot when running on an old kernel without the feature. Thanks,
On 2024/07/27 5:47, Peter Xu wrote: > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: >>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: >>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: >>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: >>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>>>>>>> USO features of virtio-net device depend on kernel ability >>>>>>>> to support them, for backward compatibility by default the >>>>>>>> features are disabled on 8.0 and earlier. >>>>>>>> >>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >>>>>>> >>>>>>> Looks like this patch broke migration when the VM starts on a host that has >>>>>>> USO supported, to another host that doesn't.. >>>>>> >>>>>> This was always the case with all offloads. The answer at the moment is, >>>>>> don't do this. >>>>> >>>>> May I ask for my understanding: >>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU >>>>> depending on host kernel features, or "don't do this" = don't try to migrate >>>>> between machines that have different host kernel features? >>>>> >>>>>> Long term, we need to start exposing management APIs >>>>>> to discover this, and management has to disable unsupported features. >>>>> >>>>> Ack, this likely needs some treatments from the libvirt side, too. >>>> >>>> When QEMU automatically toggles machine type featuers based on host >>>> kernel, relying on libvirt to then disable them again is impractical, >>>> as we cannot assume that the libvirt people are using knows about >>>> newly introduced features. Even if libvirt is updated to know about >>>> it, people can easily be using a previous libvirt release. 
>>>> >>>> QEMU itself needs to make the machine types do that they are there >>>> todo, which is to define a stable machine ABI. >>>> >>>> What QEMU is missing here is a "platform ABI" concept, to encode >>>> sets of features which are tied to specific platform generations. >>>> As long as we don't have that we'll keep having these broken >>>> migration problems from machine types dynamically changing instead >>>> of providing a stable guest ABI. >>> >>> Any more elaboration on this idea? Would it be easily feasible in >>> implementation? >> >> In terms of launching QEMU I'd imagine: >> >> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... >> >> Any virtual machine HW features which are tied to host kernel features >> would have their defaults set based on the requested -platform. The >> -machine will be fully invariant wrt the host kernel. >> >> You would have -platform hlep to list available platforms, and >> corresonding QMP "query-platforms" command to list what platforms >> are supported on a given host OS. >> >> Downstream distros can provide their own platforms definitions >> (eg "linux-rhel-9.5") if they have kernels whose feature set >> diverges from upstream due to backports. >> >> Mgmt apps won't need to be taught about every single little QEMU >> setting whose default is derived from the kernel. Individual >> defaults are opaque and controlled by the requested platform. >> >> Live migration has clearly defined semantics, and mgmt app can >> use query-platforms to validate two hosts are compatible. >> >> Omitting -platform should pick the very latest platform that is >> cmpatible with the current host (not neccessarily the latest >> platform built-in to QEMU). > > This seems to add one more layer to maintain, and so far I don't know > whether it's a must. > > To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > thought it was mostly the case already, except some extremely rare > outliers. 
> > When we have one host that boots up a VM using: > > $QEMU1 $cmdline > > Then another host boots up: > > $QEMU2 $cmdline -incoming XXX > > Then migration should succeed if $cmdline is exactly the same, and the VM > can boot up all fine without errors on both sides. > > AFAICT this has nothing to do with what kernel is underneath, even not > Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > didn't, I thought the ABI should be guaranteed. > > That's why I think this is a migration violation, as 99.99% of other device > properties should be following this rule. The issue here is, we have the > same virtio-net-pci cmdline on both sides in this case, but the ABI got > break. > > That's also why I was suggesting if the property contributes to the guest > ABI, then AFAIU QEMU needs to: > > - Firstly, never quietly flipping any bit that affects the ABI... > > - Have a default value of off, then QEMU will always allow the VM to boot > by default, while advanced users can opt-in on new features. We can't > make this ON by default otherwise some VMs can already fail to boot, It may not be necessary the case that old features are supported by every systems. In an extreme case, a user may migrate a VM from Linux to Windows, which probably doesn't support any offloading at all. A more convincing scenario is RSS offloading with eBPF; using eBPF requires a privilege so we cannot assume it is always available even on the latest version of Linux. > > - If the host doesn't support the feature while the cmdline enabled it, > it needs to fail QEMU boot rather than flipping, so that it says "hey, > this host does not support running such VM specified, due to XXX > feature missing". 
This is handled in: "virtio-net: Convert feature properties to OnOffAuto" https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/ > > That's the only way an user could understand what happened, and IMHO that's > a clean way that we stick with QEMU cmdline on defining the guest ABI, > while in which the machine type is the fundation of such definition, as the > machine type can decides many of the rest compat properties. And that's > the whole point of the compat properties too (to make sure the guest ABI is > stable). > > If kernel breaks it easily, all compat property things that we maintain can > already stop making sense in general, because it didn't define the whol > guest ABI.. > > So AFAIU that's really what we used for years, I hope I didn't overlook > somehting. And maybe we don't yet need the "-platform" layer if we can > keep up with this rule? I think a device which cannot conform to that rule should be non-migratable. For example, virtio-gpu-gl does not conform to it, and does not support migration either. Regards, Akihiko Odaki
On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2024/07/27 5:47, Peter Xu wrote: > > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > >> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > >>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > >>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > >>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: > >>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > >>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > >>>>>>>> USO features of virtio-net device depend on kernel ability > >>>>>>>> to support them, for backward compatibility by default the > >>>>>>>> features are disabled on 8.0 and earlier. > >>>>>>>> > >>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > >>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > >>>>>>> > >>>>>>> Looks like this patch broke migration when the VM starts on a host that has > >>>>>>> USO supported, to another host that doesn't.. > >>>>>> > >>>>>> This was always the case with all offloads. The answer at the moment is, > >>>>>> don't do this. > >>>>> > >>>>> May I ask for my understanding: > >>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU > >>>>> depending on host kernel features, or "don't do this" = don't try to migrate > >>>>> between machines that have different host kernel features? > >>>>> > >>>>>> Long term, we need to start exposing management APIs > >>>>>> to discover this, and management has to disable unsupported features. > >>>>> > >>>>> Ack, this likely needs some treatments from the libvirt side, too. > >>>> > >>>> When QEMU automatically toggles machine type featuers based on host > >>>> kernel, relying on libvirt to then disable them again is impractical, > >>>> as we cannot assume that the libvirt people are using knows about > >>>> newly introduced features. 
Even if libvirt is updated to know about > >>>> it, people can easily be using a previous libvirt release. > >>>> > >>>> QEMU itself needs to make the machine types do that they are there > >>>> todo, which is to define a stable machine ABI. > >>>> > >>>> What QEMU is missing here is a "platform ABI" concept, to encode > >>>> sets of features which are tied to specific platform generations. > >>>> As long as we don't have that we'll keep having these broken > >>>> migration problems from machine types dynamically changing instead > >>>> of providing a stable guest ABI. > >>> > >>> Any more elaboration on this idea? Would it be easily feasible in > >>> implementation? > >> > >> In terms of launching QEMU I'd imagine: > >> > >> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > >> > >> Any virtual machine HW features which are tied to host kernel features > >> would have their defaults set based on the requested -platform. The > >> -machine will be fully invariant wrt the host kernel. > >> > >> You would have -platform hlep to list available platforms, and > >> corresonding QMP "query-platforms" command to list what platforms > >> are supported on a given host OS. > >> > >> Downstream distros can provide their own platforms definitions > >> (eg "linux-rhel-9.5") if they have kernels whose feature set > >> diverges from upstream due to backports. > >> > >> Mgmt apps won't need to be taught about every single little QEMU > >> setting whose default is derived from the kernel. Individual > >> defaults are opaque and controlled by the requested platform. > >> > >> Live migration has clearly defined semantics, and mgmt app can > >> use query-platforms to validate two hosts are compatible. > >> > >> Omitting -platform should pick the very latest platform that is > >> cmpatible with the current host (not neccessarily the latest > >> platform built-in to QEMU). > > > > This seems to add one more layer to maintain, and so far I don't know > > whether it's a must. 
> > > > To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > > thought it was mostly the case already, except some extremely rare > > outliers. > > > > When we have one host that boots up a VM using: > > > > $QEMU1 $cmdline > > > > Then another host boots up: > > > > $QEMU2 $cmdline -incoming XXX > > > > Then migration should succeed if $cmdline is exactly the same, and the VM > > can boot up all fine without errors on both sides. > > > > AFAICT this has nothing to do with what kernel is underneath, even not > > Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > > didn't, I thought the ABI should be guaranteed. > > > > That's why I think this is a migration violation, as 99.99% of other device > > properties should be following this rule. The issue here is, we have the > > same virtio-net-pci cmdline on both sides in this case, but the ABI got > > break. > > > > That's also why I was suggesting if the property contributes to the guest > > ABI, then AFAIU QEMU needs to: > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > by default, while advanced users can opt-in on new features. We can't > > make this ON by default otherwise some VMs can already fail to boot, > > It may not be necessary the case that old features are supported by > every systems. In an extreme case, a user may migrate a VM from Linux to > Windows, which probably doesn't support any offloading at all. A more > convincing scenario is RSS offloading with eBPF; using eBPF requires a > privilege so we cannot assume it is always available even on the latest > version of Linux. I don't get why eBPF matters here. It is something that is not noticed by the guest and we have a fallback anyhow. 
> > > > > - If the host doesn't support the feature while the cmdline enabled it, > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > this host does not support running such VM specified, due to XXX > > feature missing". > > This is handled in: > > "virtio-net: Convert feature properties to OnOffAuto" > https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/ I may miss something but I think "Auto" doesn't make sense to libvirt. > > > > > That's the only way an user could understand what happened, and IMHO that's > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > while in which the machine type is the fundation of such definition, as the > > machine type can decides many of the rest compat properties. And that's > > the whole point of the compat properties too (to make sure the guest ABI is > > stable). > > > > If kernel breaks it easily, all compat property things that we maintain can > > already stop making sense in general, because it didn't define the whol > > guest ABI.. > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > somehting. And maybe we don't yet need the "-platform" layer if we can > > keep up with this rule? > > I think a device which cannot conform to that rule should be > non-migratable. For example, virtio-gpu-gl does not conform to it, and > does not support migration either. > > Regards, > Akihiko Odaki > Thanks
On Fri, Jul 26, 2024 at 2:08 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > USO features of virtio-net device depend on kernel ability > > > to support them, for backward compatibility by default the > > > features are disabled on 8.0 and earlier. > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > Looks like this patch broke migration when the VM starts on a host that has > > USO supported, to another host that doesn't.. > > This was always the case with all offloads. The answer at the moment is, > don't do this. Sometimes, it's not easy for management to know this. For example, in the past we suffered from the removal of UFO .... > Long term, we need to start exposing management APIs > to discover this, and management has to disable unsupported features. > > -- > MST > Thanks
On 2024/07/29 12:50, Jason Wang wrote: > On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2024/07/27 5:47, Peter Xu wrote: >>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: >>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: >>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: >>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: >>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>>>>>>>>> USO features of virtio-net device depend on kernel ability >>>>>>>>>> to support them, for backward compatibility by default the >>>>>>>>>> features are disabled on 8.0 and earlier. >>>>>>>>>> >>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >>>>>>>>> >>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has >>>>>>>>> USO supported, to another host that doesn't.. >>>>>>>> >>>>>>>> This was always the case with all offloads. The answer at the moment is, >>>>>>>> don't do this. >>>>>>> >>>>>>> May I ask for my understanding: >>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU >>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate >>>>>>> between machines that have different host kernel features? >>>>>>> >>>>>>>> Long term, we need to start exposing management APIs >>>>>>>> to discover this, and management has to disable unsupported features. >>>>>>> >>>>>>> Ack, this likely needs some treatments from the libvirt side, too. 
>>>>>> >>>>>> When QEMU automatically toggles machine type featuers based on host >>>>>> kernel, relying on libvirt to then disable them again is impractical, >>>>>> as we cannot assume that the libvirt people are using knows about >>>>>> newly introduced features. Even if libvirt is updated to know about >>>>>> it, people can easily be using a previous libvirt release. >>>>>> >>>>>> QEMU itself needs to make the machine types do that they are there >>>>>> todo, which is to define a stable machine ABI. >>>>>> >>>>>> What QEMU is missing here is a "platform ABI" concept, to encode >>>>>> sets of features which are tied to specific platform generations. >>>>>> As long as we don't have that we'll keep having these broken >>>>>> migration problems from machine types dynamically changing instead >>>>>> of providing a stable guest ABI. >>>>> >>>>> Any more elaboration on this idea? Would it be easily feasible in >>>>> implementation? >>>> >>>> In terms of launching QEMU I'd imagine: >>>> >>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... >>>> >>>> Any virtual machine HW features which are tied to host kernel features >>>> would have their defaults set based on the requested -platform. The >>>> -machine will be fully invariant wrt the host kernel. >>>> >>>> You would have -platform hlep to list available platforms, and >>>> corresonding QMP "query-platforms" command to list what platforms >>>> are supported on a given host OS. >>>> >>>> Downstream distros can provide their own platforms definitions >>>> (eg "linux-rhel-9.5") if they have kernels whose feature set >>>> diverges from upstream due to backports. >>>> >>>> Mgmt apps won't need to be taught about every single little QEMU >>>> setting whose default is derived from the kernel. Individual >>>> defaults are opaque and controlled by the requested platform. >>>> >>>> Live migration has clearly defined semantics, and mgmt app can >>>> use query-platforms to validate two hosts are compatible. 
>>>> >>>> Omitting -platform should pick the very latest platform that is >>>> cmpatible with the current host (not neccessarily the latest >>>> platform built-in to QEMU). >>> >>> This seems to add one more layer to maintain, and so far I don't know >>> whether it's a must. >>> >>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I >>> thought it was mostly the case already, except some extremely rare >>> outliers. >>> >>> When we have one host that boots up a VM using: >>> >>> $QEMU1 $cmdline >>> >>> Then another host boots up: >>> >>> $QEMU2 $cmdline -incoming XXX >>> >>> Then migration should succeed if $cmdline is exactly the same, and the VM >>> can boot up all fine without errors on both sides. >>> >>> AFAICT this has nothing to do with what kernel is underneath, even not >>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it >>> didn't, I thought the ABI should be guaranteed. >>> >>> That's why I think this is a migration violation, as 99.99% of other device >>> properties should be following this rule. The issue here is, we have the >>> same virtio-net-pci cmdline on both sides in this case, but the ABI got >>> break. >>> >>> That's also why I was suggesting if the property contributes to the guest >>> ABI, then AFAIU QEMU needs to: >>> >>> - Firstly, never quietly flipping any bit that affects the ABI... >>> >>> - Have a default value of off, then QEMU will always allow the VM to boot >>> by default, while advanced users can opt-in on new features. We can't >>> make this ON by default otherwise some VMs can already fail to boot, >> >> It may not be necessary the case that old features are supported by >> every systems. In an extreme case, a user may migrate a VM from Linux to >> Windows, which probably doesn't support any offloading at all. 
A more >> convincing scenario is RSS offloading with eBPF; using eBPF requires a >> privilege so we cannot assume it is always available even on the latest >> version of Linux. > > I don't get why eBPF matters here. It is something that is not noticed > by the guest and we have a fallback anyhow. > >> >>> >>> - If the host doesn't support the feature while the cmdline enabled it, >>> it needs to fail QEMU boot rather than flipping, so that it says "hey, >>> this host does not support running such VM specified, due to XXX >>> feature missing". >> >> This is handled in: >> >> "virtio-net: Convert feature properties to OnOffAuto" >> https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/ > > I may miss something but I think "Auto" doesn't make sense to libvirt. The point is libvirt can explicitly set "on" to avoid the "auto" behavior. libvirt does not have to use the "auto" value. libvirt can still use "auto" if desired. virDomainNetDefParseXMLDriver() in libvirt actually parses tristate values (libvirt uses "default" instead of "auto" as the mnemonic) for these features though "default" is currently disabled by the schema (src/conf/schemas/domaincommon.rng). Allowing user to specify "default" is only a matter of editing the schema. Of course specifying "default" will make the VM unsafe for migration. Regards, Akihiko Odaki
On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: > On 2024/07/29 12:50, Jason Wang wrote: > > On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > > > > > On 2024/07/27 5:47, Peter Xu wrote: > > > > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > > > > > On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > > > > > > On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > > > > > > > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > > > > > > > > On 26/07/2024 08.08, Michael S. Tsirkin wrote: > > > > > > > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > > > > > > > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > > > > > > > > > > > USO features of virtio-net device depend on kernel ability > > > > > > > > > > > to support them, for backward compatibility by default the > > > > > > > > > > > features are disabled on 8.0 and earlier. > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > > > > > > > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > > > > > > > > > > > > > > > > > > > > Looks like this patch broke migration when the VM starts on a host that has > > > > > > > > > > USO supported, to another host that doesn't.. > > > > > > > > > > > > > > > > > > This was always the case with all offloads. The answer at the moment is, > > > > > > > > > don't do this. > > > > > > > > > > > > > > > > May I ask for my understanding: > > > > > > > > "don't do this" = don't automatically enable/disable virtio features in QEMU > > > > > > > > depending on host kernel features, or "don't do this" = don't try to migrate > > > > > > > > between machines that have different host kernel features? > > > > > > > > > > > > > > > > > Long term, we need to start exposing management APIs > > > > > > > > > to discover this, and management has to disable unsupported features. 
> > > > > > > > > > > > > > > > Ack, this likely needs some treatments from the libvirt side, too. > > > > > > > > > > > > > > When QEMU automatically toggles machine type featuers based on host > > > > > > > kernel, relying on libvirt to then disable them again is impractical, > > > > > > > as we cannot assume that the libvirt people are using knows about > > > > > > > newly introduced features. Even if libvirt is updated to know about > > > > > > > it, people can easily be using a previous libvirt release. > > > > > > > > > > > > > > QEMU itself needs to make the machine types do that they are there > > > > > > > todo, which is to define a stable machine ABI. > > > > > > > > > > > > > > What QEMU is missing here is a "platform ABI" concept, to encode > > > > > > > sets of features which are tied to specific platform generations. > > > > > > > As long as we don't have that we'll keep having these broken > > > > > > > migration problems from machine types dynamically changing instead > > > > > > > of providing a stable guest ABI. > > > > > > > > > > > > Any more elaboration on this idea? Would it be easily feasible in > > > > > > implementation? > > > > > > > > > > In terms of launching QEMU I'd imagine: > > > > > > > > > > $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > > > > > > > > > > Any virtual machine HW features which are tied to host kernel features > > > > > would have their defaults set based on the requested -platform. The > > > > > -machine will be fully invariant wrt the host kernel. > > > > > > > > > > You would have -platform hlep to list available platforms, and > > > > > corresonding QMP "query-platforms" command to list what platforms > > > > > are supported on a given host OS. > > > > > > > > > > Downstream distros can provide their own platforms definitions > > > > > (eg "linux-rhel-9.5") if they have kernels whose feature set > > > > > diverges from upstream due to backports. 
> > > > > > > > > > Mgmt apps won't need to be taught about every single little QEMU > > > > > setting whose default is derived from the kernel. Individual > > > > > defaults are opaque and controlled by the requested platform. > > > > > > > > > > Live migration has clearly defined semantics, and mgmt app can > > > > > use query-platforms to validate two hosts are compatible. > > > > > > > > > > Omitting -platform should pick the very latest platform that is > > > > > cmpatible with the current host (not neccessarily the latest > > > > > platform built-in to QEMU). > > > > > > > > This seems to add one more layer to maintain, and so far I don't know > > > > whether it's a must. > > > > > > > > To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > > > > thought it was mostly the case already, except some extremely rare > > > > outliers. > > > > > > > > When we have one host that boots up a VM using: > > > > > > > > $QEMU1 $cmdline > > > > > > > > Then another host boots up: > > > > > > > > $QEMU2 $cmdline -incoming XXX > > > > > > > > Then migration should succeed if $cmdline is exactly the same, and the VM > > > > can boot up all fine without errors on both sides. > > > > > > > > AFAICT this has nothing to do with what kernel is underneath, even not > > > > Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > > > > didn't, I thought the ABI should be guaranteed. > > > > > > > > That's why I think this is a migration violation, as 99.99% of other device > > > > properties should be following this rule. The issue here is, we have the > > > > same virtio-net-pci cmdline on both sides in this case, but the ABI got > > > > break. > > > > > > > > That's also why I was suggesting if the property contributes to the guest > > > > ABI, then AFAIU QEMU needs to: > > > > > > > > - Firstly, never quietly flipping any bit that affects the ABI... 
> > > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > > by default, while advanced users can opt-in on new features. We can't > > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > It may not be necessary the case that old features are supported by > > > every systems. In an extreme case, a user may migrate a VM from Linux to > > > Windows, which probably doesn't support any offloading at all. A more > > > convincing scenario is RSS offloading with eBPF; using eBPF requires a > > > privilege so we cannot assume it is always available even on the latest > > > version of Linux. > > > > I don't get why eBPF matters here. It is something that is not noticed > > by the guest and we have a fallback anyhow. > > > > > > > > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > > this host does not support running such VM specified, due to XXX > > > > feature missing". > > > > > > This is handled in: > > > > > > "virtio-net: Convert feature properties to OnOffAuto" > > > https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/ > > > > I may miss something but I think "Auto" doesn't make sense to libvirt. > > The point is libvirt can explicitly set "on" to avoid the "auto" behavior. > libvirt does not have to use the "auto" value. > > libvirt can still use "auto" if desired. virDomainNetDefParseXMLDriver() in > libvirt actually parses tristate values (libvirt uses "default" instead of > "auto" as the mnemonic) for these features though "default" is currently > disabled by the schema (src/conf/schemas/domaincommon.rng). Allowing user to > specify "default" is only a matter of editing the schema. Of course > specifying "default" will make the VM unsafe for migration. Isn't keeping the default AUTO the same as before when it used to be ON? 
I mean, AUTO in a qemu cmdline doesn't guarantee guest ABI either. Indeed it looks like it's a step forward to make ON have the clear semantics of "fail when unsupported". It's just that I am not sure how useful AUTO is here, because anyway we'll need to break ON semantics even with AUTO, so that an old QEMU script with USO=ON used to boot on old kernels but now it won't. What I was trying to say is whether we should make the default parameter migratable. IOW, it looks to me AUTO should deserve a migration blocker when chosen. After all, Libvirt hopefully shouldn't use AUTO at all but only ON/OFF, while any user not caring much about these perf details should always use OFF on any kernel-dependent features that may affect the guest ABI. Thanks,
On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote: > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > > > > In terms of launching QEMU I'd imagine: > > > > $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > > > > Any virtual machine HW features which are tied to host kernel features > > would have their defaults set based on the requested -platform. The > > -machine will be fully invariant wrt the host kernel. > > > > You would have -platform hlep to list available platforms, and > > corresonding QMP "query-platforms" command to list what platforms > > are supported on a given host OS. > > > > Downstream distros can provide their own platforms definitions > > (eg "linux-rhel-9.5") if they have kernels whose feature set > > diverges from upstream due to backports. > > > > Mgmt apps won't need to be taught about every single little QEMU > > setting whose default is derived from the kernel. Individual > > defaults are opaque and controlled by the requested platform. > > > > Live migration has clearly defined semantics, and mgmt app can > > use query-platforms to validate two hosts are compatible. > > > > Omitting -platform should pick the very latest platform that is > > cmpatible with the current host (not neccessarily the latest > > platform built-in to QEMU). > > This seems to add one more layer to maintain, and so far I don't know > whether it's a must. > > To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > thought it was mostly the case already, except some extremely rare > outliers. > > When we have one host that boots up a VM using: > > $QEMU1 $cmdline > > Then another host boots up: > > $QEMU2 $cmdline -incoming XXX > > Then migration should succeed if $cmdline is exactly the same, and the VM > can boot up all fine without errors on both sides. > > AFAICT this has nothing to do with what kernel is underneath, even not > Linux? I think either QEMU1 / QEMU2 has the option to fail. 
But if it > didn't, I thought the ABI should be guaranteed. We've got two mutually conflicting goals with the machine type definitions. Primarily we use them to ensure stable ABI, but an important secondary goal is to enable new tunables to have new defaults set, without having to update every mgmt app. The latter works very well when the defaults have no dependancy on the platform kernel/OS, but breaks migration when they do have a platform dependancy. > - Firstly, never quietly flipping any bit that affects the ABI... > > - Have a default value of off, then QEMU will always allow the VM to boot > by default, while advanced users can opt-in on new features. We can't > make this ON by default otherwise some VMs can already fail to boot, > > - If the host doesn't support the feature while the cmdline enabled it, > it needs to fail QEMU boot rather than flipping, so that it says "hey, > this host does not support running such VM specified, due to XXX > feature missing". > > That's the only way an user could understand what happened, and IMHO that's > a clean way that we stick with QEMU cmdline on defining the guest ABI, > while in which the machine type is the fundation of such definition, as the > machine type can decides many of the rest compat properties. And that's > the whole point of the compat properties too (to make sure the guest ABI is > stable). > > If kernel breaks it easily, all compat property things that we maintain can > already stop making sense in general, because it didn't define the whole > guest ABI.. > > So AFAIU that's really what we used for years, I hope I didn't overlook > somehting. And maybe we don't yet need the "-platform" layer if we can > keep up with this rule? We've failed at this for years wrt enabling use of new defaults that have a platform depedancy, so historical practice isn't a good reference. 
There are 100's (possibly 1000's) of tunables set implicitly as part of the machine type, and of those, libvirt likely only exposes a few 10's of tunables. The vast majority are low level details that no mgmt app wants to know about, they just want to accept QEMU's new defaults, while preserving machine ABI. This is a good thing. No one wants the burden of wiring up every single tunable into libvirt and mgmt apps. This is what the "-platform" concept would be intended to preserve. It would allow a way to enable groups of settings that have a platform level dependency, without ever having to teach either libvirt or the mgmt apps about the individual tunables. With regards, Daniel
On 2024/07/29 23:29, Peter Xu wrote: > On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: >> On 2024/07/29 12:50, Jason Wang wrote: >>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2024/07/27 5:47, Peter Xu wrote: >>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: >>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: >>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: >>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: >>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>>>>>>>>>>> USO features of virtio-net device depend on kernel ability >>>>>>>>>>>> to support them, for backward compatibility by default the >>>>>>>>>>>> features are disabled on 8.0 and earlier. >>>>>>>>>>>> >>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >>>>>>>>>>> >>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has >>>>>>>>>>> USO supported, to another host that doesn't.. >>>>>>>>>> >>>>>>>>>> This was always the case with all offloads. The answer at the moment is, >>>>>>>>>> don't do this. >>>>>>>>> >>>>>>>>> May I ask for my understanding: >>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU >>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate >>>>>>>>> between machines that have different host kernel features? >>>>>>>>> >>>>>>>>>> Long term, we need to start exposing management APIs >>>>>>>>>> to discover this, and management has to disable unsupported features. >>>>>>>>> >>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. 
>>>>>>>> >>>>>>>> When QEMU automatically toggles machine type featuers based on host >>>>>>>> kernel, relying on libvirt to then disable them again is impractical, >>>>>>>> as we cannot assume that the libvirt people are using knows about >>>>>>>> newly introduced features. Even if libvirt is updated to know about >>>>>>>> it, people can easily be using a previous libvirt release. >>>>>>>> >>>>>>>> QEMU itself needs to make the machine types do that they are there >>>>>>>> todo, which is to define a stable machine ABI. >>>>>>>> >>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode >>>>>>>> sets of features which are tied to specific platform generations. >>>>>>>> As long as we don't have that we'll keep having these broken >>>>>>>> migration problems from machine types dynamically changing instead >>>>>>>> of providing a stable guest ABI. >>>>>>> >>>>>>> Any more elaboration on this idea? Would it be easily feasible in >>>>>>> implementation? >>>>>> >>>>>> In terms of launching QEMU I'd imagine: >>>>>> >>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... >>>>>> >>>>>> Any virtual machine HW features which are tied to host kernel features >>>>>> would have their defaults set based on the requested -platform. The >>>>>> -machine will be fully invariant wrt the host kernel. >>>>>> >>>>>> You would have -platform hlep to list available platforms, and >>>>>> corresonding QMP "query-platforms" command to list what platforms >>>>>> are supported on a given host OS. >>>>>> >>>>>> Downstream distros can provide their own platforms definitions >>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set >>>>>> diverges from upstream due to backports. >>>>>> >>>>>> Mgmt apps won't need to be taught about every single little QEMU >>>>>> setting whose default is derived from the kernel. Individual >>>>>> defaults are opaque and controlled by the requested platform. 
>>>>>> >>>>>> Live migration has clearly defined semantics, and mgmt app can >>>>>> use query-platforms to validate two hosts are compatible. >>>>>> >>>>>> Omitting -platform should pick the very latest platform that is >>>>>> cmpatible with the current host (not neccessarily the latest >>>>>> platform built-in to QEMU). >>>>> >>>>> This seems to add one more layer to maintain, and so far I don't know >>>>> whether it's a must. >>>>> >>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I >>>>> thought it was mostly the case already, except some extremely rare >>>>> outliers. >>>>> >>>>> When we have one host that boots up a VM using: >>>>> >>>>> $QEMU1 $cmdline >>>>> >>>>> Then another host boots up: >>>>> >>>>> $QEMU2 $cmdline -incoming XXX >>>>> >>>>> Then migration should succeed if $cmdline is exactly the same, and the VM >>>>> can boot up all fine without errors on both sides. >>>>> >>>>> AFAICT this has nothing to do with what kernel is underneath, even not >>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it >>>>> didn't, I thought the ABI should be guaranteed. >>>>> >>>>> That's why I think this is a migration violation, as 99.99% of other device >>>>> properties should be following this rule. The issue here is, we have the >>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got >>>>> break. >>>>> >>>>> That's also why I was suggesting if the property contributes to the guest >>>>> ABI, then AFAIU QEMU needs to: >>>>> >>>>> - Firstly, never quietly flipping any bit that affects the ABI... >>>>> >>>>> - Have a default value of off, then QEMU will always allow the VM to boot >>>>> by default, while advanced users can opt-in on new features. We can't >>>>> make this ON by default otherwise some VMs can already fail to boot, >>>> >>>> It may not be necessary the case that old features are supported by >>>> every systems. 
In an extreme case, a user may migrate a VM from Linux to >>>> Windows, which probably doesn't support any offloading at all. A more >>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a >>>> privilege so we cannot assume it is always available even on the latest >>>> version of Linux. >>> >>> I don't get why eBPF matters here. It is something that is not noticed >>> by the guest and we have a fallback anyhow. It is noticeable for the guest, and the fallback is not effective with vhost. Enabling RSS by default will result in a similar problem although it is older than USO. >>> >>>> >>>>> >>>>> - If the host doesn't support the feature while the cmdline enabled it, >>>>> it needs to fail QEMU boot rather than flipping, so that it says "hey, >>>>> this host does not support running such VM specified, due to XXX >>>>> feature missing". >>>> >>>> This is handled in: >>>> >>>> "virtio-net: Convert feature properties to OnOffAuto" >>>> https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/ >>> >>> I may miss something but I think "Auto" doesn't make sense to libvirt. >> >> The point is libvirt can explicitly set "on" to avoid the "auto" behavior. >> libvirt does not have to use the "auto" value. >> >> libvirt can still use "auto" if desired. virDomainNetDefParseXMLDriver() in >> libvirt actually parses tristate values (libvirt uses "default" instead of >> "auto" as the mnemonic) for these features though "default" is currently >> disabled by the schema (src/conf/schemas/domaincommon.rng). Allowing user to >> specify "default" is only a matter of editing the schema. Of course >> specifying "default" will make the VM unsafe for migration. > > Isn't keeping the default AUTO the same as before when it used to be ON? I > mean, AUTO in a qemu cmdline doesn't guarantee guest API either. True. It only deals with the situation that "the host doesn't support the feature while the cmdline enabled it". 
> > Indeed it looks like it's a step forward to make ON having the clear > semantics of "fail when unsupported". It's just that I am not sure how > useful is AUTO here, because anyway we'll need to break ON semantics even > with AUTO, so that an old QEMU script with USO=ON used to boot on old > kernels but not it won't. > > What I was trying to say is whether we should make the default parameter to > be migratable. IOW, it looks to me AUTO should deserve a migration > blocker when chosen. > > After all, Libvirt hopefully shouldn't use AUTO at all but only ON/OFF, > while any user when not caring much on these perf details should always use > OFF on any kernel dependent features that may affect the guest ABI. Well, there should be libvirt users who care about performance and do not use migration, so it's better for them if libvirt can use auto. But the use of "auto" should be mutually exclusive with migration, of course. Regards, Akihiko Odaki
On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote: > > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > > > > > > In terms of launching QEMU I'd imagine: > > > > > > $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > > > > > > Any virtual machine HW features which are tied to host kernel features > > > would have their defaults set based on the requested -platform. The > > > -machine will be fully invariant wrt the host kernel. > > > > > > You would have -platform hlep to list available platforms, and > > > corresonding QMP "query-platforms" command to list what platforms > > > are supported on a given host OS. > > > > > > Downstream distros can provide their own platforms definitions > > > (eg "linux-rhel-9.5") if they have kernels whose feature set > > > diverges from upstream due to backports. > > > > > > Mgmt apps won't need to be taught about every single little QEMU > > > setting whose default is derived from the kernel. Individual > > > defaults are opaque and controlled by the requested platform. > > > > > > Live migration has clearly defined semantics, and mgmt app can > > > use query-platforms to validate two hosts are compatible. > > > > > > Omitting -platform should pick the very latest platform that is > > > cmpatible with the current host (not neccessarily the latest > > > platform built-in to QEMU). > > > > This seems to add one more layer to maintain, and so far I don't know > > whether it's a must. > > > > To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > > thought it was mostly the case already, except some extremely rare > > outliers. 
> > > > When we have one host that boots up a VM using: > > > > $QEMU1 $cmdline > > > > Then another host boots up: > > > > $QEMU2 $cmdline -incoming XXX > > > > Then migration should succeed if $cmdline is exactly the same, and the VM > > can boot up all fine without errors on both sides. > > > > AFAICT this has nothing to do with what kernel is underneath, even not > > Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > > didn't, I thought the ABI should be guaranteed. > > We've got two mutually conflicting goals with the machine type > definitions. > > Primarily we use them to ensure stable ABI, but an important > secondary goal is to enable new tunables to have new defaults > set, without having to update every mgmt app. The latter > works very well when the defaults have no dependancy on the > platform kernel/OS, but breaks migration when they do have a > platform dependancy. > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > by default, while advanced users can opt-in on new features. We can't > > make this ON by default otherwise some VMs can already fail to boot, > > > > - If the host doesn't support the feature while the cmdline enabled it, > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > this host does not support running such VM specified, due to XXX > > feature missing". > > > > That's the only way an user could understand what happened, and IMHO that's > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > while in which the machine type is the fundation of such definition, as the > > machine type can decides many of the rest compat properties. And that's > > the whole point of the compat properties too (to make sure the guest ABI is > > stable). 
> > > > If kernel breaks it easily, all compat property things that we maintain can > > already stop making sense in general, because it didn't define the whole > > guest ABI.. > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > somehting. And maybe we don't yet need the "-platform" layer if we can > > keep up with this rule? > > We've failed at this for years wrt enabling use of new defaults that have > a platform depedancy, so historical practice isn't a good reference. > > There are 100's (possibly 1000's) of tunables set implicitly as part of > the machine type, and of those, libvirt likely only exposes a few 10's > of tunables. The vast majority are low level details that no mgmt app > wants to know about, they just want to accept QEMU's new defaults, > while preserving machine ABI. This is a good thing. No one wants the > burden of wiring up every single tunable into libvirt and mgmt apps. > > This is what the "-platform" concept would be intended to preserve. It > would allow a way to enable groups of settings that have a platform level > dependancy, without ever having to teach either libvirt or the mgmt apps > about the individual tunables. Do you think we can achieve similar goal by simply turning the feature to ON only after a few QEMU releases? I also mentioned that idea below. https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n So far it really sounds like the right thing to do to me to fix all similar issues, even without introducing anything new we need to maintain. To put that again, what we need to do is this: - To start: we should NEVER turn any guest ABI relevant bits automatically by QEMU, for sure.. - When introducing any new device feature that may both (1) affects guest ABI, and (2) depends on host kernel features, we set those default values to OFF always at start. So this already covers old machine types, no compat property needed so far. 
- We always fail hard on QEMU boot whenever we detect that such a property is not supported by the current host when set to ON (and since it's OFF by default it must be that the user specified that ON). - When, after a stabilized period of time for that new feature to land in most kernels (we may consider looking at how major Linux distros update the kernel versions), we're pretty sure the new feature should be available to most modern QEMU users, we add a patch to make the property default ON on the new machine type, and add a compat property for old machines. The last bullet also means we'll start to fail new machine types from booting when running that very new QEMU on a very old kernel, but that's the trade-off, and when doing it right on "stabilizing the feature in the kernel world", it should really be a corner case. The user should simply invoke an old machine type on that old kernel, even if the qemu is new. Thanks,
On 2024/07/30 0:58, Daniel P. Berrangé wrote: > On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote: >> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >>> >>> In terms of launching QEMU I'd imagine: >>> >>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... >>> >>> Any virtual machine HW features which are tied to host kernel features >>> would have their defaults set based on the requested -platform. The >>> -machine will be fully invariant wrt the host kernel. >>> >>> You would have -platform hlep to list available platforms, and >>> corresonding QMP "query-platforms" command to list what platforms >>> are supported on a given host OS. >>> >>> Downstream distros can provide their own platforms definitions >>> (eg "linux-rhel-9.5") if they have kernels whose feature set >>> diverges from upstream due to backports. >>> >>> Mgmt apps won't need to be taught about every single little QEMU >>> setting whose default is derived from the kernel. Individual >>> defaults are opaque and controlled by the requested platform. >>> >>> Live migration has clearly defined semantics, and mgmt app can >>> use query-platforms to validate two hosts are compatible. >>> >>> Omitting -platform should pick the very latest platform that is >>> cmpatible with the current host (not neccessarily the latest >>> platform built-in to QEMU). >> >> This seems to add one more layer to maintain, and so far I don't know >> whether it's a must. >> >> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I >> thought it was mostly the case already, except some extremely rare >> outliers. >> >> When we have one host that boots up a VM using: >> >> $QEMU1 $cmdline >> >> Then another host boots up: >> >> $QEMU2 $cmdline -incoming XXX >> >> Then migration should succeed if $cmdline is exactly the same, and the VM >> can boot up all fine without errors on both sides. >> >> AFAICT this has nothing to do with what kernel is underneath, even not >> Linux? 
I think either QEMU1 / QEMU2 has the option to fail. But if it >> didn't, I thought the ABI should be guaranteed. > > We've got two mutually conflicting goals with the machine type > definitions. > > Primarily we use them to ensure stable ABI, but an important > secondary goal is to enable new tunables to have new defaults > set, without having to update every mgmt app. The latter > works very well when the defaults have no dependancy on the > platform kernel/OS, but breaks migration when they do have a > platform dependancy. > >> - Firstly, never quietly flipping any bit that affects the ABI... >> >> - Have a default value of off, then QEMU will always allow the VM to boot >> by default, while advanced users can opt-in on new features. We can't >> make this ON by default otherwise some VMs can already fail to boot, >> >> - If the host doesn't support the feature while the cmdline enabled it, >> it needs to fail QEMU boot rather than flipping, so that it says "hey, >> this host does not support running such VM specified, due to XXX >> feature missing". >> >> That's the only way an user could understand what happened, and IMHO that's >> a clean way that we stick with QEMU cmdline on defining the guest ABI, >> while in which the machine type is the fundation of such definition, as the >> machine type can decides many of the rest compat properties. And that's >> the whole point of the compat properties too (to make sure the guest ABI is >> stable). >> >> If kernel breaks it easily, all compat property things that we maintain can >> already stop making sense in general, because it didn't define the whole >> guest ABI.. >> >> So AFAIU that's really what we used for years, I hope I didn't overlook >> somehting. And maybe we don't yet need the "-platform" layer if we can >> keep up with this rule? > > We've failed at this for years wrt enabling use of new defaults that have > a platform depedancy, so historical practice isn't a good reference. 
> > There are 100's (possibly 1000's) of tunables set implicitly as part of > the machine type, and of those, libvirt likely only exposes a few 10's > of tunables. The vast majority are low level details that no mgmt app > wants to know about, they just want to accept QEMU's new defaults, > while preserving machine ABI. This is a good thing. No one wants the > burden of wiring up every single tunable into libvirt and mgmt apps. > > This is what the "-platform" concept would be intended to preserve. It > would allow a way to enable groups of settings that have a platform level > dependancy, without ever having to teach either libvirt or the mgmt apps > about the individual tunables. The concept of -platform will certainly reduce the number of tunables, but I'm a bit worried that such platform definitions can still have too much variety. The variety of kernel is one; a downstream distro can have linux-rhel-9.5 or something as you suggested, but it is still a chore. Some features like eBPF may need privilege. Others may depend on hardware features. I think it is simpler to analyze the platform dependency and dump it for the management layer. For example, libvirt can request QEMU to analyze the platform dependency when it creates a new domain. QEMU will then figure out that the host kernel is capable of USO and bake it as a platform dependency. Regards, Akihiko Odaki
On 2024/07/30 2:00, Peter Xu wrote: > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: >> On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote: >>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >>>> >>>> In terms of launching QEMU I'd imagine: >>>> >>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... >>>> >>>> Any virtual machine HW features which are tied to host kernel features >>>> would have their defaults set based on the requested -platform. The >>>> -machine will be fully invariant wrt the host kernel. >>>> >>>> You would have -platform hlep to list available platforms, and >>>> corresonding QMP "query-platforms" command to list what platforms >>>> are supported on a given host OS. >>>> >>>> Downstream distros can provide their own platforms definitions >>>> (eg "linux-rhel-9.5") if they have kernels whose feature set >>>> diverges from upstream due to backports. >>>> >>>> Mgmt apps won't need to be taught about every single little QEMU >>>> setting whose default is derived from the kernel. Individual >>>> defaults are opaque and controlled by the requested platform. >>>> >>>> Live migration has clearly defined semantics, and mgmt app can >>>> use query-platforms to validate two hosts are compatible. >>>> >>>> Omitting -platform should pick the very latest platform that is >>>> cmpatible with the current host (not neccessarily the latest >>>> platform built-in to QEMU). >>> >>> This seems to add one more layer to maintain, and so far I don't know >>> whether it's a must. >>> >>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I >>> thought it was mostly the case already, except some extremely rare >>> outliers. 
>>> >>> When we have one host that boots up a VM using: >>> >>> $QEMU1 $cmdline >>> >>> Then another host boots up: >>> >>> $QEMU2 $cmdline -incoming XXX >>> >>> Then migration should succeed if $cmdline is exactly the same, and the VM >>> can boot up all fine without errors on both sides. >>> >>> AFAICT this has nothing to do with what kernel is underneath, even not >>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it >>> didn't, I thought the ABI should be guaranteed. >> >> We've got two mutually conflicting goals with the machine type >> definitions. >> >> Primarily we use them to ensure stable ABI, but an important >> secondary goal is to enable new tunables to have new defaults >> set, without having to update every mgmt app. The latter >> works very well when the defaults have no dependancy on the >> platform kernel/OS, but breaks migration when they do have a >> platform dependancy. >> >>> - Firstly, never quietly flipping any bit that affects the ABI... >>> >>> - Have a default value of off, then QEMU will always allow the VM to boot >>> by default, while advanced users can opt-in on new features. We can't >>> make this ON by default otherwise some VMs can already fail to boot, >>> >>> - If the host doesn't support the feature while the cmdline enabled it, >>> it needs to fail QEMU boot rather than flipping, so that it says "hey, >>> this host does not support running such VM specified, due to XXX >>> feature missing". >>> >>> That's the only way an user could understand what happened, and IMHO that's >>> a clean way that we stick with QEMU cmdline on defining the guest ABI, >>> while in which the machine type is the fundation of such definition, as the >>> machine type can decides many of the rest compat properties. And that's >>> the whole point of the compat properties too (to make sure the guest ABI is >>> stable). 
>>> >>> If kernel breaks it easily, all compat property things that we maintain can >>> already stop making sense in general, because it didn't define the whole >>> guest ABI.. >>> >>> So AFAIU that's really what we used for years, I hope I didn't overlook >>> somehting. And maybe we don't yet need the "-platform" layer if we can >>> keep up with this rule? >> >> We've failed at this for years wrt enabling use of new defaults that have >> a platform depedancy, so historical practice isn't a good reference. >> >> There are 100's (possibly 1000's) of tunables set implicitly as part of >> the machine type, and of those, libvirt likely only exposes a few 10's >> of tunables. The vast majority are low level details that no mgmt app >> wants to know about, they just want to accept QEMU's new defaults, >> while preserving machine ABI. This is a good thing. No one wants the >> burden of wiring up every single tunable into libvirt and mgmt apps. >> >> This is what the "-platform" concept would be intended to preserve. It >> would allow a way to enable groups of settings that have a platform level >> dependancy, without ever having to teach either libvirt or the mgmt apps >> about the individual tunables. > > Do you think we can achieve similar goal by simply turning the feature to > ON only after a few QEMU releases? I also mentioned that idea below. > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > So far it really sounds like the right thing to do to me to fix all similar > issues, even without introducing anything new we need to maintain. > > To put that again, what we need to do is this: > > - To start: we should NEVER turn any guest ABI relevant bits > automatically by QEMU, for sure.. > > - When introducing any new device feature that may both (1) affects guest > ABI, and (2) depends on host kernel features, we set those default > values to OFF always at start. So this already covers old machine > types, no compat property needed so far. 
> > - We always fail hard on QEMU boot whenever we detected such property is > not supported by the current host when with ON (and since it's OFF by > default it must be that the user specified that ON). > > - When after a stablized period of time for that new feature to land most > kernels (we may consider to look at how major Linux distros updates the > kernel versions) when we're pretty sure the new feature should be > available on most of the QEMU modern users, we add a patch to make the > property default ON on the new machine type, add a compat property for > old machines. > > The last bullet also means we'll start to fail new machine type from > booting when running that very new QEMU on a very old kernel, but that's > the trade-off, and when doing it right on "stablizing the feature in the > kernel world", it should really be corner case. The user should simply > invoke an old machine type on that old kernel, even if the qemu is new. docs/about/build-platforms.rst already defines supported platforms. One of the supported platforms is Debian 11 (bullseye), and it carries Linux 5.10, which was released December 2020. If we follow this platform support, a new feature added to upstream Linux may take about 4 years before it gets enabled by default on QEMU. As an upstream developer, I feel it is too long, but I'm sure there are different opinions from different perspectives. Regards, Akihiko Odaki
On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > > > > We've got two mutually conflicting goals with the machine type > > definitions. > > > > Primarily we use them to ensure stable ABI, but an important > > secondary goal is to enable new tunables to have new defaults > > set, without having to update every mgmt app. The latter > > works very well when the defaults have no dependancy on the > > platform kernel/OS, but breaks migration when they do have a > > platform dependancy. > > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > by default, while advanced users can opt-in on new features. We can't > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > this host does not support running such VM specified, due to XXX > > > feature missing". > > > > > > That's the only way an user could understand what happened, and IMHO that's > > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > > while in which the machine type is the fundation of such definition, as the > > > machine type can decides many of the rest compat properties. And that's > > > the whole point of the compat properties too (to make sure the guest ABI is > > > stable). > > > > > > If kernel breaks it easily, all compat property things that we maintain can > > > already stop making sense in general, because it didn't define the whole > > > guest ABI.. > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > > somehting. And maybe we don't yet need the "-platform" layer if we can > > > keep up with this rule? 
> > > > We've failed at this for years wrt enabling use of new defaults that have > > a platform depedancy, so historical practice isn't a good reference. > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of > > the machine type, and of those, libvirt likely only exposes a few 10's > > of tunables. The vast majority are low level details that no mgmt app > > wants to know about, they just want to accept QEMU's new defaults, > > while preserving machine ABI. This is a good thing. No one wants the > > burden of wiring up every single tunable into libvirt and mgmt apps. > > > > This is what the "-platform" concept would be intended to preserve. It > > would allow a way to enable groups of settings that have a platform level > > dependancy, without ever having to teach either libvirt or the mgmt apps > > about the individual tunables. > > Do you think we can achieve similar goal by simply turning the feature to > ON only after a few QEMU releases? I also mentioned that idea below. > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > So far it really sounds like the right thing to do to me to fix all similar > issues, even without introducing anything new we need to maintain. Turning a feature with a platform dependency to "on" implies that the machine type will cease to work out of the box for platforms which lack the feature. IMHO that's not acceptable behaviour for any of our supported platforms. IOW, "after a few QEMU releases" implies a delay of as much as 5 years, while we wait for platforms which don't support the feature to drop out of our supported targets list. I don't think that'll satisfy the desire to get the new feature available to users as soon as practical for their particular platform. > > To put that again, what we need to do is this: > > - To start: we should NEVER turn any guest ABI relevant bits > automatically by QEMU, for sure.. 
> > - When introducing any new device feature that may both (1) affects guest > ABI, and (2) depends on host kernel features, we set those default > values to OFF always at start. So this already covers old machine > types, no compat property needed so far. > > - We always fail hard on QEMU boot whenever we detected such property is > not supported by the current host when with ON (and since it's OFF by > default it must be that the user specified that ON). > > - When after a stablized period of time for that new feature to land most > kernels (we may consider to look at how major Linux distros updates the > kernel versions) when we're pretty sure the new feature should be > available on most of the QEMU modern users, we add a patch to make the > property default ON on the new machine type, add a compat property for > old machines. Our supported platform list determines when this will be, and given our current criteria, this can be as long as 5 years. With regards, Daniel
On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2024/07/29 23:29, Peter Xu wrote: > > On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: > >> On 2024/07/29 12:50, Jason Wang wrote: > >>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2024/07/27 5:47, Peter Xu wrote: > >>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > >>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > >>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > >>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > >>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: > >>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > >>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > >>>>>>>>>>>> USO features of virtio-net device depend on kernel ability > >>>>>>>>>>>> to support them, for backward compatibility by default the > >>>>>>>>>>>> features are disabled on 8.0 and earlier. > >>>>>>>>>>>> > >>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > >>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > >>>>>>>>>>> > >>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has > >>>>>>>>>>> USO supported, to another host that doesn't.. > >>>>>>>>>> > >>>>>>>>>> This was always the case with all offloads. The answer at the moment is, > >>>>>>>>>> don't do this. > >>>>>>>>> > >>>>>>>>> May I ask for my understanding: > >>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU > >>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate > >>>>>>>>> between machines that have different host kernel features? > >>>>>>>>> > >>>>>>>>>> Long term, we need to start exposing management APIs > >>>>>>>>>> to discover this, and management has to disable unsupported features. 
> >>>>>>>>> > >>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. > >>>>>>>> > >>>>>>>> When QEMU automatically toggles machine type featuers based on host > >>>>>>>> kernel, relying on libvirt to then disable them again is impractical, > >>>>>>>> as we cannot assume that the libvirt people are using knows about > >>>>>>>> newly introduced features. Even if libvirt is updated to know about > >>>>>>>> it, people can easily be using a previous libvirt release. > >>>>>>>> > >>>>>>>> QEMU itself needs to make the machine types do that they are there > >>>>>>>> todo, which is to define a stable machine ABI. > >>>>>>>> > >>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode > >>>>>>>> sets of features which are tied to specific platform generations. > >>>>>>>> As long as we don't have that we'll keep having these broken > >>>>>>>> migration problems from machine types dynamically changing instead > >>>>>>>> of providing a stable guest ABI. > >>>>>>> > >>>>>>> Any more elaboration on this idea? Would it be easily feasible in > >>>>>>> implementation? > >>>>>> > >>>>>> In terms of launching QEMU I'd imagine: > >>>>>> > >>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > >>>>>> > >>>>>> Any virtual machine HW features which are tied to host kernel features > >>>>>> would have their defaults set based on the requested -platform. The > >>>>>> -machine will be fully invariant wrt the host kernel. > >>>>>> > >>>>>> You would have -platform hlep to list available platforms, and > >>>>>> corresonding QMP "query-platforms" command to list what platforms > >>>>>> are supported on a given host OS. > >>>>>> > >>>>>> Downstream distros can provide their own platforms definitions > >>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set > >>>>>> diverges from upstream due to backports. 
> >>>>>> > >>>>>> Mgmt apps won't need to be taught about every single little QEMU > >>>>>> setting whose default is derived from the kernel. Individual > >>>>>> defaults are opaque and controlled by the requested platform. > >>>>>> > >>>>>> Live migration has clearly defined semantics, and mgmt app can > >>>>>> use query-platforms to validate two hosts are compatible. > >>>>>> > >>>>>> Omitting -platform should pick the very latest platform that is > >>>>>> cmpatible with the current host (not neccessarily the latest > >>>>>> platform built-in to QEMU). > >>>>> > >>>>> This seems to add one more layer to maintain, and so far I don't know > >>>>> whether it's a must. > >>>>> > >>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > >>>>> thought it was mostly the case already, except some extremely rare > >>>>> outliers. > >>>>> > >>>>> When we have one host that boots up a VM using: > >>>>> > >>>>> $QEMU1 $cmdline > >>>>> > >>>>> Then another host boots up: > >>>>> > >>>>> $QEMU2 $cmdline -incoming XXX > >>>>> > >>>>> Then migration should succeed if $cmdline is exactly the same, and the VM > >>>>> can boot up all fine without errors on both sides. > >>>>> > >>>>> AFAICT this has nothing to do with what kernel is underneath, even not > >>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > >>>>> didn't, I thought the ABI should be guaranteed. > >>>>> > >>>>> That's why I think this is a migration violation, as 99.99% of other device > >>>>> properties should be following this rule. The issue here is, we have the > >>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got > >>>>> break. > >>>>> > >>>>> That's also why I was suggesting if the property contributes to the guest > >>>>> ABI, then AFAIU QEMU needs to: > >>>>> > >>>>> - Firstly, never quietly flipping any bit that affects the ABI... 
> >>>>> > >>>>> - Have a default value of off, then QEMU will always allow the VM to boot > >>>>> by default, while advanced users can opt-in on new features. We can't > >>>>> make this ON by default otherwise some VMs can already fail to boot, > >>>> > >>>> It may not be necessary the case that old features are supported by > >>>> every systems. In an extreme case, a user may migrate a VM from Linux to > >>>> Windows, which probably doesn't support any offloading at all. A more > >>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a > >>>> privilege so we cannot assume it is always available even on the latest > >>>> version of Linux. > >>> > >>> I don't get why eBPF matters here. It is something that is not noticed > >>> by the guest and we have a fallback anyhow. > > It is noticeable for the guest, and the fallback is not effective with > vhost. It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. Thanks
On 2024/07/30 11:04, Jason Wang wrote: > On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2024/07/29 23:29, Peter Xu wrote: >>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: >>>> On 2024/07/29 12:50, Jason Wang wrote: >>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2024/07/27 5:47, Peter Xu wrote: >>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: >>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: >>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: >>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: >>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability >>>>>>>>>>>>>> to support them, for backward compatibility by default the >>>>>>>>>>>>>> features are disabled on 8.0 and earlier. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >>>>>>>>>>>>> >>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has >>>>>>>>>>>>> USO supported, to another host that doesn't.. >>>>>>>>>>>> >>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, >>>>>>>>>>>> don't do this. >>>>>>>>>>> >>>>>>>>>>> May I ask for my understanding: >>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU >>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate >>>>>>>>>>> between machines that have different host kernel features? 
>>>>>>>>>>> >>>>>>>>>>>> Long term, we need to start exposing management APIs >>>>>>>>>>>> to discover this, and management has to disable unsupported features. >>>>>>>>>>> >>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. >>>>>>>>>> >>>>>>>>>> When QEMU automatically toggles machine type featuers based on host >>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, >>>>>>>>>> as we cannot assume that the libvirt people are using knows about >>>>>>>>>> newly introduced features. Even if libvirt is updated to know about >>>>>>>>>> it, people can easily be using a previous libvirt release. >>>>>>>>>> >>>>>>>>>> QEMU itself needs to make the machine types do that they are there >>>>>>>>>> todo, which is to define a stable machine ABI. >>>>>>>>>> >>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode >>>>>>>>>> sets of features which are tied to specific platform generations. >>>>>>>>>> As long as we don't have that we'll keep having these broken >>>>>>>>>> migration problems from machine types dynamically changing instead >>>>>>>>>> of providing a stable guest ABI. >>>>>>>>> >>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in >>>>>>>>> implementation? >>>>>>>> >>>>>>>> In terms of launching QEMU I'd imagine: >>>>>>>> >>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... >>>>>>>> >>>>>>>> Any virtual machine HW features which are tied to host kernel features >>>>>>>> would have their defaults set based on the requested -platform. The >>>>>>>> -machine will be fully invariant wrt the host kernel. >>>>>>>> >>>>>>>> You would have -platform hlep to list available platforms, and >>>>>>>> corresonding QMP "query-platforms" command to list what platforms >>>>>>>> are supported on a given host OS. 
>>>>>>>> >>>>>>>> Downstream distros can provide their own platforms definitions >>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set >>>>>>>> diverges from upstream due to backports. >>>>>>>> >>>>>>>> Mgmt apps won't need to be taught about every single little QEMU >>>>>>>> setting whose default is derived from the kernel. Individual >>>>>>>> defaults are opaque and controlled by the requested platform. >>>>>>>> >>>>>>>> Live migration has clearly defined semantics, and mgmt app can >>>>>>>> use query-platforms to validate two hosts are compatible. >>>>>>>> >>>>>>>> Omitting -platform should pick the very latest platform that is >>>>>>>> cmpatible with the current host (not neccessarily the latest >>>>>>>> platform built-in to QEMU). >>>>>>> >>>>>>> This seems to add one more layer to maintain, and so far I don't know >>>>>>> whether it's a must. >>>>>>> >>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I >>>>>>> thought it was mostly the case already, except some extremely rare >>>>>>> outliers. >>>>>>> >>>>>>> When we have one host that boots up a VM using: >>>>>>> >>>>>>> $QEMU1 $cmdline >>>>>>> >>>>>>> Then another host boots up: >>>>>>> >>>>>>> $QEMU2 $cmdline -incoming XXX >>>>>>> >>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM >>>>>>> can boot up all fine without errors on both sides. >>>>>>> >>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not >>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it >>>>>>> didn't, I thought the ABI should be guaranteed. >>>>>>> >>>>>>> That's why I think this is a migration violation, as 99.99% of other device >>>>>>> properties should be following this rule. The issue here is, we have the >>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got >>>>>>> break. 
>>>>>>> >>>>>>> That's also why I was suggesting if the property contributes to the guest >>>>>>> ABI, then AFAIU QEMU needs to: >>>>>>> >>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... >>>>>>> >>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot >>>>>>> by default, while advanced users can opt-in on new features. We can't >>>>>>> make this ON by default otherwise some VMs can already fail to boot, >>>>>> >>>>>> It may not be necessary the case that old features are supported by >>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to >>>>>> Windows, which probably doesn't support any offloading at all. A more >>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a >>>>>> privilege so we cannot assume it is always available even on the latest >>>>>> version of Linux. >>>>> >>>>> I don't get why eBPF matters here. It is something that is not noticed >>>>> by the guest and we have a fallback anyhow. >> >> It is noticeable for the guest, and the fallback is not effective with >> vhost. > > It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. We can certainly fallback to in-QEMU RSS by disabling vhost, but I would not say lack of such fallback is a bug. We don't provide in-QEMU fallback for other offloads. Regards, Akihiko Odaki
On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2024/07/30 11:04, Jason Wang wrote: > > On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2024/07/29 23:29, Peter Xu wrote: > >>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: > >>>> On 2024/07/29 12:50, Jason Wang wrote: > >>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2024/07/27 5:47, Peter Xu wrote: > >>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > >>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > >>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > >>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: > >>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > >>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > >>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability > >>>>>>>>>>>>>> to support them, for backward compatibility by default the > >>>>>>>>>>>>>> features are disabled on 8.0 and earlier. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > >>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > >>>>>>>>>>>>> > >>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has > >>>>>>>>>>>>> USO supported, to another host that doesn't.. > >>>>>>>>>>>> > >>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, > >>>>>>>>>>>> don't do this. 
> >>>>>>>>>>> > >>>>>>>>>>> May I ask for my understanding: > >>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU > >>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate > >>>>>>>>>>> between machines that have different host kernel features? > >>>>>>>>>>> > >>>>>>>>>>>> Long term, we need to start exposing management APIs > >>>>>>>>>>>> to discover this, and management has to disable unsupported features. > >>>>>>>>>>> > >>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. > >>>>>>>>>> > >>>>>>>>>> When QEMU automatically toggles machine type featuers based on host > >>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, > >>>>>>>>>> as we cannot assume that the libvirt people are using knows about > >>>>>>>>>> newly introduced features. Even if libvirt is updated to know about > >>>>>>>>>> it, people can easily be using a previous libvirt release. > >>>>>>>>>> > >>>>>>>>>> QEMU itself needs to make the machine types do that they are there > >>>>>>>>>> todo, which is to define a stable machine ABI. > >>>>>>>>>> > >>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode > >>>>>>>>>> sets of features which are tied to specific platform generations. > >>>>>>>>>> As long as we don't have that we'll keep having these broken > >>>>>>>>>> migration problems from machine types dynamically changing instead > >>>>>>>>>> of providing a stable guest ABI. > >>>>>>>>> > >>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in > >>>>>>>>> implementation? > >>>>>>>> > >>>>>>>> In terms of launching QEMU I'd imagine: > >>>>>>>> > >>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > >>>>>>>> > >>>>>>>> Any virtual machine HW features which are tied to host kernel features > >>>>>>>> would have their defaults set based on the requested -platform. 
The > >>>>>>>> -machine will be fully invariant wrt the host kernel. > >>>>>>>> > >>>>>>>> You would have -platform hlep to list available platforms, and > >>>>>>>> corresonding QMP "query-platforms" command to list what platforms > >>>>>>>> are supported on a given host OS. > >>>>>>>> > >>>>>>>> Downstream distros can provide their own platforms definitions > >>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set > >>>>>>>> diverges from upstream due to backports. > >>>>>>>> > >>>>>>>> Mgmt apps won't need to be taught about every single little QEMU > >>>>>>>> setting whose default is derived from the kernel. Individual > >>>>>>>> defaults are opaque and controlled by the requested platform. > >>>>>>>> > >>>>>>>> Live migration has clearly defined semantics, and mgmt app can > >>>>>>>> use query-platforms to validate two hosts are compatible. > >>>>>>>> > >>>>>>>> Omitting -platform should pick the very latest platform that is > >>>>>>>> cmpatible with the current host (not neccessarily the latest > >>>>>>>> platform built-in to QEMU). > >>>>>>> > >>>>>>> This seems to add one more layer to maintain, and so far I don't know > >>>>>>> whether it's a must. > >>>>>>> > >>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > >>>>>>> thought it was mostly the case already, except some extremely rare > >>>>>>> outliers. > >>>>>>> > >>>>>>> When we have one host that boots up a VM using: > >>>>>>> > >>>>>>> $QEMU1 $cmdline > >>>>>>> > >>>>>>> Then another host boots up: > >>>>>>> > >>>>>>> $QEMU2 $cmdline -incoming XXX > >>>>>>> > >>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM > >>>>>>> can boot up all fine without errors on both sides. > >>>>>>> > >>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not > >>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > >>>>>>> didn't, I thought the ABI should be guaranteed. 
> >>>>>>> > >>>>>>> That's why I think this is a migration violation, as 99.99% of other device > >>>>>>> properties should be following this rule. The issue here is, we have the > >>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got > >>>>>>> break. > >>>>>>> > >>>>>>> That's also why I was suggesting if the property contributes to the guest > >>>>>>> ABI, then AFAIU QEMU needs to: > >>>>>>> > >>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... > >>>>>>> > >>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot > >>>>>>> by default, while advanced users can opt-in on new features. We can't > >>>>>>> make this ON by default otherwise some VMs can already fail to boot, > >>>>>> > >>>>>> It may not be necessary the case that old features are supported by > >>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to > >>>>>> Windows, which probably doesn't support any offloading at all. A more > >>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a > >>>>>> privilege so we cannot assume it is always available even on the latest > >>>>>> version of Linux. > >>>>> > >>>>> I don't get why eBPF matters here. It is something that is not noticed > >>>>> by the guest and we have a fallback anyhow. > >> > >> It is noticeable for the guest, and the fallback is not effective with > >> vhost. > > > > It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. > > We can certainly fallback to in-QEMU RSS by disabling vhost, but I would > not say lack of such fallback is a bug. Such fallback is by design since the introduction of vhost. > We don't provide in-QEMU > fallback for other offloads. Yes but what I want to say is that eBPF RSS is different from those segmentation offloads. And technically, Qemu can do fallback for offloads (as RSC did). Thanks > > Regards, > Akihiko Odaki >
On 2024/07/30 12:03, Jason Wang wrote: > On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2024/07/30 11:04, Jason Wang wrote: >>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2024/07/29 23:29, Peter Xu wrote: >>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: >>>>>> On 2024/07/29 12:50, Jason Wang wrote: >>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>> >>>>>>>> On 2024/07/27 5:47, Peter Xu wrote: >>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: >>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: >>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: >>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: >>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability >>>>>>>>>>>>>>>> to support them, for backward compatibility by default the >>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has >>>>>>>>>>>>>>> USO supported, to another host that doesn't.. >>>>>>>>>>>>>> >>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, >>>>>>>>>>>>>> don't do this. 
>>>>>>>>>>>>> >>>>>>>>>>>>> May I ask for my understanding: >>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU >>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate >>>>>>>>>>>>> between machines that have different host kernel features? >>>>>>>>>>>>> >>>>>>>>>>>>>> Long term, we need to start exposing management APIs >>>>>>>>>>>>>> to discover this, and management has to disable unsupported features. >>>>>>>>>>>>> >>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. >>>>>>>>>>>> >>>>>>>>>>>> When QEMU automatically toggles machine type featuers based on host >>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, >>>>>>>>>>>> as we cannot assume that the libvirt people are using knows about >>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about >>>>>>>>>>>> it, people can easily be using a previous libvirt release. >>>>>>>>>>>> >>>>>>>>>>>> QEMU itself needs to make the machine types do that they are there >>>>>>>>>>>> todo, which is to define a stable machine ABI. >>>>>>>>>>>> >>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode >>>>>>>>>>>> sets of features which are tied to specific platform generations. >>>>>>>>>>>> As long as we don't have that we'll keep having these broken >>>>>>>>>>>> migration problems from machine types dynamically changing instead >>>>>>>>>>>> of providing a stable guest ABI. >>>>>>>>>>> >>>>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in >>>>>>>>>>> implementation? >>>>>>>>>> >>>>>>>>>> In terms of launching QEMU I'd imagine: >>>>>>>>>> >>>>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... >>>>>>>>>> >>>>>>>>>> Any virtual machine HW features which are tied to host kernel features >>>>>>>>>> would have their defaults set based on the requested -platform. 
The >>>>>>>>>> -machine will be fully invariant wrt the host kernel. >>>>>>>>>> >>>>>>>>>> You would have -platform hlep to list available platforms, and >>>>>>>>>> corresonding QMP "query-platforms" command to list what platforms >>>>>>>>>> are supported on a given host OS. >>>>>>>>>> >>>>>>>>>> Downstream distros can provide their own platforms definitions >>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set >>>>>>>>>> diverges from upstream due to backports. >>>>>>>>>> >>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU >>>>>>>>>> setting whose default is derived from the kernel. Individual >>>>>>>>>> defaults are opaque and controlled by the requested platform. >>>>>>>>>> >>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can >>>>>>>>>> use query-platforms to validate two hosts are compatible. >>>>>>>>>> >>>>>>>>>> Omitting -platform should pick the very latest platform that is >>>>>>>>>> cmpatible with the current host (not neccessarily the latest >>>>>>>>>> platform built-in to QEMU). >>>>>>>>> >>>>>>>>> This seems to add one more layer to maintain, and so far I don't know >>>>>>>>> whether it's a must. >>>>>>>>> >>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I >>>>>>>>> thought it was mostly the case already, except some extremely rare >>>>>>>>> outliers. >>>>>>>>> >>>>>>>>> When we have one host that boots up a VM using: >>>>>>>>> >>>>>>>>> $QEMU1 $cmdline >>>>>>>>> >>>>>>>>> Then another host boots up: >>>>>>>>> >>>>>>>>> $QEMU2 $cmdline -incoming XXX >>>>>>>>> >>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM >>>>>>>>> can boot up all fine without errors on both sides. >>>>>>>>> >>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not >>>>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it >>>>>>>>> didn't, I thought the ABI should be guaranteed. 
>>>>>>>>> >>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device >>>>>>>>> properties should be following this rule. The issue here is, we have the >>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got >>>>>>>>> break. >>>>>>>>> >>>>>>>>> That's also why I was suggesting if the property contributes to the guest >>>>>>>>> ABI, then AFAIU QEMU needs to: >>>>>>>>> >>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... >>>>>>>>> >>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot >>>>>>>>> by default, while advanced users can opt-in on new features. We can't >>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, >>>>>>>> >>>>>>>> It may not be necessary the case that old features are supported by >>>>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to >>>>>>>> Windows, which probably doesn't support any offloading at all. A more >>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a >>>>>>>> privilege so we cannot assume it is always available even on the latest >>>>>>>> version of Linux. >>>>>>> >>>>>>> I don't get why eBPF matters here. It is something that is not noticed >>>>>>> by the guest and we have a fallback anyhow. >>>> >>>> It is noticeable for the guest, and the fallback is not effective with >>>> vhost. >>> >>> It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. >> >> We can certainly fallback to in-QEMU RSS by disabling vhost, but I would >> not say lack of such fallback is a bug. > > Such fallback is by design since the introduction of vhost. > >> We don't provide in-QEMU >> fallback for other offloads. > > Yes but what I want to say is that eBPF RSS is different from those > segmentation offloads. And technically, Qemu can do fallback for > offloads (as RSC did). Well, I couldn't find any code disabling vhost for the in-QEMU RSC implementation. 
Looking at the code, I also found the case of vhost-vdpa. vhost can be simply disabled if it is backed by tuntap, but it is not the case for vDPA. Regards, Akihiko Odaki
On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2024/07/30 12:03, Jason Wang wrote: > > On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2024/07/30 11:04, Jason Wang wrote: > >>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2024/07/29 23:29, Peter Xu wrote: > >>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: > >>>>>> On 2024/07/29 12:50, Jason Wang wrote: > >>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>> > >>>>>>>> On 2024/07/27 5:47, Peter Xu wrote: > >>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > >>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > >>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: > >>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > >>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > >>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability > >>>>>>>>>>>>>>>> to support them, for backward compatibility by default the > >>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > >>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > >>>>>>>>>>>>>>> > >>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has > >>>>>>>>>>>>>>> USO supported, to another host that doesn't.. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, > >>>>>>>>>>>>>> don't do this. 
> >>>>>>>>>>>>> > >>>>>>>>>>>>> May I ask for my understanding: > >>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU > >>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate > >>>>>>>>>>>>> between machines that have different host kernel features? > >>>>>>>>>>>>> > >>>>>>>>>>>>>> Long term, we need to start exposing management APIs > >>>>>>>>>>>>>> to discover this, and management has to disable unsupported features. > >>>>>>>>>>>>> > >>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. > >>>>>>>>>>>> > >>>>>>>>>>>> When QEMU automatically toggles machine type featuers based on host > >>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, > >>>>>>>>>>>> as we cannot assume that the libvirt people are using knows about > >>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about > >>>>>>>>>>>> it, people can easily be using a previous libvirt release. > >>>>>>>>>>>> > >>>>>>>>>>>> QEMU itself needs to make the machine types do that they are there > >>>>>>>>>>>> todo, which is to define a stable machine ABI. > >>>>>>>>>>>> > >>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode > >>>>>>>>>>>> sets of features which are tied to specific platform generations. > >>>>>>>>>>>> As long as we don't have that we'll keep having these broken > >>>>>>>>>>>> migration problems from machine types dynamically changing instead > >>>>>>>>>>>> of providing a stable guest ABI. > >>>>>>>>>>> > >>>>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in > >>>>>>>>>>> implementation? > >>>>>>>>>> > >>>>>>>>>> In terms of launching QEMU I'd imagine: > >>>>>>>>>> > >>>>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... 
> >>>>>>>>>> > >>>>>>>>>> Any virtual machine HW features which are tied to host kernel features > >>>>>>>>>> would have their defaults set based on the requested -platform. The > >>>>>>>>>> -machine will be fully invariant wrt the host kernel. > >>>>>>>>>> > >>>>>>>>>> You would have -platform hlep to list available platforms, and > >>>>>>>>>> corresonding QMP "query-platforms" command to list what platforms > >>>>>>>>>> are supported on a given host OS. > >>>>>>>>>> > >>>>>>>>>> Downstream distros can provide their own platforms definitions > >>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set > >>>>>>>>>> diverges from upstream due to backports. > >>>>>>>>>> > >>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU > >>>>>>>>>> setting whose default is derived from the kernel. Individual > >>>>>>>>>> defaults are opaque and controlled by the requested platform. > >>>>>>>>>> > >>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can > >>>>>>>>>> use query-platforms to validate two hosts are compatible. > >>>>>>>>>> > >>>>>>>>>> Omitting -platform should pick the very latest platform that is > >>>>>>>>>> cmpatible with the current host (not neccessarily the latest > >>>>>>>>>> platform built-in to QEMU). > >>>>>>>>> > >>>>>>>>> This seems to add one more layer to maintain, and so far I don't know > >>>>>>>>> whether it's a must. > >>>>>>>>> > >>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > >>>>>>>>> thought it was mostly the case already, except some extremely rare > >>>>>>>>> outliers. > >>>>>>>>> > >>>>>>>>> When we have one host that boots up a VM using: > >>>>>>>>> > >>>>>>>>> $QEMU1 $cmdline > >>>>>>>>> > >>>>>>>>> Then another host boots up: > >>>>>>>>> > >>>>>>>>> $QEMU2 $cmdline -incoming XXX > >>>>>>>>> > >>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM > >>>>>>>>> can boot up all fine without errors on both sides. 
> >>>>>>>>> > >>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not > >>>>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > >>>>>>>>> didn't, I thought the ABI should be guaranteed. > >>>>>>>>> > >>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device > >>>>>>>>> properties should be following this rule. The issue here is, we have the > >>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got > >>>>>>>>> break. > >>>>>>>>> > >>>>>>>>> That's also why I was suggesting if the property contributes to the guest > >>>>>>>>> ABI, then AFAIU QEMU needs to: > >>>>>>>>> > >>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... > >>>>>>>>> > >>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot > >>>>>>>>> by default, while advanced users can opt-in on new features. We can't > >>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, > >>>>>>>> > >>>>>>>> It may not be necessary the case that old features are supported by > >>>>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to > >>>>>>>> Windows, which probably doesn't support any offloading at all. A more > >>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a > >>>>>>>> privilege so we cannot assume it is always available even on the latest > >>>>>>>> version of Linux. > >>>>>>> > >>>>>>> I don't get why eBPF matters here. It is something that is not noticed > >>>>>>> by the guest and we have a fallback anyhow. > >>>> > >>>> It is noticeable for the guest, and the fallback is not effective with > >>>> vhost. > >>> > >>> It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. > >> > >> We can certainly fallback to in-QEMU RSS by disabling vhost, but I would > >> not say lack of such fallback is a bug. > > > > Such fallback is by design since the introduction of vhost. 
> > > >> We don't provide in-QEMU > >> fallback for other offloads. > > > > Yes but what I want to say is that eBPF RSS is different from those > > segmentation offloads. And technically, Qemu can do fallback for > > offloads (as RSC did). > > Well, I couldn't find any code disabling vhost for the in-QEMU RSC > implementation. It should be a bug (and I remember we disabled vhost when the patches were merged). Have you tested it in a guest to see if it can see RSC when vhost is enabled? I suspect we need to add the RSC bit into current kernel_feature_bits: /* Features supported by host kernel. */ static const int kernel_feature_bits[] = { VIRTIO_F_NOTIFY_ON_EMPTY, VIRTIO_RING_F_INDIRECT_DESC, VIRTIO_RING_F_EVENT_IDX, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_F_VERSION_1, VIRTIO_NET_F_MTU, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, VIRTIO_NET_F_HASH_REPORT, VHOST_INVALID_FEATURE_BIT }; As RSC won't be provided by TUN/TAP anyhow. > > Looking at the code, I also found the case of vhost-vdpa. vhost can be > simply disabled if it is backed by tuntap, but it is not the case for vDPA. True, technically, vDPA can fallback to SVQ, but it's another topic. Thanks > > Regards, > Akihiko Odaki >
On 2024/07/30 12:17, Jason Wang wrote: > On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2024/07/30 12:03, Jason Wang wrote: >>> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2024/07/30 11:04, Jason Wang wrote: >>>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2024/07/29 23:29, Peter Xu wrote: >>>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: >>>>>>>> On 2024/07/29 12:50, Jason Wang wrote: >>>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>> >>>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote: >>>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >>>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: >>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: >>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: >>>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: >>>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >>>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability >>>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the >>>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier. >>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has >>>>>>>>>>>>>>>>> USO supported, to another host that doesn't.. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, >>>>>>>>>>>>>>>> don't do this. 
>>>>>>>>>>>>>>> >>>>>>>>>>>>>>> May I ask for my understanding: >>>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU >>>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate >>>>>>>>>>>>>>> between machines that have different host kernel features? >>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> Long term, we need to start exposing management APIs >>>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. >>>>>>>>>>>>>> >>>>>>>>>>>>>> When QEMU automatically toggles machine type featuers based on host >>>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, >>>>>>>>>>>>>> as we cannot assume that the libvirt people are using knows about >>>>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about >>>>>>>>>>>>>> it, people can easily be using a previous libvirt release. >>>>>>>>>>>>>> >>>>>>>>>>>>>> QEMU itself needs to make the machine types do that they are there >>>>>>>>>>>>>> todo, which is to define a stable machine ABI. >>>>>>>>>>>>>> >>>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode >>>>>>>>>>>>>> sets of features which are tied to specific platform generations. >>>>>>>>>>>>>> As long as we don't have that we'll keep having these broken >>>>>>>>>>>>>> migration problems from machine types dynamically changing instead >>>>>>>>>>>>>> of providing a stable guest ABI. >>>>>>>>>>>>> >>>>>>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in >>>>>>>>>>>>> implementation? >>>>>>>>>>>> >>>>>>>>>>>> In terms of launching QEMU I'd imagine: >>>>>>>>>>>> >>>>>>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... 
>>>>>>>>>>>> >>>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features >>>>>>>>>>>> would have their defaults set based on the requested -platform. The >>>>>>>>>>>> -machine will be fully invariant wrt the host kernel. >>>>>>>>>>>> >>>>>>>>>>>> You would have -platform hlep to list available platforms, and >>>>>>>>>>>> corresonding QMP "query-platforms" command to list what platforms >>>>>>>>>>>> are supported on a given host OS. >>>>>>>>>>>> >>>>>>>>>>>> Downstream distros can provide their own platforms definitions >>>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set >>>>>>>>>>>> diverges from upstream due to backports. >>>>>>>>>>>> >>>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU >>>>>>>>>>>> setting whose default is derived from the kernel. Individual >>>>>>>>>>>> defaults are opaque and controlled by the requested platform. >>>>>>>>>>>> >>>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can >>>>>>>>>>>> use query-platforms to validate two hosts are compatible. >>>>>>>>>>>> >>>>>>>>>>>> Omitting -platform should pick the very latest platform that is >>>>>>>>>>>> cmpatible with the current host (not neccessarily the latest >>>>>>>>>>>> platform built-in to QEMU). >>>>>>>>>>> >>>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know >>>>>>>>>>> whether it's a must. >>>>>>>>>>> >>>>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I >>>>>>>>>>> thought it was mostly the case already, except some extremely rare >>>>>>>>>>> outliers. >>>>>>>>>>> >>>>>>>>>>> When we have one host that boots up a VM using: >>>>>>>>>>> >>>>>>>>>>> $QEMU1 $cmdline >>>>>>>>>>> >>>>>>>>>>> Then another host boots up: >>>>>>>>>>> >>>>>>>>>>> $QEMU2 $cmdline -incoming XXX >>>>>>>>>>> >>>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM >>>>>>>>>>> can boot up all fine without errors on both sides. 
>>>>>>>>>>> >>>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not >>>>>>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it >>>>>>>>>>> didn't, I thought the ABI should be guaranteed. >>>>>>>>>>> >>>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device >>>>>>>>>>> properties should be following this rule. The issue here is, we have the >>>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got >>>>>>>>>>> break. >>>>>>>>>>> >>>>>>>>>>> That's also why I was suggesting if the property contributes to the guest >>>>>>>>>>> ABI, then AFAIU QEMU needs to: >>>>>>>>>>> >>>>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... >>>>>>>>>>> >>>>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot >>>>>>>>>>> by default, while advanced users can opt-in on new features. We can't >>>>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, >>>>>>>>>> >>>>>>>>>> It may not be necessary the case that old features are supported by >>>>>>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to >>>>>>>>>> Windows, which probably doesn't support any offloading at all. A more >>>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a >>>>>>>>>> privilege so we cannot assume it is always available even on the latest >>>>>>>>>> version of Linux. >>>>>>>>> >>>>>>>>> I don't get why eBPF matters here. It is something that is not noticed >>>>>>>>> by the guest and we have a fallback anyhow. >>>>>> >>>>>> It is noticeable for the guest, and the fallback is not effective with >>>>>> vhost. >>>>> >>>>> It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. >>>> >>>> We can certainly fallback to in-QEMU RSS by disabling vhost, but I would >>>> not say lack of such fallback is a bug. >>> >>> Such fallback is by design since the introduction of vhost. 
>>> >>>> We don't provide in-QEMU >>>> fallback for other offloads. >>> >>> Yes but what I want to say is that eBPF RSS is different from those >>> segmentation offloads. And technically, Qemu can do fallback for >>> offloads (as RSC did). >> >> Well, I couldn't find any code disabling vhost for the in-QEMU RSC >> implementation. > > It should be a bug (and I remember we disabled vhost when the patches > were merged). Have you tested it in a guest to see if it can see RSC > when vhost is enabled? > > I suspect we need to add the RSC bit into current kernel_feature_bits: > > /* Features supported by host kernel. */ > static const int kernel_feature_bits[] = { > VIRTIO_F_NOTIFY_ON_EMPTY, > VIRTIO_RING_F_INDIRECT_DESC, > VIRTIO_RING_F_EVENT_IDX, > VIRTIO_NET_F_MRG_RXBUF, > VIRTIO_F_VERSION_1, > VIRTIO_NET_F_MTU, > VIRTIO_F_IOMMU_PLATFORM, > VIRTIO_F_RING_PACKED, > VIRTIO_F_RING_RESET, > VIRTIO_NET_F_HASH_REPORT, > VHOST_INVALID_FEATURE_BIT > }; > > As RSC won't be provided by TUN/TAP anyhow. Adding the RSC bit does not let QEMU disable vhost for RSC, but instead it implicitly disables RSC in my understanding. It is still better than advertising the availability of that feature while it is missing. > >> >> Looking at the code, I also found the case of vhost-vdpa. vhost can be >> simply disabled if it is backed by tuntap, but it is not the case for vDPA. > > True, technically, vDPA can fallback to SVQ, but it's another topic. My point of this discussion is that we cannot enable features just because they are sufficiently old or because the user claims QEMU runs on Linux sufficiently new. eBPF requires privilege, and vDPA requires hardware feature. A fallback is not a silver bullet either, and there are situations that providing a fallback is not a trivial task. Regards, Akihiko Odaki
On Tue, Jul 30, 2024 at 11:29 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2024/07/30 12:17, Jason Wang wrote: > > On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2024/07/30 12:03, Jason Wang wrote: > >>> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2024/07/30 11:04, Jason Wang wrote: > >>>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2024/07/29 23:29, Peter Xu wrote: > >>>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: > >>>>>>>> On 2024/07/29 12:50, Jason Wang wrote: > >>>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>> > >>>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote: > >>>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > >>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > >>>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: > >>>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > >>>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > >>>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability > >>>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the > >>>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier. > >>>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > >>>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > >>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has > >>>>>>>>>>>>>>>>> USO supported, to another host that doesn't.. 
> >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, > >>>>>>>>>>>>>>>> don't do this. > >>>>>>>>>>>>>>> > >>>>>>>>>>>>>>> May I ask for my understanding: > >>>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU > >>>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate > >>>>>>>>>>>>>>> between machines that have different host kernel features? > >>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> Long term, we need to start exposing management APIs > >>>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features. > >>>>>>>>>>>>>>> > >>>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> When QEMU automatically toggles machine type featuers based on host > >>>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, > >>>>>>>>>>>>>> as we cannot assume that the libvirt people are using knows about > >>>>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about > >>>>>>>>>>>>>> it, people can easily be using a previous libvirt release. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> QEMU itself needs to make the machine types do that they are there > >>>>>>>>>>>>>> todo, which is to define a stable machine ABI. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode > >>>>>>>>>>>>>> sets of features which are tied to specific platform generations. > >>>>>>>>>>>>>> As long as we don't have that we'll keep having these broken > >>>>>>>>>>>>>> migration problems from machine types dynamically changing instead > >>>>>>>>>>>>>> of providing a stable guest ABI. > >>>>>>>>>>>>> > >>>>>>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in > >>>>>>>>>>>>> implementation? 
> >>>>>>>>>>>> > >>>>>>>>>>>> In terms of launching QEMU I'd imagine: > >>>>>>>>>>>> > >>>>>>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > >>>>>>>>>>>> > >>>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features > >>>>>>>>>>>> would have their defaults set based on the requested -platform. The > >>>>>>>>>>>> -machine will be fully invariant wrt the host kernel. > >>>>>>>>>>>> > >>>>>>>>>>>> You would have -platform hlep to list available platforms, and > >>>>>>>>>>>> corresonding QMP "query-platforms" command to list what platforms > >>>>>>>>>>>> are supported on a given host OS. > >>>>>>>>>>>> > >>>>>>>>>>>> Downstream distros can provide their own platforms definitions > >>>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set > >>>>>>>>>>>> diverges from upstream due to backports. > >>>>>>>>>>>> > >>>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU > >>>>>>>>>>>> setting whose default is derived from the kernel. Individual > >>>>>>>>>>>> defaults are opaque and controlled by the requested platform. > >>>>>>>>>>>> > >>>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can > >>>>>>>>>>>> use query-platforms to validate two hosts are compatible. > >>>>>>>>>>>> > >>>>>>>>>>>> Omitting -platform should pick the very latest platform that is > >>>>>>>>>>>> cmpatible with the current host (not neccessarily the latest > >>>>>>>>>>>> platform built-in to QEMU). > >>>>>>>>>>> > >>>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know > >>>>>>>>>>> whether it's a must. > >>>>>>>>>>> > >>>>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > >>>>>>>>>>> thought it was mostly the case already, except some extremely rare > >>>>>>>>>>> outliers. 
> >>>>>>>>>>> > >>>>>>>>>>> When we have one host that boots up a VM using: > >>>>>>>>>>> > >>>>>>>>>>> $QEMU1 $cmdline > >>>>>>>>>>> > >>>>>>>>>>> Then another host boots up: > >>>>>>>>>>> > >>>>>>>>>>> $QEMU2 $cmdline -incoming XXX > >>>>>>>>>>> > >>>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM > >>>>>>>>>>> can boot up all fine without errors on both sides. > >>>>>>>>>>> > >>>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not > >>>>>>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > >>>>>>>>>>> didn't, I thought the ABI should be guaranteed. > >>>>>>>>>>> > >>>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device > >>>>>>>>>>> properties should be following this rule. The issue here is, we have the > >>>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got > >>>>>>>>>>> break. > >>>>>>>>>>> > >>>>>>>>>>> That's also why I was suggesting if the property contributes to the guest > >>>>>>>>>>> ABI, then AFAIU QEMU needs to: > >>>>>>>>>>> > >>>>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... > >>>>>>>>>>> > >>>>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot > >>>>>>>>>>> by default, while advanced users can opt-in on new features. We can't > >>>>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, > >>>>>>>>>> > >>>>>>>>>> It may not be necessary the case that old features are supported by > >>>>>>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to > >>>>>>>>>> Windows, which probably doesn't support any offloading at all. A more > >>>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a > >>>>>>>>>> privilege so we cannot assume it is always available even on the latest > >>>>>>>>>> version of Linux. > >>>>>>>>> > >>>>>>>>> I don't get why eBPF matters here. 
It is something that is not noticed > >>>>>>>>> by the guest and we have a fallback anyhow. > >>>>>> > >>>>>> It is noticeable for the guest, and the fallback is not effective with > >>>>>> vhost. > >>>>> > >>>>> It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. > >>>> > >>>> We can certainly fallback to in-QEMU RSS by disabling vhost, but I would > >>>> not say lack of such fallback is a bug. > >>> > >>> Such fallback is by design since the introduction of vhost. > >>> > >>>> We don't provide in-QEMU > >>>> fallback for other offloads. > >>> > >>> Yes but what I want to say is that eBPF RSS is different from those > >>> segmentation offloads. And technically, Qemu can do fallback for > >>> offloads (as RSC did). > >> > >> Well, I couldn't find any code disabling vhost for the in-QEMU RSC > >> implementation. > > > > It should be a bug (and I remember we disabled vhost when the patches > > were merged). Have you tested it in a guest to see if it can see RSC > > when vhost is enabled? > > > > I suspect we need to add the RSC bit into current kernel_feature_bits: > > > > /* Features supported by host kernel. */ > > static const int kernel_feature_bits[] = { > > VIRTIO_F_NOTIFY_ON_EMPTY, > > VIRTIO_RING_F_INDIRECT_DESC, > > VIRTIO_RING_F_EVENT_IDX, > > VIRTIO_NET_F_MRG_RXBUF, > > VIRTIO_F_VERSION_1, > > VIRTIO_NET_F_MTU, > > VIRTIO_F_IOMMU_PLATFORM, > > VIRTIO_F_RING_PACKED, > > VIRTIO_F_RING_RESET, > > VIRTIO_NET_F_HASH_REPORT, > > VHOST_INVALID_FEATURE_BIT > > }; > > > > As RSC won't be provided by TUN/TAP anyhow. > > Adding the RSC bit does not let QEMU disable vhost for RSC, but instead > it implicitly disables RSC in my understanding. Yes. > It is still better than > advertising the availability of that feature while it is missing. Down the road, we probably need to change the behaviour of disabling vhost-net. > > > > >> > >> Looking at the code, I also found the case of vhost-vdpa. 
vhost can be > >> simply disabled if it is backed by tuntap, but it is not the case for vDPA. > > > > True, technically, vDPA can fallback to SVQ, but it's another topic. > > My point of this discussion is that we cannot enable features just > because they are sufficiently old or because the user claims QEMU runs > on Linux sufficiently new. eBPF requires privilege, and vDPA requires > hardware feature. A fallback is not a silver bullet either, and there > are situations that providing a fallback is not a trivial task. To make sure we are on the same page. I just want to point out that eBPF RSS is not a good example in this context. It works only for tuntap, so we should stick to the behaviour of trying to fallback to userspace if we can as we've already had a userspace fallback. This is the fundamental difference with other features (like segmentation offload) or backend (vDPA) that doesn't have an existing fallback. Thanks > > Regards, > Akihiko Odaki >
On 2024/07/30 12:45, Jason Wang wrote: > On Tue, Jul 30, 2024 at 11:29 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >> >> On 2024/07/30 12:17, Jason Wang wrote: >>> On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>> >>>> On 2024/07/30 12:03, Jason Wang wrote: >>>>> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>> >>>>>> On 2024/07/30 11:04, Jason Wang wrote: >>>>>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>> >>>>>>>> On 2024/07/29 23:29, Peter Xu wrote: >>>>>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: >>>>>>>>>> On 2024/07/29 12:50, Jason Wang wrote: >>>>>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: >>>>>>>>>>>> >>>>>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote: >>>>>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: >>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: >>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: >>>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: >>>>>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: >>>>>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: >>>>>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: >>>>>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability >>>>>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the >>>>>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier. >>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> >>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> >>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has >>>>>>>>>>>>>>>>>>> USO supported, to another host that doesn't.. 
>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, >>>>>>>>>>>>>>>>>> don't do this. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> May I ask for my understanding: >>>>>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU >>>>>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate >>>>>>>>>>>>>>>>> between machines that have different host kernel features? >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>> Long term, we need to start exposing management APIs >>>>>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features. >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> When QEMU automatically toggles machine type featuers based on host >>>>>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, >>>>>>>>>>>>>>>> as we cannot assume that the libvirt people are using knows about >>>>>>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about >>>>>>>>>>>>>>>> it, people can easily be using a previous libvirt release. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> QEMU itself needs to make the machine types do that they are there >>>>>>>>>>>>>>>> todo, which is to define a stable machine ABI. >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode >>>>>>>>>>>>>>>> sets of features which are tied to specific platform generations. >>>>>>>>>>>>>>>> As long as we don't have that we'll keep having these broken >>>>>>>>>>>>>>>> migration problems from machine types dynamically changing instead >>>>>>>>>>>>>>>> of providing a stable guest ABI. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in >>>>>>>>>>>>>>> implementation? 
>>>>>>>>>>>>>> >>>>>>>>>>>>>> In terms of launching QEMU I'd imagine: >>>>>>>>>>>>>> >>>>>>>>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... >>>>>>>>>>>>>> >>>>>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features >>>>>>>>>>>>>> would have their defaults set based on the requested -platform. The >>>>>>>>>>>>>> -machine will be fully invariant wrt the host kernel. >>>>>>>>>>>>>> >>>>>>>>>>>>>> You would have -platform hlep to list available platforms, and >>>>>>>>>>>>>> corresonding QMP "query-platforms" command to list what platforms >>>>>>>>>>>>>> are supported on a given host OS. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Downstream distros can provide their own platforms definitions >>>>>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set >>>>>>>>>>>>>> diverges from upstream due to backports. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU >>>>>>>>>>>>>> setting whose default is derived from the kernel. Individual >>>>>>>>>>>>>> defaults are opaque and controlled by the requested platform. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can >>>>>>>>>>>>>> use query-platforms to validate two hosts are compatible. >>>>>>>>>>>>>> >>>>>>>>>>>>>> Omitting -platform should pick the very latest platform that is >>>>>>>>>>>>>> cmpatible with the current host (not neccessarily the latest >>>>>>>>>>>>>> platform built-in to QEMU). >>>>>>>>>>>>> >>>>>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know >>>>>>>>>>>>> whether it's a must. >>>>>>>>>>>>> >>>>>>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I >>>>>>>>>>>>> thought it was mostly the case already, except some extremely rare >>>>>>>>>>>>> outliers. 
>>>>>>>>>>>>> >>>>>>>>>>>>> When we have one host that boots up a VM using: >>>>>>>>>>>>> >>>>>>>>>>>>> $QEMU1 $cmdline >>>>>>>>>>>>> >>>>>>>>>>>>> Then another host boots up: >>>>>>>>>>>>> >>>>>>>>>>>>> $QEMU2 $cmdline -incoming XXX >>>>>>>>>>>>> >>>>>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM >>>>>>>>>>>>> can boot up all fine without errors on both sides. >>>>>>>>>>>>> >>>>>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not >>>>>>>>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it >>>>>>>>>>>>> didn't, I thought the ABI should be guaranteed. >>>>>>>>>>>>> >>>>>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device >>>>>>>>>>>>> properties should be following this rule. The issue here is, we have the >>>>>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got >>>>>>>>>>>>> break. >>>>>>>>>>>>> >>>>>>>>>>>>> That's also why I was suggesting if the property contributes to the guest >>>>>>>>>>>>> ABI, then AFAIU QEMU needs to: >>>>>>>>>>>>> >>>>>>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... >>>>>>>>>>>>> >>>>>>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot >>>>>>>>>>>>> by default, while advanced users can opt-in on new features. We can't >>>>>>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, >>>>>>>>>>>> >>>>>>>>>>>> It may not be necessary the case that old features are supported by >>>>>>>>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to >>>>>>>>>>>> Windows, which probably doesn't support any offloading at all. A more >>>>>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a >>>>>>>>>>>> privilege so we cannot assume it is always available even on the latest >>>>>>>>>>>> version of Linux. >>>>>>>>>>> >>>>>>>>>>> I don't get why eBPF matters here. 
It is something that is not noticed >>>>>>>>>>> by the guest and we have a fallback anyhow. >>>>>>>> >>>>>>>> It is noticeable for the guest, and the fallback is not effective with >>>>>>>> vhost. >>>>>>> >>>>>>> It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. >>>>>> >>>>>> We can certainly fallback to in-QEMU RSS by disabling vhost, but I would >>>>>> not say lack of such fallback is a bug. >>>>> >>>>> Such fallback is by design since the introduction of vhost. >>>>> >>>>>> We don't provide in-QEMU >>>>>> fallback for other offloads. >>>>> >>>>> Yes but what I want to say is that eBPF RSS is different from those >>>>> segmentation offloads. And technically, Qemu can do fallback for >>>>> offloads (as RSC did). >>>> >>>> Well, I couldn't find any code disabling vhost for the in-QEMU RSC >>>> implementation. >>> >>> It should be a bug (and I remember we disabled vhost when the patches >>> were merged). Have you tested it in a guest to see if it can see RSC >>> when vhost is enabled? >>> >>> I suspect we need to add the RSC bit into current kernel_feature_bits: >>> >>> /* Features supported by host kernel. */ >>> static const int kernel_feature_bits[] = { >>> VIRTIO_F_NOTIFY_ON_EMPTY, >>> VIRTIO_RING_F_INDIRECT_DESC, >>> VIRTIO_RING_F_EVENT_IDX, >>> VIRTIO_NET_F_MRG_RXBUF, >>> VIRTIO_F_VERSION_1, >>> VIRTIO_NET_F_MTU, >>> VIRTIO_F_IOMMU_PLATFORM, >>> VIRTIO_F_RING_PACKED, >>> VIRTIO_F_RING_RESET, >>> VIRTIO_NET_F_HASH_REPORT, >>> VHOST_INVALID_FEATURE_BIT >>> }; >>> >>> As RSC won't be provided by TUN/TAP anyhow. >> >> Adding the RSC bit does not let QEMU disable vhost for RSC, but instead >> it implicitly disables RSC in my understanding. > > Yes. > >> It is still better than >> advertising the availability of that feature while it is missing. > > Down the road, we probably need to change the behaviour of disabling vhost-net. > >> >>> >>>> >>>> Looking at the code, I also found the case of vhost-vdpa. 
vhost can be >>>> simply disabled if it is backed by tuntap, but it is not the case for vDPA. >>> >>> True, technically, vDPA can fallback to SVQ, but it's another topic. >> >> My point of this discussion is that we cannot enable features just >> because they are sufficiently old or because the user claims QEMU runs >> on Linux sufficiently new. eBPF requires privilege, and vDPA requires >> hardware feature. A fallback is not a silver bullet either, and there >> are situations that providing a fallback is not a trivial task. > > To make sure we are on the same page. I just want to point out that > eBPF RSS is not a good example in this context. > > It works only for tuntap, so we should stick to the behaviour of > trying to fallback to userspace if we can as we've already had a > userspace fallback. This is the fundamental difference with other > features (like segmentation offload) or backend (vDPA) that doesn't > have an existing fallback. Some (probably not all) offloads are implemented in hw/net/net_tx_pkt.c. They are not wired up to behave as a fallback when tuntap's vhost is enabled as the in-QEMU RSS is not. In either case, we need to pay some effort to wiring things. I'm not sure it is worthwhile. I think there is a high chance that selectively disabling vhost and keeping RSS enabled with fallback will result in worse performance than keeping vhost enabled and disabling RSS. Such a fallback can still function as an emergency escape hatch, but it is also incomplete as we don't have fallbacks for other features. I would rather make any features missing in the vhost backend fail to keep things consistent. Regards, Akihiko Odaki
@Akihiko Odaki The RSC is supported with vhost and without vhost The 'in-qemu RSC' is related to VIRTIO_NET_F_RSC_EXT feature, it is intended for one specific WHCK test only and should not be used in any functional setup. When it is used the vhost should be off On Tue, Jul 30, 2024 at 1:23 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2024/07/30 12:45, Jason Wang wrote: > > On Tue, Jul 30, 2024 at 11:29 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2024/07/30 12:17, Jason Wang wrote: > >>> On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2024/07/30 12:03, Jason Wang wrote: > >>>>> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2024/07/30 11:04, Jason Wang wrote: > >>>>>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>> > >>>>>>>> On 2024/07/29 23:29, Peter Xu wrote: > >>>>>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: > >>>>>>>>>> On 2024/07/29 12:50, Jason Wang wrote: > >>>>>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>>>> > >>>>>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote: > >>>>>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > >>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > >>>>>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. 
Tsirkin wrote: > >>>>>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > >>>>>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > >>>>>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability > >>>>>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the > >>>>>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier. > >>>>>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > >>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > >>>>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has > >>>>>>>>>>>>>>>>>>> USO supported, to another host that doesn't.. > >>>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, > >>>>>>>>>>>>>>>>>> don't do this. > >>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>> May I ask for my understanding: > >>>>>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU > >>>>>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate > >>>>>>>>>>>>>>>>> between machines that have different host kernel features? > >>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>> Long term, we need to start exposing management APIs > >>>>>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features. > >>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> When QEMU automatically toggles machine type featuers based on host > >>>>>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, > >>>>>>>>>>>>>>>> as we cannot assume that the libvirt people are using knows about > >>>>>>>>>>>>>>>> newly introduced features. 
Even if libvirt is updated to know about > >>>>>>>>>>>>>>>> it, people can easily be using a previous libvirt release. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> QEMU itself needs to make the machine types do that they are there > >>>>>>>>>>>>>>>> todo, which is to define a stable machine ABI. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode > >>>>>>>>>>>>>>>> sets of features which are tied to specific platform generations. > >>>>>>>>>>>>>>>> As long as we don't have that we'll keep having these broken > >>>>>>>>>>>>>>>> migration problems from machine types dynamically changing instead > >>>>>>>>>>>>>>>> of providing a stable guest ABI. > >>>>>>>>>>>>>>> > >>>>>>>>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in > >>>>>>>>>>>>>>> implementation? > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> In terms of launching QEMU I'd imagine: > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features > >>>>>>>>>>>>>> would have their defaults set based on the requested -platform. The > >>>>>>>>>>>>>> -machine will be fully invariant wrt the host kernel. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> You would have -platform hlep to list available platforms, and > >>>>>>>>>>>>>> corresonding QMP "query-platforms" command to list what platforms > >>>>>>>>>>>>>> are supported on a given host OS. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Downstream distros can provide their own platforms definitions > >>>>>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set > >>>>>>>>>>>>>> diverges from upstream due to backports. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU > >>>>>>>>>>>>>> setting whose default is derived from the kernel. Individual > >>>>>>>>>>>>>> defaults are opaque and controlled by the requested platform. 
> >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can > >>>>>>>>>>>>>> use query-platforms to validate two hosts are compatible. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Omitting -platform should pick the very latest platform that is > >>>>>>>>>>>>>> cmpatible with the current host (not neccessarily the latest > >>>>>>>>>>>>>> platform built-in to QEMU). > >>>>>>>>>>>>> > >>>>>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know > >>>>>>>>>>>>> whether it's a must. > >>>>>>>>>>>>> > >>>>>>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > >>>>>>>>>>>>> thought it was mostly the case already, except some extremely rare > >>>>>>>>>>>>> outliers. > >>>>>>>>>>>>> > >>>>>>>>>>>>> When we have one host that boots up a VM using: > >>>>>>>>>>>>> > >>>>>>>>>>>>> $QEMU1 $cmdline > >>>>>>>>>>>>> > >>>>>>>>>>>>> Then another host boots up: > >>>>>>>>>>>>> > >>>>>>>>>>>>> $QEMU2 $cmdline -incoming XXX > >>>>>>>>>>>>> > >>>>>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM > >>>>>>>>>>>>> can boot up all fine without errors on both sides. > >>>>>>>>>>>>> > >>>>>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not > >>>>>>>>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > >>>>>>>>>>>>> didn't, I thought the ABI should be guaranteed. > >>>>>>>>>>>>> > >>>>>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device > >>>>>>>>>>>>> properties should be following this rule. The issue here is, we have the > >>>>>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got > >>>>>>>>>>>>> break. > >>>>>>>>>>>>> > >>>>>>>>>>>>> That's also why I was suggesting if the property contributes to the guest > >>>>>>>>>>>>> ABI, then AFAIU QEMU needs to: > >>>>>>>>>>>>> > >>>>>>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... 
> >>>>>>>>>>>>> > >>>>>>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot > >>>>>>>>>>>>> by default, while advanced users can opt-in on new features. We can't > >>>>>>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, > >>>>>>>>>>>> > >>>>>>>>>>>> It may not be necessary the case that old features are supported by > >>>>>>>>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to > >>>>>>>>>>>> Windows, which probably doesn't support any offloading at all. A more > >>>>>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a > >>>>>>>>>>>> privilege so we cannot assume it is always available even on the latest > >>>>>>>>>>>> version of Linux. > >>>>>>>>>>> > >>>>>>>>>>> I don't get why eBPF matters here. It is something that is not noticed > >>>>>>>>>>> by the guest and we have a fallback anyhow. > >>>>>>>> > >>>>>>>> It is noticeable for the guest, and the fallback is not effective with > >>>>>>>> vhost. > >>>>>>> > >>>>>>> It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. > >>>>>> > >>>>>> We can certainly fallback to in-QEMU RSS by disabling vhost, but I would > >>>>>> not say lack of such fallback is a bug. > >>>>> > >>>>> Such fallback is by design since the introduction of vhost. > >>>>> > >>>>>> We don't provide in-QEMU > >>>>>> fallback for other offloads. > >>>>> > >>>>> Yes but what I want to say is that eBPF RSS is different from those > >>>>> segmentation offloads. And technically, Qemu can do fallback for > >>>>> offloads (as RSC did). > >>>> > >>>> Well, I couldn't find any code disabling vhost for the in-QEMU RSC > >>>> implementation. > >>> > >>> It should be a bug (and I remember we disabled vhost when the patches > >>> were merged). Have you tested it in a guest to see if it can see RSC > >>> when vhost is enabled? 
> >>> > >>> I suspect we need to add the RSC bit into current kernel_feature_bits: > >>> > >>> /* Features supported by host kernel. */ > >>> static const int kernel_feature_bits[] = { > >>> VIRTIO_F_NOTIFY_ON_EMPTY, > >>> VIRTIO_RING_F_INDIRECT_DESC, > >>> VIRTIO_RING_F_EVENT_IDX, > >>> VIRTIO_NET_F_MRG_RXBUF, > >>> VIRTIO_F_VERSION_1, > >>> VIRTIO_NET_F_MTU, > >>> VIRTIO_F_IOMMU_PLATFORM, > >>> VIRTIO_F_RING_PACKED, > >>> VIRTIO_F_RING_RESET, > >>> VIRTIO_NET_F_HASH_REPORT, > >>> VHOST_INVALID_FEATURE_BIT > >>> }; > >>> > >>> As RSC won't be provided by TUN/TAP anyhow. > >> > >> Adding the RSC bit does not let QEMU disable vhost for RSC, but instead > >> it implicitly disables RSC in my understanding. > > > > Yes. > > > >> It is still better than > >> advertising the availability of that feature while it is missing. > > > > Down the road, we probably need to change the behaviour of disabling vhost-net. > > > >> > >>> > >>>> > >>>> Looking at the code, I also found the case of vhost-vdpa. vhost can be > >>>> simply disabled if it is backed by tuntap, but it is not the case for vDPA. > >>> > >>> True, technically, vDPA can fallback to SVQ, but it's another topic. > >> > >> My point of this discussion is that we cannot enable features just > >> because they are sufficiently old or because the user claims QEMU runs > >> on Linux sufficiently new. eBPF requires privilege, and vDPA requires > >> hardware feature. A fallback is not a silver bullet either, and there > >> are situations that providing a fallback is not a trivial task. > > > > To make sure we are on the same page. I just want to point out that > > eBPF RSS is not a good example in this context. > > > > It works only for tuntap, so we should stick to the behaviour of > > trying to fallback to userspace if we can as we've already had a > > userspace fallback. This is the fundamental difference with other > > features (like segmentation offload) or backend (vDPA) that doesn't > > have an existing fallback. 
> > Some (probably not all) offloads are implemented in hw/net/net_tx_pkt.c. > They are not wired up to behave as a fallback when tuntap's vhost is > enabled as the in-QEMU RSS is not. In either case, we need to pay some > effort to wiring things. > > I'm not sure it is worthwhile. I think there is a high chance that > selectively disabling vhost and keeping RSS enabled with fallback will > result in worse performance than keeping vhost enabled and disabling > RSS. Such a fallback can still function as an emergency escape hatch, > but it is also incomplete as we don't have fallbacks for other features. > I would rather make any features missing in the vhost backend fail to > keep things consistent. > > Regards, > Akihiko Odaki
On Tue, Jul 30, 2024 at 02:23:46AM +0900, Akihiko Odaki wrote: > On 2024/07/30 2:00, Peter Xu wrote: > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > > > On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote: > > > > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > > > > > > > > > > In terms of launching QEMU I'd imagine: > > > > > > > > > > $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > > > > > > > > > > Any virtual machine HW features which are tied to host kernel features > > > > > would have their defaults set based on the requested -platform. The > > > > > -machine will be fully invariant wrt the host kernel. > > > > > > > > > > You would have -platform hlep to list available platforms, and > > > > > corresonding QMP "query-platforms" command to list what platforms > > > > > are supported on a given host OS. > > > > > > > > > > Downstream distros can provide their own platforms definitions > > > > > (eg "linux-rhel-9.5") if they have kernels whose feature set > > > > > diverges from upstream due to backports. > > > > > > > > > > Mgmt apps won't need to be taught about every single little QEMU > > > > > setting whose default is derived from the kernel. Individual > > > > > defaults are opaque and controlled by the requested platform. > > > > > > > > > > Live migration has clearly defined semantics, and mgmt app can > > > > > use query-platforms to validate two hosts are compatible. > > > > > > > > > > Omitting -platform should pick the very latest platform that is > > > > > cmpatible with the current host (not neccessarily the latest > > > > > platform built-in to QEMU). > > > > > > > > This seems to add one more layer to maintain, and so far I don't know > > > > whether it's a must. > > > > > > > > To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > > > > thought it was mostly the case already, except some extremely rare > > > > outliers. 
> > > > > > > > When we have one host that boots up a VM using: > > > > > > > > $QEMU1 $cmdline > > > > > > > > Then another host boots up: > > > > > > > > $QEMU2 $cmdline -incoming XXX > > > > > > > > Then migration should succeed if $cmdline is exactly the same, and the VM > > > > can boot up all fine without errors on both sides. > > > > > > > > AFAICT this has nothing to do with what kernel is underneath, even not > > > > Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > > > > didn't, I thought the ABI should be guaranteed. > > > > > > We've got two mutually conflicting goals with the machine type > > > definitions. > > > > > > Primarily we use them to ensure stable ABI, but an important > > > secondary goal is to enable new tunables to have new defaults > > > set, without having to update every mgmt app. The latter > > > works very well when the defaults have no dependancy on the > > > platform kernel/OS, but breaks migration when they do have a > > > platform dependancy. > > > > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > > by default, while advanced users can opt-in on new features. We can't > > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > > this host does not support running such VM specified, due to XXX > > > > feature missing". > > > > > > > > That's the only way an user could understand what happened, and IMHO that's > > > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > > > while in which the machine type is the fundation of such definition, as the > > > > machine type can decides many of the rest compat properties. 
And that's > > > > the whole point of the compat properties too (to make sure the guest ABI is > > > > stable). > > > > > > > > If kernel breaks it easily, all compat property things that we maintain can > > > > already stop making sense in general, because it didn't define the whole > > > > guest ABI.. > > > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > > > somehting. And maybe we don't yet need the "-platform" layer if we can > > > > keep up with this rule? > > > > > > We've failed at this for years wrt enabling use of new defaults that have > > > a platform depedancy, so historical practice isn't a good reference. > > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of > > > the machine type, and of those, libvirt likely only exposes a few 10's > > > of tunables. The vast majority are low level details that no mgmt app > > > wants to know about, they just want to accept QEMU's new defaults, > > > while preserving machine ABI. This is a good thing. No one wants the > > > burden of wiring up every single tunable into libvirt and mgmt apps. > > > > > > This is what the "-platform" concept would be intended to preserve. It > > > would allow a way to enable groups of settings that have a platform level > > > dependancy, without ever having to teach either libvirt or the mgmt apps > > > about the individual tunables. > > > > Do you think we can achieve similar goal by simply turning the feature to > > ON only after a few QEMU releases? I also mentioned that idea below. > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > > > So far it really sounds like the right thing to do to me to fix all similar > > issues, even without introducing anything new we need to maintain. > > > > To put that again, what we need to do is this: > > > > - To start: we should NEVER turn any guest ABI relevant bits > > automatically by QEMU, for sure.. 
> > > > - When introducing any new device feature that may both (1) affects guest > > ABI, and (2) depends on host kernel features, we set those default > > values to OFF always at start. So this already covers old machine > > types, no compat property needed so far. > > > > - We always fail hard on QEMU boot whenever we detected such property is > > not supported by the current host when with ON (and since it's OFF by > > default it must be that the user specified that ON). > > > > - When after a stablized period of time for that new feature to land most > > kernels (we may consider to look at how major Linux distros updates the > > kernel versions) when we're pretty sure the new feature should be > > available on most of the QEMU modern users, we add a patch to make the > > property default ON on the new machine type, add a compat property for > > old machines. > > > > The last bullet also means we'll start to fail new machine type from > > booting when running that very new QEMU on a very old kernel, but that's > > the trade-off, and when doing it right on "stablizing the feature in the > > kernel world", it should really be corner case. The user should simply > > invoke an old machine type on that old kernel, even if the qemu is new. > > docs/about/build-platforms.rst already defines supported platforms. One of > the supported platforms is Debian 11 (bullseye), and it carries Linux 5.10, > which was released December 2020. If we follow this platform support, a new > feature added to upstream Linux may take about 4 years before it gets > enabled by default on QEMU. > > As an upstream developer, I feel it is too long, but I'm sure there are > different opinions from different perspectives. Above rule won't stop the supported platforms to still run the QEMU binaries, am I right? 
Especially if it's a serious user the VMs should always be invoked with an old machine type, and that shouldn't be impacted, as the old machines should simply don't support such new kernel feature. The payoff here is only about when the user tries to start the VM using the default / latest machine type. Then with above rule it should fail clearly on what is required to turn OFF so as to boot that VM. Then the user has two options: turn that feature OFF manually, or switch to an old machine type. This is all still based on the fact that we do plan to keep that OFF for a while. So if we think "a few years" is too long, one option is we set it to ON after e.g. 1-2 years so it's in the middle ground where some such new users will fail booting the VM on old hosts, but it'll start to benefit whoever runs the same on a new host. So far I think it's not a major deal, especially considering that this so far looks like the easiest and (still looks to me..) workable solution to make migration always work, IMHO more important to serious VM users. I'm definitely open to other options or suggestions if there is. I just don't see anything yet that is easily applicable.. Thanks,
On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > > > > > > We've got two mutually conflicting goals with the machine type > > > definitions. > > > > > > Primarily we use them to ensure stable ABI, but an important > > > secondary goal is to enable new tunables to have new defaults > > > set, without having to update every mgmt app. The latter > > > works very well when the defaults have no dependancy on the > > > platform kernel/OS, but breaks migration when they do have a > > > platform dependancy. > > > > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > > by default, while advanced users can opt-in on new features. We can't > > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > > this host does not support running such VM specified, due to XXX > > > > feature missing". > > > > > > > > That's the only way an user could understand what happened, and IMHO that's > > > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > > > while in which the machine type is the fundation of such definition, as the > > > > machine type can decides many of the rest compat properties. And that's > > > > the whole point of the compat properties too (to make sure the guest ABI is > > > > stable). > > > > > > > > If kernel breaks it easily, all compat property things that we maintain can > > > > already stop making sense in general, because it didn't define the whole > > > > guest ABI.. > > > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > > > somehting. 
And maybe we don't yet need the "-platform" layer if we can > > > > keep up with this rule? > > > > > > We've failed at this for years wrt enabling use of new defaults that have > > > a platform depedancy, so historical practice isn't a good reference. > > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of > > > the machine type, and of those, libvirt likely only exposes a few 10's > > > of tunables. The vast majority are low level details that no mgmt app > > > wants to know about, they just want to accept QEMU's new defaults, > > > while preserving machine ABI. This is a good thing. No one wants the > > > burden of wiring up every single tunable into libvirt and mgmt apps. > > > > > > This is what the "-platform" concept would be intended to preserve. It > > > would allow a way to enable groups of settings that have a platform level > > > dependancy, without ever having to teach either libvirt or the mgmt apps > > > about the individual tunables. > > > > Do you think we can achieve similar goal by simply turning the feature to > > ON only after a few QEMU releases? I also mentioned that idea below. > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > > > So far it really sounds like the right thing to do to me to fix all similar > > issues, even without introducing anything new we need to maintain. > > Turning a feature with a platform dependency to "on" implies that > the machine type will cease to work out of the box for platforms > which lack the feature. IMHO that's not acceptable behaviour for > any of our supported platforms. Right, that's why I was thinking whether we should just always be on the safe side, even if I just replied in the other email to Akihiko, that we do have the option to make this more aggresive by turning those to ON after even 1-2 years or even less.. and we have control of how aggressive this can be. 
> > IOW, "after a few QEMU releases" implies a delay of as much as > 5 years, while we wait for platforms which don't support the > feature to drop out of our supported targets list. I don't > think that'll satisfy the desire to get the new feature > available to users as soon as practical for their particular > platform. The feature is always available since the 1st day, right? We just need the user to opt-in, by specifying ON in the cmdline. That'll be my take on this that QEMU's default VM setup should be always bootable, migratable, and so on. Then user opt-in on stuff like this one, where there's implication on the ABIs. The "user" can also include Libvirt. I mean when something is really important, Libvirt should, IMHO, opt-in by treating that similarly like many cpu properties, and by probing the host first. IIUC there aren't a lot of things like that (part of guest ABI & host kernel / HW dependent), am I right? Otherwise I would expect more failures like this one, but it isn't as much as that yet. IIUC it means the efforts to make Libvirt get involved should be hopefully under control too. The worst case is Libvirt doesn't auto-on it, but again the user should always have the option to turn it on when it's necessary. Thanks, > > > > > To put that again, what we need to do is this: > > > > - To start: we should NEVER turn any guest ABI relevant bits > > automatically by QEMU, for sure.. > > > > - When introducing any new device feature that may both (1) affects guest > > ABI, and (2) depends on host kernel features, we set those default > > values to OFF always at start. So this already covers old machine > > types, no compat property needed so far. > > > > - We always fail hard on QEMU boot whenever we detected such property is > > not supported by the current host when with ON (and since it's OFF by > > default it must be that the user specified that ON). 
> > > > - When after a stablized period of time for that new feature to land most > > kernels (we may consider to look at how major Linux distros updates the > > kernel versions) when we're pretty sure the new feature should be > > available on most of the QEMU modern users, we add a patch to make the > > property default ON on the new machine type, add a compat property for > > old machines. > > Our supported platform list determines when this will be, and given > our current criteria, this can be as long as 5 years. > > > With regards, > Daniel > -- > |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| > |: https://libvirt.org -o- https://fstop138.berrange.com :| > |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :| >
On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote: > On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: > > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: > > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > > > > > > > > We've got two mutually conflicting goals with the machine type > > > > definitions. > > > > > > > > Primarily we use them to ensure stable ABI, but an important > > > > secondary goal is to enable new tunables to have new defaults > > > > set, without having to update every mgmt app. The latter > > > > works very well when the defaults have no dependancy on the > > > > platform kernel/OS, but breaks migration when they do have a > > > > platform dependancy. > > > > > > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > > > by default, while advanced users can opt-in on new features. We can't > > > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > > > this host does not support running such VM specified, due to XXX > > > > > feature missing". > > > > > > > > > > That's the only way an user could understand what happened, and IMHO that's > > > > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > > > > while in which the machine type is the fundation of such definition, as the > > > > > machine type can decides many of the rest compat properties. And that's > > > > > the whole point of the compat properties too (to make sure the guest ABI is > > > > > stable). 
> > > > > > > > > > If kernel breaks it easily, all compat property things that we maintain can > > > > > already stop making sense in general, because it didn't define the whole > > > > > guest ABI.. > > > > > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > > > > somehting. And maybe we don't yet need the "-platform" layer if we can > > > > > keep up with this rule? > > > > > > > > We've failed at this for years wrt enabling use of new defaults that have > > > > a platform depedancy, so historical practice isn't a good reference. > > > > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of > > > > the machine type, and of those, libvirt likely only exposes a few 10's > > > > of tunables. The vast majority are low level details that no mgmt app > > > > wants to know about, they just want to accept QEMU's new defaults, > > > > while preserving machine ABI. This is a good thing. No one wants the > > > > burden of wiring up every single tunable into libvirt and mgmt apps. > > > > > > > > This is what the "-platform" concept would be intended to preserve. It > > > > would allow a way to enable groups of settings that have a platform level > > > > dependancy, without ever having to teach either libvirt or the mgmt apps > > > > about the individual tunables. > > > > > > Do you think we can achieve similar goal by simply turning the feature to > > > ON only after a few QEMU releases? I also mentioned that idea below. > > > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > > > > > So far it really sounds like the right thing to do to me to fix all similar > > > issues, even without introducing anything new we need to maintain. > > > > Turning a feature with a platform dependency to "on" implies that > > the machine type will cease to work out of the box for platforms > > which lack the feature. IMHO that's not acceptable behaviour for > > any of our supported platforms. 
> > Right, that's why I was thinking whether we should just always be on the > safe side, even if I just replied in the other email to Akihiko, that we do > have the option to make this more aggresive by turning those to ON after > even 1-2 years or even less.. and we have control of how aggressive this > can be. > > > > > IOW, "after a few QEMU releases" implies a delay of as much as > > 5 years, while we wait for platforms which don't support the > > feature to drop out of our supported targets list. I don't > > think that'll satisfy the desire to get the new feature > > available to users as soon as practical for their particular > > platform. > > The feature is always available since the 1st day, right? We just need the > user to opt-in, by specifying ON in the cmdline. > > That'll be my take on this that QEMU's default VM setup should be always > bootable, migratable, and so on. Then user opt-in on stuff like this one, > where there's implication on the ABIs. The "user" can also include > Libvirt. I mean when something is really important, Libvirt should, IMHO, > opt-in by treating that similarly like many cpu properties, and by probing > the host first. > > IIUC there aren't a lot of things like that (part of guest ABI & host > kernel / HW dependent), am I right? Otherwise I would expect more failures > like this one, but it isn't as much as that yet. IIUC it means the efforts > to make Libvirt get involved should be hopefully under control too. The > worst case is Libvirt doesn't auto-on it, but again the user should always > have the option to turn it on when it's necessary. If it is left to libvirt, then it would very likely end up being a user opt-in, not auto-enabled. With regards, Daniel
On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote: > On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote: > > On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: > > > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: > > > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > > > > > > > > > > We've got two mutually conflicting goals with the machine type > > > > > definitions. > > > > > > > > > > Primarily we use them to ensure stable ABI, but an important > > > > > secondary goal is to enable new tunables to have new defaults > > > > > set, without having to update every mgmt app. The latter > > > > > works very well when the defaults have no dependancy on the > > > > > platform kernel/OS, but breaks migration when they do have a > > > > > platform dependancy. > > > > > > > > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > > > > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > > > > by default, while advanced users can opt-in on new features. We can't > > > > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > > > > this host does not support running such VM specified, due to XXX > > > > > > feature missing". > > > > > > > > > > > > That's the only way an user could understand what happened, and IMHO that's > > > > > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > > > > > while in which the machine type is the fundation of such definition, as the > > > > > > machine type can decides many of the rest compat properties. And that's > > > > > > the whole point of the compat properties too (to make sure the guest ABI is > > > > > > stable). 
> > > > > > > > > > > > If kernel breaks it easily, all compat property things that we maintain can > > > > > > already stop making sense in general, because it didn't define the whole > > > > > > guest ABI.. > > > > > > > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > > > > > somehting. And maybe we don't yet need the "-platform" layer if we can > > > > > > keep up with this rule? > > > > > > > > > > We've failed at this for years wrt enabling use of new defaults that have > > > > > a platform depedancy, so historical practice isn't a good reference. > > > > > > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of > > > > > the machine type, and of those, libvirt likely only exposes a few 10's > > > > > of tunables. The vast majority are low level details that no mgmt app > > > > > wants to know about, they just want to accept QEMU's new defaults, > > > > > while preserving machine ABI. This is a good thing. No one wants the > > > > > burden of wiring up every single tunable into libvirt and mgmt apps. > > > > > > > > > > This is what the "-platform" concept would be intended to preserve. It > > > > > would allow a way to enable groups of settings that have a platform level > > > > > dependancy, without ever having to teach either libvirt or the mgmt apps > > > > > about the individual tunables. > > > > > > > > Do you think we can achieve similar goal by simply turning the feature to > > > > ON only after a few QEMU releases? I also mentioned that idea below. > > > > > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > > > > > > > So far it really sounds like the right thing to do to me to fix all similar > > > > issues, even without introducing anything new we need to maintain. > > > > > > Turning a feature with a platform dependency to "on" implies that > > > the machine type will cease to work out of the box for platforms > > > which lack the feature. 
IMHO that's not acceptable behaviour for > > > any of our supported platforms. > > > > Right, that's why I was thinking whether we should just always be on the > > safe side, even if I just replied in the other email to Akihiko, that we do > > have the option to make this more aggresive by turning those to ON after > > even 1-2 years or even less.. and we have control of how aggressive this > > can be. > > > > > > > > IOW, "after a few QEMU releases" implies a delay of as much as > > > 5 years, while we wait for platforms which don't support the > > > feature to drop out of our supported targets list. I don't > > > think that'll satisfy the desire to get the new feature > > > available to users as soon as practical for their particular > > > platform. > > > > The feature is always available since the 1st day, right? We just need the > > user to opt-in, by specifying ON in the cmdline. > > > > That'll be my take on this that QEMU's default VM setup should be always > > bootable, migratable, and so on. Then user opt-in on stuff like this one, > > where there's implication on the ABIs. The "user" can also include > > Libvirt. I mean when something is really important, Libvirt should, IMHO, > > opt-in by treating that similarly like many cpu properties, and by probing > > the host first. > > > > IIUC there aren't a lot of things like that (part of guest ABI & host > > kernel / HW dependent), am I right? Otherwise I would expect more failures > > like this one, but it isn't as much as that yet. IIUC it means the efforts > > to make Libvirt get involved should be hopefully under control too. The > > worst case is Libvirt doesn't auto-on it, but again the user should always > > have the option to turn it on when it's necessary. > > If it is left to libvirt, then it would very likely end up being a user > opt-in, not auto-enabled. Not sure whether there's other opinions, but that's definitely fine by me. 
I think it even makes more sense, as even if Libvirt probed the host and auto-on the feature, it also means Libvirt made a decision for the user, saying "having a better performance" is more important than "being able to migrate this VM everywhere". I don't see a way that can make such fair decision besides requesting the user to opt-in always for those, then the user is fully aware what is enabled, with the hope that when a migration fails later with "target host doesn't support feature XXX" the user is crystal clear on what happened. Thanks,
On Tue, Jul 30, 2024 at 03:11:03PM -0400, Peter Xu wrote: > On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote: > > On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote: > > > On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: > > > > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: > > > > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > > > > > > > > > > > > We've got two mutually conflicting goals with the machine type > > > > > > definitions. > > > > > > > > > > > > Primarily we use them to ensure stable ABI, but an important > > > > > > secondary goal is to enable new tunables to have new defaults > > > > > > set, without having to update every mgmt app. The latter > > > > > > works very well when the defaults have no dependancy on the > > > > > > platform kernel/OS, but breaks migration when they do have a > > > > > > platform dependancy. > > > > > > > > > > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > > > > > > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > > > > > by default, while advanced users can opt-in on new features. We can't > > > > > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > > > > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > > > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > > > > > this host does not support running such VM specified, due to XXX > > > > > > > feature missing". > > > > > > > > > > > > > > That's the only way an user could understand what happened, and IMHO that's > > > > > > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > > > > > > while in which the machine type is the fundation of such definition, as the > > > > > > > machine type can decides many of the rest compat properties. 
And that's > > > > > > > the whole point of the compat properties too (to make sure the guest ABI is > > > > > > > stable). > > > > > > > > > > > > > > If kernel breaks it easily, all compat property things that we maintain can > > > > > > > already stop making sense in general, because it didn't define the whole > > > > > > > guest ABI.. > > > > > > > > > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > > > > > > somehting. And maybe we don't yet need the "-platform" layer if we can > > > > > > > keep up with this rule? > > > > > > > > > > > > We've failed at this for years wrt enabling use of new defaults that have > > > > > > a platform depedancy, so historical practice isn't a good reference. > > > > > > > > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of > > > > > > the machine type, and of those, libvirt likely only exposes a few 10's > > > > > > of tunables. The vast majority are low level details that no mgmt app > > > > > > wants to know about, they just want to accept QEMU's new defaults, > > > > > > while preserving machine ABI. This is a good thing. No one wants the > > > > > > burden of wiring up every single tunable into libvirt and mgmt apps. > > > > > > > > > > > > This is what the "-platform" concept would be intended to preserve. It > > > > > > would allow a way to enable groups of settings that have a platform level > > > > > > dependancy, without ever having to teach either libvirt or the mgmt apps > > > > > > about the individual tunables. > > > > > > > > > > Do you think we can achieve similar goal by simply turning the feature to > > > > > ON only after a few QEMU releases? I also mentioned that idea below. > > > > > > > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > > > > > > > > > So far it really sounds like the right thing to do to me to fix all similar > > > > > issues, even without introducing anything new we need to maintain. 
> > > > > > > > Turning a feature with a platform dependency to "on" implies that > > > > the machine type will cease to work out of the box for platforms > > > > which lack the feature. IMHO that's not acceptable behaviour for > > > > any of our supported platforms. > > > > > > Right, that's why I was thinking whether we should just always be on the > > > safe side, even if I just replied in the other email to Akihiko, that we do > > > have the option to make this more aggresive by turning those to ON after > > > even 1-2 years or even less.. and we have control of how aggressive this > > > can be. > > > > > > > > > > > IOW, "after a few QEMU releases" implies a delay of as much as > > > > 5 years, while we wait for platforms which don't support the > > > > feature to drop out of our supported targets list. I don't > > > > think that'll satisfy the desire to get the new feature > > > > available to users as soon as practical for their particular > > > > platform. > > > > > > The feature is always available since the 1st day, right? We just need the > > > user to opt-in, by specifying ON in the cmdline. > > > > > > That'll be my take on this that QEMU's default VM setup should be always > > > bootable, migratable, and so on. Then user opt-in on stuff like this one, > > > where there's implication on the ABIs. The "user" can also include > > > Libvirt. I mean when something is really important, Libvirt should, IMHO, > > > opt-in by treating that similarly like many cpu properties, and by probing > > > the host first. > > > > > > IIUC there aren't a lot of things like that (part of guest ABI & host > > > kernel / HW dependent), am I right? Otherwise I would expect more failures > > > like this one, but it isn't as much as that yet. IIUC it means the efforts > > > to make Libvirt get involved should be hopefully under control too. 
The > > > worst case is Libvirt doesn't auto-on it, but again the user should always > > > have the option to turn it on when it's necessary. > > > > If it is left to libvirt, then it would very likely end up being a user > > opt-in, not auto-enabled. > > Not sure whether there's other opinions, but that's definitely fine by me. > > I think it even makes more sense, as even if Libvirt probed the host and > auto-on the feature, it also means Libvirt made a decision for the user, > saying "having a better performance" is more important than "being able to > migrate this VM everywhere". > > I don't see a way that can make such fair decision besides requesting the > user to opt-in always for those, then the user is fully aware what is > enabled, with the hope that when a migration fails later with "target host > doesn't support feature XXX" the user is crystal clear on what happened. > > Thanks, > > -- > Peter Xu This is not what we did historically. Why should we start now?
On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> This is not what we did historically. Why should we start now?
It's a matter of whether we still want migration to randomly fail, like
what this patch does.
Or any better suggestions? I'm definitely open to that.
Thanks,
On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote: > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote: > > This is not what we did historically. Why should we start now? > > It's a matter of whether we still want migration to randomly fail, like > what this patch does. > > Or any better suggestions? I'm definitely open to that. > > Thanks, > > -- > Peter Xu Randomly is an overstatement. You need to switch between kernels where this feature differs. We did it with a ton of features in the past, donnu why we single out USO now. Basically downstreams just don't separately add kernel features vs qemu features. There's little reason for them to do so.
On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote: > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote: > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote: > > > This is not what we did historically. Why should we start now? > > > > It's a matter of whether we still want migration to randomly fail, like > > what this patch does. > > > > Or any better suggestions? I'm definitely open to that. > > > > Thanks, > > > > -- > > Peter Xu > > Randomly is an overstatement. You need to switch between kernels > where this feature differs. We did it with a ton of features > in the past, donnu why we single out USO now. Right, my previous comment should apply to all such features, so it's not solely about USO*. For old features that Jason mentioned that can also be auto-OFF, my wild guess was that most of them should be supported in most of the kernels that people are using, so they're fine. Otherwise I don't see what stops it from happening in other features too. And that's also why I am thinking maybe we don't need to fix old features, but for this USO* one - I'm not sure yet; it could hit already. For the future, I definitely want to avoid such an issue; that's also one major reason / goal I wanted to discuss this thoroughly this time.. > > Basically downstreams just don't separately add kernel features vs > qemu features. There's little reason for them to do so. But we hit this bug in downstream tests.. IIUC it means this is not the case? To be explicit, for RHEL9 some version we added USO* features for QEMU, but not yet for the kernel TAP drivers. AFAIU that's the context where we trapped this failure where we have some system supporting the QEMU feature but not supporting the kernel ones. While some newer systems will support both. Then we hit this when migrating back to the RHEL9 system. Thanks,
On Wed, Jul 31, 2024 at 5:33 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote: > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote: > > > This is not what we did historically. Why should we start now? > > > > It's a matter of whether we still want migration to randomly fail, like > > what this patch does. > > > > Or any better suggestions? I'm definitely open to that. > > > > Thanks, > > > > -- > > Peter Xu > > Randomly is an overstatement. You need to switch between kernels > where this feature differs. We did it with a ton of features > in the past, donnu why we single out USO now. I guess the reason is that for offload features other than USO they are landed in early kernels so we don't have a chance to test/meet this case. But this is not the case for USO. Thanks > > Basically downstreams just don't separately add kernel features vs > qemu features. There's little reason for them to do so. > > > > -- > MST >
On Tue, Jul 30, 2024 at 6:23 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > > On 2024/07/30 12:45, Jason Wang wrote: > > On Tue, Jul 30, 2024 at 11:29 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >> > >> On 2024/07/30 12:17, Jason Wang wrote: > >>> On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>> > >>>> On 2024/07/30 12:03, Jason Wang wrote: > >>>>> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>> > >>>>>> On 2024/07/30 11:04, Jason Wang wrote: > >>>>>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>> > >>>>>>>> On 2024/07/29 23:29, Peter Xu wrote: > >>>>>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote: > >>>>>>>>>> On 2024/07/29 12:50, Jason Wang wrote: > >>>>>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote: > >>>>>>>>>>>> > >>>>>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote: > >>>>>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote: > >>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote: > >>>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote: > >>>>>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote: > >>>>>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote: > >>>>>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote: > >>>>>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability > >>>>>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the > >>>>>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier. 
> >>>>>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> > >>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com> > >>>>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has > >>>>>>>>>>>>>>>>>>> USO supported, to another host that doesn't.. > >>>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is, > >>>>>>>>>>>>>>>>>> don't do this. > >>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>> May I ask for my understanding: > >>>>>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU > >>>>>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate > >>>>>>>>>>>>>>>>> between machines that have different host kernel features? > >>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>>> Long term, we need to start exposing management APIs > >>>>>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features. > >>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> When QEMU automatically toggles machine type featuers based on host > >>>>>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical, > >>>>>>>>>>>>>>>> as we cannot assume that the libvirt people are using knows about > >>>>>>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about > >>>>>>>>>>>>>>>> it, people can easily be using a previous libvirt release. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> QEMU itself needs to make the machine types do that they are there > >>>>>>>>>>>>>>>> todo, which is to define a stable machine ABI. > >>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode > >>>>>>>>>>>>>>>> sets of features which are tied to specific platform generations. 
> >>>>>>>>>>>>>>>> As long as we don't have that we'll keep having these broken > >>>>>>>>>>>>>>>> migration problems from machine types dynamically changing instead > >>>>>>>>>>>>>>>> of providing a stable guest ABI. > >>>>>>>>>>>>>>> > >>>>>>>>>>>>>>> Any more elaboration on this idea? Would it be easily feasible in > >>>>>>>>>>>>>>> implementation? > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> In terms of launching QEMU I'd imagine: > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args... > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features > >>>>>>>>>>>>>> would have their defaults set based on the requested -platform. The > >>>>>>>>>>>>>> -machine will be fully invariant wrt the host kernel. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> You would have -platform help to list available platforms, and > >>>>>>>>>>>>>> corresponding QMP "query-platforms" command to list what platforms > >>>>>>>>>>>>>> are supported on a given host OS. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Downstream distros can provide their own platforms definitions > >>>>>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set > >>>>>>>>>>>>>> diverges from upstream due to backports. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU > >>>>>>>>>>>>>> setting whose default is derived from the kernel. Individual > >>>>>>>>>>>>>> defaults are opaque and controlled by the requested platform. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can > >>>>>>>>>>>>>> use query-platforms to validate two hosts are compatible. > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> Omitting -platform should pick the very latest platform that is > >>>>>>>>>>>>>> compatible with the current host (not necessarily the latest > >>>>>>>>>>>>>> platform built-in to QEMU). 
> >>>>>>>>>>>>> > >>>>>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know > >>>>>>>>>>>>> whether it's a must. > >>>>>>>>>>>>> > >>>>>>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"? I > >>>>>>>>>>>>> thought it was mostly the case already, except some extremely rare > >>>>>>>>>>>>> outliers. > >>>>>>>>>>>>> > >>>>>>>>>>>>> When we have one host that boots up a VM using: > >>>>>>>>>>>>> > >>>>>>>>>>>>> $QEMU1 $cmdline > >>>>>>>>>>>>> > >>>>>>>>>>>>> Then another host boots up: > >>>>>>>>>>>>> > >>>>>>>>>>>>> $QEMU2 $cmdline -incoming XXX > >>>>>>>>>>>>> > >>>>>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM > >>>>>>>>>>>>> can boot up all fine without errors on both sides. > >>>>>>>>>>>>> > >>>>>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not > >>>>>>>>>>>>> Linux? I think either QEMU1 / QEMU2 has the option to fail. But if it > >>>>>>>>>>>>> didn't, I thought the ABI should be guaranteed. > >>>>>>>>>>>>> > >>>>>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device > >>>>>>>>>>>>> properties should be following this rule. The issue here is, we have the > >>>>>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got > >>>>>>>>>>>>> break. > >>>>>>>>>>>>> > >>>>>>>>>>>>> That's also why I was suggesting if the property contributes to the guest > >>>>>>>>>>>>> ABI, then AFAIU QEMU needs to: > >>>>>>>>>>>>> > >>>>>>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... > >>>>>>>>>>>>> > >>>>>>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot > >>>>>>>>>>>>> by default, while advanced users can opt-in on new features. We can't > >>>>>>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, > >>>>>>>>>>>> > >>>>>>>>>>>> It may not be necessary the case that old features are supported by > >>>>>>>>>>>> every systems. 
In an extreme case, a user may migrate a VM from Linux to > >>>>>>>>>>>> Windows, which probably doesn't support any offloading at all. A more > >>>>>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a > >>>>>>>>>>>> privilege so we cannot assume it is always available even on the latest > >>>>>>>>>>>> version of Linux. > >>>>>>>>>>> > >>>>>>>>>>> I don't get why eBPF matters here. It is something that is not noticed > >>>>>>>>>>> by the guest and we have a fallback anyhow. > >>>>>>>> > >>>>>>>> It is noticeable for the guest, and the fallback is not effective with > >>>>>>>> vhost. > >>>>>>> > >>>>>>> It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost. > >>>>>> > >>>>>> We can certainly fallback to in-QEMU RSS by disabling vhost, but I would > >>>>>> not say lack of such fallback is a bug. > >>>>> > >>>>> Such fallback is by design since the introduction of vhost. > >>>>> > >>>>>> We don't provide in-QEMU > >>>>>> fallback for other offloads. > >>>>> > >>>>> Yes but what I want to say is that eBPF RSS is different from those > >>>>> segmentation offloads. And technically, Qemu can do fallback for > >>>>> offloads (as RSC did). > >>>> > >>>> Well, I couldn't find any code disabling vhost for the in-QEMU RSC > >>>> implementation. > >>> > >>> It should be a bug (and I remember we disabled vhost when the patches > >>> were merged). Have you tested it in a guest to see if it can see RSC > >>> when vhost is enabled? > >>> > >>> I suspect we need to add the RSC bit into current kernel_feature_bits: > >>> > >>> /* Features supported by host kernel. 
*/ > >>> static const int kernel_feature_bits[] = { > >>> VIRTIO_F_NOTIFY_ON_EMPTY, > >>> VIRTIO_RING_F_INDIRECT_DESC, > >>> VIRTIO_RING_F_EVENT_IDX, > >>> VIRTIO_NET_F_MRG_RXBUF, > >>> VIRTIO_F_VERSION_1, > >>> VIRTIO_NET_F_MTU, > >>> VIRTIO_F_IOMMU_PLATFORM, > >>> VIRTIO_F_RING_PACKED, > >>> VIRTIO_F_RING_RESET, > >>> VIRTIO_NET_F_HASH_REPORT, > >>> VHOST_INVALID_FEATURE_BIT > >>> }; > >>> > >>> As RSC won't be provided by TUN/TAP anyhow. > >> > >> Adding the RSC bit does not let QEMU disable vhost for RSC, but instead > >> it implicitly disables RSC in my understanding. > > > > Yes. > > > >> It is still better than > >> advertising the availability of that feature while it is missing. > > > > Down the road, we probably need to change the behaviour of disabling vhost-net. > > > >> > >>> > >>>> > >>>> Looking at the code, I also found the case of vhost-vdpa. vhost can be > >>>> simply disabled if it is backed by tuntap, but it is not the case for vDPA. > >>> > >>> True, technically, vDPA can fallback to SVQ, but it's another topic. > >> > >> My point of this discussion is that we cannot enable features just > >> because they are sufficiently old or because the user claims QEMU runs > >> on Linux sufficiently new. eBPF requires privilege, and vDPA requires > >> hardware feature. A fallback is not a silver bullet either, and there > >> are situations that providing a fallback is not a trivial task. > > > > To make sure we are on the same page. I just want to point out that > > eBPF RSS is not a good example in this context. > > > > It works only for tuntap, so we should stick to the behaviour of > > trying to fallback to userspace if we can as we've already had a > > userspace fallback. This is the fundamental difference with other > > features (like segmentation offload) or backend (vDPA) that doesn't > > have an existing fallback. > > Some (probably not all) offloads are implemented in hw/net/net_tx_pkt.c. 
> They are not wired up to behave as a fallback when tuntap's vhost is > enabled as the in-QEMU RSS is not. In either case, we need to pay some > effort to wiring things. > > I'm not sure it is worthwhile. I think there is a high chance that > selectively disabling vhost and keeping RSS enabled with fallback will > result in worse performance than keeping vhost enabled and disabling > RSS. Such a fallback can still function as an emergency escape hatch, > but it is also incomplete as we don't have fallbacks for other features. The reason is that we depend on ioctl to configure and negotiate with tuntap correctly. > I would rather make any features missing in the vhost backend fail to > keep things consistent. You might be right but it's too late to do that. Thanks > > Regards, > Akihiko Odaki >
On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote: > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote: > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote: > > > This is not what we did historically. Why should we start now? > > > > It's a matter of whether we still want migration to randomly fail, like > > what this patch does. > > > > Or any better suggestions? I'm definitely open to that. > > > > Thanks, > > > > -- > > Peter Xu > > Randomly is an overstatement. You need to switch between kernels > where this feature differs. We did it with a ton of features > in the past, dunno why we single out USO now. This has been a problem with a ton of features in the past. We've ignored the problem, but that doesn't make it the right solution. With regards, Daniel
On Wed, Jul 31, 2024 at 08:04:24AM +0100, Daniel P. Berrangé wrote: > On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote: > > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote: > > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote: > > > > This is not what we did historically. Why should we start now? > > > > > > It's a matter of whether we still want migration to randomly fail, like > > > what this patch does. > > > > > > Or any better suggestions? I'm definitely open to that. > > > > > > Thanks, > > > > > > -- > > > Peter Xu > > > > Randomly is an overstatement. You need to switch between kernels > > where this feature differs. We did it with a ton of features > > in the past, donnu why we single out USO now. > > This has been a problem with a ton of features in the past. We've > ignored the problem, but that doesn't make it the right solution > > With regards, > Daniel Pushing it to domain xml does not really help, migration will still fail unexpectedly (after wasting a ton of resources copying memory, and getting a downtime bump, I might add). The right solution is to have a tool that can query backends, and that given the results from all of the cluster, generate a set of parameters that will ensure migration works. Kind of like qemu-img, but for migration.
On Wed, Jul 31, 2024 at 03:41:00AM -0400, Michael S. Tsirkin wrote: > On Wed, Jul 31, 2024 at 08:04:24AM +0100, Daniel P. Berrangé wrote: > > On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote: > > > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote: > > > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote: > > > > > This is not what we did historically. Why should we start now? > > > > > > > > It's a matter of whether we still want migration to randomly fail, like > > > > what this patch does. > > > > > > > > Or any better suggestions? I'm definitely open to that. > > > > > > > > Thanks, > > > > > > > > -- > > > > Peter Xu > > > > > > Randomly is an overstatement. You need to switch between kernels > > > where this feature differs. We did it with a ton of features > > > in the past, donnu why we single out USO now. > > > > This has been a problem with a ton of features in the past. We've > > ignored the problem, but that doesn't make it the right solution > > > > With regards, > > Daniel > > Pushing it to domain xml does not really help, > migration will still fail unexpectedly (after wasting > a ton of resources copying memory, and getting > a downtime bump, I might add). Could you elaborate why it would fail if with what I proposed? Note that if this is a generic comment about "any migration can fail if we found a device mismatch", we have plan to fix that to some degree. It's just that we don't have enough people working on these topics yet. See: https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake It includes: "Check device tree on both sides, etc., to make sure the migration is applicable. E.g., we should fail early and clearly on any device mismatch." However I don't think it'll cover all checks, e.g. I _think_ even if we verify VMSDs then post_load() hooks can still fail, and there can be some corner cases to think. 
And of course, this may not even apply to virtio since virtio manages migration itself, without providing a top-level vmsd. > > The right solution is to have a tool that can query > backends, and that given the results from all of the cluster, > generate a set of parameters that will ensure migration works. > Kind of like qemu-img, but for migration. This is adding extra work, IMHO. If we stick with "qemu cmdline as guest ABI" concept, I think we're all fine, as that work is done by QEMU booting up first on both sides, including dest. Basically Libvirt already plays this role of the new tool without any new code to be added at all: what captured on the boot failure log will be the output of that tool if we write it. Thanks,
On Wed, Jul 31, 2024 at 8:58 PM Peter Xu <peterx@redhat.com> wrote: > > On Wed, Jul 31, 2024 at 03:41:00AM -0400, Michael S. Tsirkin wrote: > > On Wed, Jul 31, 2024 at 08:04:24AM +0100, Daniel P. Berrangé wrote: > > > On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote: > > > > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote: > > > > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote: > > > > > > This is not what we did historically. Why should we start now? > > > > > > > > > > It's a matter of whether we still want migration to randomly fail, like > > > > > what this patch does. > > > > > > > > > > Or any better suggestions? I'm definitely open to that. > > > > > > > > > > Thanks, > > > > > > > > > > -- > > > > > Peter Xu > > > > > > > > Randomly is an overstatement. You need to switch between kernels > > > > where this feature differs. We did it with a ton of features > > > > in the past, donnu why we single out USO now. > > > > > > This has been a problem with a ton of features in the past. We've > > > ignored the problem, but that doesn't make it the right solution > > > > > > With regards, > > > Daniel > > > > Pushing it to domain xml does not really help, > > migration will still fail unexpectedly (after wasting > > a ton of resources copying memory, and getting > > a downtime bump, I might add). > > Could you elaborate why it would fail if with what I proposed? > > Note that if this is a generic comment about "any migration can fail if we > found a device mismatch", we have plan to fix that to some degree. It's > just that we don't have enough people working on these topics yet. See: > > https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake > > It includes: > > "Check device tree on both sides, etc., to make sure the migration is > applicable. E.g., we should fail early and clearly on any device > mismatch." > > However I don't think it'll cover all checks, e.g. 
I _think_ even if we > verify VMSDs then post_load() hooks can still fail, and there can be some > corner cases to think. And of course, this may not even apply to virtio > since virtio manages migration itself, without providing a top-level vmsd. > > > > > The right solution is to have a tool that can query > > backends, and that given the results from all of the cluster, > > generate a set of parameters that will ensure migration works. This seems to be very hard for vhost-users. > > Kind of like qemu-img, but for migration. > > This is adding extra work, IMHO. > > If we stick with "qemu cmdline as guest ABI" concept, I think we're all > fine, as that work is done by QEMU booting up first on both sides, > including dest. Probably, letting Qemu to probe is much easier than rewriting the probe in the upper layer. > Basically Libvirt already plays this role of the new tool > without any new code to be added at all: what captured on the boot failure > log will be the output of that tool if we write it. > > Thanks, Thanks > > -- > Peter Xu >
On 2024/07/31 4:11, Peter Xu wrote: > On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote: >> On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote: >>> On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: >>>> On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: >>>>> On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: >>>>>> >>>>>> We've got two mutually conflicting goals with the machine type >>>>>> definitions. >>>>>> >>>>>> Primarily we use them to ensure stable ABI, but an important >>>>>> secondary goal is to enable new tunables to have new defaults >>>>>> set, without having to update every mgmt app. The latter >>>>>> works very well when the defaults have no dependancy on the >>>>>> platform kernel/OS, but breaks migration when they do have a >>>>>> platform dependancy. >>>>>> >>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... >>>>>>> >>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot >>>>>>> by default, while advanced users can opt-in on new features. We can't >>>>>>> make this ON by default otherwise some VMs can already fail to boot, >>>>>>> >>>>>>> - If the host doesn't support the feature while the cmdline enabled it, >>>>>>> it needs to fail QEMU boot rather than flipping, so that it says "hey, >>>>>>> this host does not support running such VM specified, due to XXX >>>>>>> feature missing". >>>>>>> >>>>>>> That's the only way an user could understand what happened, and IMHO that's >>>>>>> a clean way that we stick with QEMU cmdline on defining the guest ABI, >>>>>>> while in which the machine type is the fundation of such definition, as the >>>>>>> machine type can decides many of the rest compat properties. And that's >>>>>>> the whole point of the compat properties too (to make sure the guest ABI is >>>>>>> stable). 
>>>>>>> >>>>>>> If kernel breaks it easily, all compat property things that we maintain can >>>>>>> already stop making sense in general, because it didn't define the whole >>>>>>> guest ABI.. >>>>>>> >>>>>>> So AFAIU that's really what we used for years, I hope I didn't overlook >>>>>>> somehting. And maybe we don't yet need the "-platform" layer if we can >>>>>>> keep up with this rule? >>>>>> >>>>>> We've failed at this for years wrt enabling use of new defaults that have >>>>>> a platform depedancy, so historical practice isn't a good reference. >>>>>> >>>>>> There are 100's (possibly 1000's) of tunables set implicitly as part of >>>>>> the machine type, and of those, libvirt likely only exposes a few 10's >>>>>> of tunables. The vast majority are low level details that no mgmt app >>>>>> wants to know about, they just want to accept QEMU's new defaults, >>>>>> while preserving machine ABI. This is a good thing. No one wants the >>>>>> burden of wiring up every single tunable into libvirt and mgmt apps. >>>>>> >>>>>> This is what the "-platform" concept would be intended to preserve. It >>>>>> would allow a way to enable groups of settings that have a platform level >>>>>> dependancy, without ever having to teach either libvirt or the mgmt apps >>>>>> about the individual tunables. >>>>> >>>>> Do you think we can achieve similar goal by simply turning the feature to >>>>> ON only after a few QEMU releases? I also mentioned that idea below. >>>>> >>>>> https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n >>>>> >>>>> So far it really sounds like the right thing to do to me to fix all similar >>>>> issues, even without introducing anything new we need to maintain. >>>> >>>> Turning a feature with a platform dependency to "on" implies that >>>> the machine type will cease to work out of the box for platforms >>>> which lack the feature. IMHO that's not acceptable behaviour for >>>> any of our supported platforms. 
>>> >>> Right, that's why I was thinking whether we should just always be on the >>> safe side, even if I just replied in the other email to Akihiko, that we do >>> have the option to make this more aggresive by turning those to ON after >>> even 1-2 years or even less.. and we have control of how aggressive this >>> can be. >>> >>>> >>>> IOW, "after a few QEMU releases" implies a delay of as much as >>>> 5 years, while we wait for platforms which don't support the >>>> feature to drop out of our supported targets list. I don't >>>> think that'll satisfy the desire to get the new feature >>>> available to users as soon as practical for their particular >>>> platform. >>> >>> The feature is always available since the 1st day, right? We just need the >>> user to opt-in, by specifying ON in the cmdline. >>> >>> That'll be my take on this that QEMU's default VM setup should be always >>> bootable, migratable, and so on. Then user opt-in on stuff like this one, >>> where there's implication on the ABIs. The "user" can also include >>> Libvirt. I mean when something is really important, Libvirt should, IMHO, >>> opt-in by treating that similarly like many cpu properties, and by probing >>> the host first. >>> >>> IIUC there aren't a lot of things like that (part of guest ABI & host >>> kernel / HW dependent), am I right? Otherwise I would expect more failures >>> like this one, but it isn't as much as that yet. IIUC it means the efforts >>> to make Libvirt get involved should be hopefully under control too. The >>> worst case is Libvirt doesn't auto-on it, but again the user should always >>> have the option to turn it on when it's necessary. >> >> If it is left to libvirt, then it would very likely end up being a user >> opt-in, not auto-enabled. > > Not sure whether there's other opinions, but that's definitely fine by me. 
> > I think it even makes more sense, as even if Libvirt probed the host and > auto-on the feature, it also means Libvirt made a decision for the user, > saying "having a better performance" is more important than "being able to > migrate this VM everywhere". > > I don't see a way that can make such fair decision besides requesting the > user to opt-in always for those, then the user is fully aware what is > enabled, with the hope that when a migration fails later with "target host > doesn't support feature XXX" the user is crystal clear on what happened. I think it is better to distinguish saying "having a better performance is more important than being able to migrate this VM everywhere" from explicitly selecting all available offload features; the latter is lot of chores. More importantly, users may not just know these features may prevent migration; they may just look like performance features nice to have at first glance. I don' think what a user would want are not individual performance knobs, but a user is more likely to need to express the platforms they would want to migrate VMs on. There are several possible scenarios in particular: 1) Migration everywhere 2) Migration on specific machines 3) Migration on some known platforms 4) No migration (migration on nowhere) If a user chooses 1-3), QEMU may reject platform-dependent features even if the user requests one; in this way, we don't need the users to make things crystal clear, but we can expect QEMU does so. If a user chooses 2-4), QEMU may enable all offloading features available on the specified platforms. Again, the user will no longer have to know each individual performance features. QEMU may also reject migration to platforms not specified to prevent misconfiguration. The -platform proposal earlier corresponds to 3). However it has a downside that QEMU needs to know about platforms, which may not be trivial. In that case, we can support 1), 2), and 4). Regards, Akihiko Odaki
On 2024/08/01 11:28, Jason Wang wrote: > On Wed, Jul 31, 2024 at 8:58 PM Peter Xu <peterx@redhat.com> wrote: >> >> On Wed, Jul 31, 2024 at 03:41:00AM -0400, Michael S. Tsirkin wrote: >>> On Wed, Jul 31, 2024 at 08:04:24AM +0100, Daniel P. Berrangé wrote: >>>> On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote: >>>>> On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote: >>>>>> On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote: >>>>>>> This is not what we did historically. Why should we start now? >>>>>> >>>>>> It's a matter of whether we still want migration to randomly fail, like >>>>>> what this patch does. >>>>>> >>>>>> Or any better suggestions? I'm definitely open to that. >>>>>> >>>>>> Thanks, >>>>>> >>>>>> -- >>>>>> Peter Xu >>>>> >>>>> Randomly is an overstatement. You need to switch between kernels >>>>> where this feature differs. We did it with a ton of features >>>>> in the past, donnu why we single out USO now. >>>> >>>> This has been a problem with a ton of features in the past. We've >>>> ignored the problem, but that doesn't make it the right solution >>>> >>>> With regards, >>>> Daniel >>> >>> Pushing it to domain xml does not really help, >>> migration will still fail unexpectedly (after wasting >>> a ton of resources copying memory, and getting >>> a downtime bump, I might add). >> >> Could you elaborate why it would fail if with what I proposed? >> >> Note that if this is a generic comment about "any migration can fail if we >> found a device mismatch", we have plan to fix that to some degree. It's >> just that we don't have enough people working on these topics yet. See: >> >> https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake >> >> It includes: >> >> "Check device tree on both sides, etc., to make sure the migration is >> applicable. E.g., we should fail early and clearly on any device >> mismatch." >> >> However I don't think it'll cover all checks, e.g. 
I _think_ even if we >> verify VMSDs then post_load() hooks can still fail, and there can be some >> corner cases to think. And of course, this may not even apply to virtio >> since virtio manages migration itself, without providing a top-level vmsd. >> >>> >>> The right solution is to have a tool that can query >>> backends, and that given the results from all of the cluster, >>> generate a set of parameters that will ensure migration works. > > This seems to be very hard for vhost-users. Can you elaborate more? I was thinking something like follows: 1. Prepare a QEMU command line. 2. Run the command line appended with -dump-platform on all hosts, which dumps platform features automatically enabled. For virtio devices, we can dump "host_features" variable. 3. Run the command line appended with -merge-platform with all dumps. For most virtio devices, this would be AND operations on "host_features" variable. 4. Run the command line appended with -use-platform with the merged dump. This will run VMs with features available on all hosts. I may have missed something but this seems good enough for me. Of course this requires changes throughout the stack (QEMU common and device-specific code, libvirt, and even higher layers like OpenStack). Regards, Akihiko Odaki
On Wed, Jul 31, 2024 at 08:57:52AM -0400, Peter Xu wrote: > > > > The right solution is to have a tool that can query > > backends, and that given the results from all of the cluster, > > generate a set of parameters that will ensure migration works. > > Kind of like qemu-img, but for migration. > > This is adding extra work, IMHO. Agreed that it's a lot of work. > If we stick with "qemu cmdline as guest ABI" concept, I think we're all > fine, as that work is done by QEMU booting up first on both sides, > including dest. Basically Libvirt already plays this role of the new tool > without any new code to be added at all: what captured on the boot failure > log will be the output of that tool if we write it. However, this means we can never add new features without also teaching libvirt to enable them. How about we add some kind of command on source qemu to return description of all working features? Then when qemu is started on destination, this data can be passed in, and validated. Hmm?
On Tue, Jul 30, 2024 at 02:02:27AM +0900, Akihiko Odaki wrote: > I think it is simpler to analyze the platform dependency and dump it for the > management layer. For example, libvirt can request QEMU to analyze the > platform dependency when it creates a new domain. QEMU will then figure out > that the host kernel is capable of USO and bake it as a platform dependency. > > Regards, > Akihiko Odaki I think for starters, we can just have dump-features as a QEMU command. Pass it on command line on destination. Achieves the same thing as making userspace pass each flag manually, but without the pain of teaching management to enable each new feature.
On Fri, Jul 26, 2024 at 07:39:46PM +0200, Thomas Huth wrote: > Anyway, while we're discussing solutions: We are in softfreeze already. > Should we disable the UFO bits in the new 9.1 machine type for the time > being to avoid that more people are running into this problem? At the moment I'm looking at solutions for 9.2 I don't see how we can do much for 9.1. I mean we can move it back to behave like 8.1 (IIRC), but that is not much.
On Wed, Jul 31, 2024 at 08:57:52AM -0400, Peter Xu wrote:
> Could you elaborate why it would fail if with what I proposed?
First, I think I was wrong — I misunderstood what you said.
To summarise, you said:
- any new feature depending on another package is off by default
- starting qemu on destination with feature enabled will fail
thus migration is not started
My comment is that this "started" is from qemu point of view,
from user's POV starting qemu on destination is just the 1st
step of migration.
However I agree, this is better since we do not waste bandwidth,
and I was wrong to say we do.
My other comment is that adding features becomes even more work
than it is now.
So I suggest a single command that dumps some description of host
features, to be passed to qemu on destination. qemu then fails to
start on destination if some of these do not work.
The advantage is that this also helps things like -cpu host,
and a bunch of other things like vdpa where we like to pass through
config from kernel.
The disadvantage is that it does not exactly *fix* migration,
it just does not let you start it.
On Thu, Aug 01, 2024 at 02:05:54PM +0900, Akihiko Odaki wrote: > On 2024/07/31 4:11, Peter Xu wrote: > > On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote: > > > On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote: > > > > On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: > > > > > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: > > > > > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > > > > > > > > > > > > > > We've got two mutually conflicting goals with the machine type > > > > > > > definitions. > > > > > > > > > > > > > > Primarily we use them to ensure stable ABI, but an important > > > > > > > secondary goal is to enable new tunables to have new defaults > > > > > > > set, without having to update every mgmt app. The latter > > > > > > > works very well when the defaults have no dependancy on the > > > > > > > platform kernel/OS, but breaks migration when they do have a > > > > > > > platform dependancy. > > > > > > > > > > > > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > > > > > > > > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > > > > > > by default, while advanced users can opt-in on new features. We can't > > > > > > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > > > > > > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > > > > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > > > > > > this host does not support running such VM specified, due to XXX > > > > > > > > feature missing". 
> > > > > > > > > > > > > > > > That's the only way an user could understand what happened, and IMHO that's > > > > > > > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > > > > > > > while in which the machine type is the fundation of such definition, as the > > > > > > > > machine type can decides many of the rest compat properties. And that's > > > > > > > > the whole point of the compat properties too (to make sure the guest ABI is > > > > > > > > stable). > > > > > > > > > > > > > > > > If kernel breaks it easily, all compat property things that we maintain can > > > > > > > > already stop making sense in general, because it didn't define the whole > > > > > > > > guest ABI.. > > > > > > > > > > > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > > > > > > > somehting. And maybe we don't yet need the "-platform" layer if we can > > > > > > > > keep up with this rule? > > > > > > > > > > > > > > We've failed at this for years wrt enabling use of new defaults that have > > > > > > > a platform depedancy, so historical practice isn't a good reference. > > > > > > > > > > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of > > > > > > > the machine type, and of those, libvirt likely only exposes a few 10's > > > > > > > of tunables. The vast majority are low level details that no mgmt app > > > > > > > wants to know about, they just want to accept QEMU's new defaults, > > > > > > > while preserving machine ABI. This is a good thing. No one wants the > > > > > > > burden of wiring up every single tunable into libvirt and mgmt apps. > > > > > > > > > > > > > > This is what the "-platform" concept would be intended to preserve. It > > > > > > > would allow a way to enable groups of settings that have a platform level > > > > > > > dependancy, without ever having to teach either libvirt or the mgmt apps > > > > > > > about the individual tunables. 
> > > > > > > > > > > > Do you think we can achieve similar goal by simply turning the feature to > > > > > > ON only after a few QEMU releases? I also mentioned that idea below. > > > > > > > > > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > > > > > > > > > > > So far it really sounds like the right thing to do to me to fix all similar > > > > > > issues, even without introducing anything new we need to maintain. > > > > > > > > > > Turning a feature with a platform dependency to "on" implies that > > > > > the machine type will cease to work out of the box for platforms > > > > > which lack the feature. IMHO that's not acceptable behaviour for > > > > > any of our supported platforms. > > > > > > > > Right, that's why I was thinking whether we should just always be on the > > > > safe side, even if I just replied in the other email to Akihiko, that we do > > > > have the option to make this more aggresive by turning those to ON after > > > > even 1-2 years or even less.. and we have control of how aggressive this > > > > can be. > > > > > > > > > > > > > > IOW, "after a few QEMU releases" implies a delay of as much as > > > > > 5 years, while we wait for platforms which don't support the > > > > > feature to drop out of our supported targets list. I don't > > > > > think that'll satisfy the desire to get the new feature > > > > > available to users as soon as practical for their particular > > > > > platform. > > > > > > > > The feature is always available since the 1st day, right? We just need the > > > > user to opt-in, by specifying ON in the cmdline. > > > > > > > > That'll be my take on this that QEMU's default VM setup should be always > > > > bootable, migratable, and so on. Then user opt-in on stuff like this one, > > > > where there's implication on the ABIs. The "user" can also include > > > > Libvirt. 
I mean when something is really important, Libvirt should, IMHO, > > > > opt-in by treating that similarly like many cpu properties, and by probing > > > > the host first. > > > > > > > > IIUC there aren't a lot of things like that (part of guest ABI & host > > > > kernel / HW dependent), am I right? Otherwise I would expect more failures > > > > like this one, but it isn't as much as that yet. IIUC it means the efforts > > > > to make Libvirt get involved should be hopefully under control too. The > > > > worst case is Libvirt doesn't auto-on it, but again the user should always > > > > have the option to turn it on when it's necessary. > > > > > > If it is left to libvirt, then it would very likely end up being a user > > > opt-in, not auto-enabled. > > > > Not sure whether there's other opinions, but that's definitely fine by me. > > > > I think it even makes more sense, as even if Libvirt probed the host and > > auto-on the feature, it also means Libvirt made a decision for the user, > > saying "having a better performance" is more important than "being able to > > migrate this VM everywhere". > > > > I don't see a way that can make such fair decision besides requesting the > > user to opt-in always for those, then the user is fully aware what is > > enabled, with the hope that when a migration fails later with "target host > > doesn't support feature XXX" the user is crystal clear on what happened. > > I think it is better to distinguish saying "having a better performance is > more important than being able to migrate this VM everywhere" from > explicitly selecting all available offload features; the latter is lot of > chores. More importantly, users may not just know these features may prevent > migration; they may just look like performance features nice to have at > first glance. > > I don' think what a user would want are not individual performance knobs, > but a user is more likely to need to express the platforms they would want > to migrate VMs on. 
There are several possible scenarios in particular: > 1) Migration everywhere > 2) Migration on specific machines > 3) Migration on some known platforms > 4) No migration (migration on nowhere) > > If a user chooses 1-3), QEMU may reject platform-dependent features even if > the user requests one; in this way, we don't need the users to make things > crystal clear, but we can expect QEMU does so. > > If a user chooses 2-4), QEMU may enable all offloading features available on > the specified platforms. Again, the user will no longer have to know each > individual performance features. QEMU may also reject migration to platforms > not specified to prevent misconfiguration. > > The -platform proposal earlier corresponds to 3). However it has a downside > that QEMU needs to know about platforms, which may not be trivial. In that > case, we can support 1), 2), and 4). I'm not sure if I read it right. Perhaps you meant something more generic than -platform but similar? For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either "perf" or "compat", while by default to "compat"? If so, I think I get the idea, but it'll be challenging in at least these aspects: - We already have (at least.. that I'm aware of) three layers of specifying a property for a device, they are: (1) default value (2) compat property (from machine type definitions) (3) qemu cmdline (specify one property explicitly) So far, there's an order we apply these (1-3), while (3) has the top priority to overwrite (1-2), and (2) to overwrite (1). The new "-profile", if I read it right, introduces (4), and it's already unclear to me how that interacts with (3) when -profile says "turn FEAT1 on" while cmdline says otherwise. It can make things very complicated, IMHO. - This still will break the "QEMU cmdline defines the guest ABI", e.g., consider this USO* thing that we boot an old machine type on a new system that has QEMU+Linux USO* all enabled. We specify "-profile perf" there. 
Then when we try to migrate to another older QEMU it'll still fail the migration instead of any way telling us "migration is not compatible". So even if it helps the user turning on knobs, it doesn't sound like to fix the problem we're working on? For whatever profile setup, it sounds like more applicable to a Libvirt option that user can choose. That may avoid above two concerns I have, especially the latter. But I really don't know much on Libvirt, and this can be some extra effort too on top of either QEMU / Libvirt, and we may need to justify worthwhile. Are we really concerned about users not enabling features that much? I thought users always can manually change the XML and add whatever they need, and device properties do not look too special here to me. I mean, we have a bunch of "features" exported as new "-devices" and users must opt-in for them by changing the XML. We never worried on user not using them. I doubt whether we worried too much on user not opt-in, especially for performance features, because they're, IMHO, targeting advanced users. Thanks,
On Thu, Aug 01, 2024 at 11:13:37AM -0400, Peter Xu wrote: > Do we really concern about users not enabling features that much? I > thought users always can manually change the XML and add whatever they > need, and device properties do not like too special here to me. I mean, we > have bunch of "features" exported as new "-devices" and users must opt-in > for them by changing the XML. We never worried on user not using them. I > doubt whether we worried too much on user not opt-in, especially for > performance features, because they're, IMHO, targeting advanced users. What I do not like, is pushing the knowledge of what good defaults are to libvirt.
On Thu, Aug 01, 2024 at 11:15:47AM -0400, Michael S. Tsirkin wrote: > On Thu, Aug 01, 2024 at 11:13:37AM -0400, Peter Xu wrote: > > Do we really concern about users not enabling features that much? I > > thought users always can manually change the XML and add whatever they > > need, and device properties do not like too special here to me. I mean, we > > have bunch of "features" exported as new "-devices" and users must opt-in > > for them by changing the XML. We never worried on user not using them. I > > doubt whether we worried too much on user not opt-in, especially for > > performance features, because they're, IMHO, targeting advanced users. > > What I do not like, is pushing the knowledge of what good defaults > are to libvirt. With the -platform concept, libvirt wouldn't need to know anything about the settings being used, nor the defaults. Consider how it works for machine types. Libvirt queries the machine types, and gets a list back, and QEMU expresses a default. eg saying that 'pc-i440fx-9.1.0' is aliased to 'pc'. So libvirt can expand 'pc' to a particular version that QEMU has chosen as the default. Conceptually I could see something similar working for the -platform concept. Libvirt would ask QEMU for all the "platform" variants that are available on the current running kernel. QEMU can reply with the list, and indicate which of those is the "newest" in some manner. Absent any preference from the mgmt app, libvirt would use whichever one QEMU indicates was the newest. This optimizes for best featureset on the current kernel, as the cost of possibly reduced migration compatibility. When a mgmt app is caring about migration, they would explicitly tell libvirt which platform version to use, just as they would explicitly ask for a specific machine type version, rather than accepting the 'pc' default. With regards, Daniel
On Thu, Aug 01, 2024 at 01:51:00AM -0400, Michael S. Tsirkin wrote: > So I suggest a single command that dumps some description of host > features, to be passed to qemu on destination. qemu then fails to > start on destination if some of these do not work. > The advantage is that this also helps things like -cpu host, > and a bunch of other things like vdpa where we like to pass through > config from kernel. Something like that could work indeed. I'm thinking whether it shouldn't require a new QMP command; that sounds more work, and we also need Libvirt cooperation so QEMU migration will still fail. I wonder whether we can integrate it into migration handshake that I referred previously in our TODO item here: https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake The "device handshake" part (in the previous plan) was that we at least can verify VMSD fields matching on both sides - VMSDs are defined in both QEMU binaries, so migration can do that already without device opt-in. What we can do on top of that (or even, before that) is, maybe, allow device to opt-in in such handshake besides a "VMSD check", so that there can be something hooked to the VMSDs or similar structures, so the src QEMU's device A can talk to dest QEMU's device A making sure everything is good for migration. Virtio can handshake on host feature lists and we can fail the whole handshake there. Same to -cpu, or vDPA, as long as opt-in hook is provided on both sides. The good side of it is it sounds natural to integrate this with a handshake (when we can have it). Meanwhile, we restrict everything within the device scope, so neither QEMU nor migration needs to know what happened exactly. Would that sound workable and better? Besides, I also wonder what's our next step for this issue. Should we fix this on the safe side, and only set ON by default when we have the handshake ready (in whatever form, either above, or a new QMP command)? 
It's just that the handshake in general may still need some thoughts, so I'm not sure how fast that can be ready, considering our very limited bandwidth so far. Maybe that can be done separately, but I remember Dan used to suggest we do handshake right in one shot, and I tend to agree that'll be nicer. Thanks,
On Thu, Aug 01, 2024 at 11:36:19AM -0400, Peter Xu wrote: > On Thu, Aug 01, 2024 at 01:51:00AM -0400, Michael S. Tsirkin wrote: > > So I suggest a single command that dumps some description of host > > features, to be passed to qemu on destination. qemu then fails to > > start on destination if some of these do not work. > > The advantage is that this also helps things like -cpu host, > > and a bunch of other things like vdpa where we like to pass through > > config from kernel. > > Something like that could work indeed. I'm thinking whether it shouldn't > require a new QMP command; that sounds more work, and we also needs Libvirt > cooperations so QEMU migration will still fail. I wonder whether we can > integrate it into migration handshake that I referred previously in our > TODO item here: > > https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake This is different. You can start migration on destination without touching source at all. This allows e.g. finding a destination that can support your source.
On Thu, Aug 01, 2024 at 01:51:00AM -0400, Michael S. Tsirkin wrote: > On Wed, Jul 31, 2024 at 08:57:52AM -0400, Peter Xu wrote: > > Could you elaborate why it would fail if with what I proposed? > > First I think I was wrong I misunderstood what you said. > To summarise, you said: > > - any new feature depending on another package is off by default > - starting qemu on destination with feature enabled will fail > thus migration is not started > > > My comment is that this "started" is from qemu point of view, > from user's POV starting qemu on destination is just the 1st > step of migration. > > > However I agree, this is better since we do not waste bandwidth, > and I was wrong to say we do. > > My other comment is that adding features becomes even more work > than it is now. > > So I suggest a single command that dumps some description of host > features, to be passed to qemu on destination. qemu then fails to > start on destination if some of these do not work. > The advantage is that this also helps things like -cpu host, > and a bunch of other things like vdpa where we like to pass through > config from kernel. > > The disadvantage is that it does not exactly *fix* migration, > it just does not let you start it. This feels like only half a solution, and not the most helpful half. It prevents you accidentally migrating to a host that lacks some features, but doesn't help with starting a VM that has migrate compatible features in the first place. From a user POV, the latter is what's most important. Checking for incompatible features is just a safety net that you should never need to hit, if QEMU was configured suitably to start with. So to ensure a QEMU is started with migration compatible features will still require teaching libvirt about every single feature that has a host kernel dependancy, so libvirt (or the app using libvirt) knows to turn this off. 
This is a lot more work for both libvirt & the mgmt app, than having QEMU provide the generic "platforms" concept which is extensible without needing further work outside QEMU. With regards, Daniel
On Thu, Aug 01, 2024 at 04:45:17PM +0100, Daniel P. Berrangé wrote: > So to ensure a QEMU is started with migration compatible features > will still require teaching libvirt about every single feature > that has a host kernel dependancy, so libvirt (or the app using > libvirt) knows to turn this off. This is alot more work for both > libvirt & the mgmt app, than having QEMU provide the generic > "platforms" concept which is extensible without needing further > work outside QEMU. I am just not sure it can all amount to selecting from a list. For example, some resource can be limited on one host or another. Thus we get a number. Or there could be a set of N flags, with 2^N combinations.
On Thu, Aug 01, 2024 at 11:50:40AM -0400, Michael S. Tsirkin wrote: > On Thu, Aug 01, 2024 at 04:45:17PM +0100, Daniel P. Berrangé wrote: > > So to ensure a QEMU is started with migration compatible features > > will still require teaching libvirt about every single feature > > that has a host kernel dependancy, so libvirt (or the app using > > libvirt) knows to turn this off. This is alot more work for both > > libvirt & the mgmt app, than having QEMU provide the generic > > "platforms" concept which is extensible without needing further > > work outside QEMU. > > I am just not sure it can all amount to selecting from a list. > For example, some resource can be limited on one host or another. > Thus we get a number. Or there could be a set of N flags, with 2^N > combinations. We don't have to support all possible combinations IMHO. If a user really does require precise control over every combination of some settings, then exposing those tunables in libvirt is inevitable. The platform concept only has to be able to express a "good enough" subset of combinations, such that it is unlikely users will need to have fine tuning for most of the tunables. We might end up exposing a handful of tunables in libvirt anyway, but as long as we get the common case satisfied, we'll eliminate most of the ongoing burden. With regards, Daniel
On 2024/08/02 0:13, Peter Xu wrote: > On Thu, Aug 01, 2024 at 02:05:54PM +0900, Akihiko Odaki wrote: >> On 2024/07/31 4:11, Peter Xu wrote: >>> On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote: >>>> On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote: >>>>> On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: >>>>>> On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: >>>>>>> On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: >>>>>>>> >>>>>>>> We've got two mutually conflicting goals with the machine type >>>>>>>> definitions. >>>>>>>> >>>>>>>> Primarily we use them to ensure stable ABI, but an important >>>>>>>> secondary goal is to enable new tunables to have new defaults >>>>>>>> set, without having to update every mgmt app. The latter >>>>>>>> works very well when the defaults have no dependancy on the >>>>>>>> platform kernel/OS, but breaks migration when they do have a >>>>>>>> platform dependancy. >>>>>>>> >>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... >>>>>>>>> >>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot >>>>>>>>> by default, while advanced users can opt-in on new features. We can't >>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, >>>>>>>>> >>>>>>>>> - If the host doesn't support the feature while the cmdline enabled it, >>>>>>>>> it needs to fail QEMU boot rather than flipping, so that it says "hey, >>>>>>>>> this host does not support running such VM specified, due to XXX >>>>>>>>> feature missing". >>>>>>>>> >>>>>>>>> That's the only way an user could understand what happened, and IMHO that's >>>>>>>>> a clean way that we stick with QEMU cmdline on defining the guest ABI, >>>>>>>>> while in which the machine type is the fundation of such definition, as the >>>>>>>>> machine type can decides many of the rest compat properties. 
And that's >>>>>>>>> the whole point of the compat properties too (to make sure the guest ABI is >>>>>>>>> stable). >>>>>>>>> >>>>>>>>> If kernel breaks it easily, all compat property things that we maintain can >>>>>>>>> already stop making sense in general, because it didn't define the whole >>>>>>>>> guest ABI.. >>>>>>>>> >>>>>>>>> So AFAIU that's really what we used for years, I hope I didn't overlook >>>>>>>>> somehting. And maybe we don't yet need the "-platform" layer if we can >>>>>>>>> keep up with this rule? >>>>>>>> >>>>>>>> We've failed at this for years wrt enabling use of new defaults that have >>>>>>>> a platform depedancy, so historical practice isn't a good reference. >>>>>>>> >>>>>>>> There are 100's (possibly 1000's) of tunables set implicitly as part of >>>>>>>> the machine type, and of those, libvirt likely only exposes a few 10's >>>>>>>> of tunables. The vast majority are low level details that no mgmt app >>>>>>>> wants to know about, they just want to accept QEMU's new defaults, >>>>>>>> while preserving machine ABI. This is a good thing. No one wants the >>>>>>>> burden of wiring up every single tunable into libvirt and mgmt apps. >>>>>>>> >>>>>>>> This is what the "-platform" concept would be intended to preserve. It >>>>>>>> would allow a way to enable groups of settings that have a platform level >>>>>>>> dependancy, without ever having to teach either libvirt or the mgmt apps >>>>>>>> about the individual tunables. >>>>>>> >>>>>>> Do you think we can achieve similar goal by simply turning the feature to >>>>>>> ON only after a few QEMU releases? I also mentioned that idea below. >>>>>>> >>>>>>> https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n >>>>>>> >>>>>>> So far it really sounds like the right thing to do to me to fix all similar >>>>>>> issues, even without introducing anything new we need to maintain. 
>>>>>> >>>>>> Turning a feature with a platform dependency to "on" implies that >>>>>> the machine type will cease to work out of the box for platforms >>>>>> which lack the feature. IMHO that's not acceptable behaviour for >>>>>> any of our supported platforms. >>>>> >>>>> Right, that's why I was thinking whether we should just always be on the >>>>> safe side, even if I just replied in the other email to Akihiko, that we do >>>>> have the option to make this more aggresive by turning those to ON after >>>>> even 1-2 years or even less.. and we have control of how aggressive this >>>>> can be. >>>>> >>>>>> >>>>>> IOW, "after a few QEMU releases" implies a delay of as much as >>>>>> 5 years, while we wait for platforms which don't support the >>>>>> feature to drop out of our supported targets list. I don't >>>>>> think that'll satisfy the desire to get the new feature >>>>>> available to users as soon as practical for their particular >>>>>> platform. >>>>> >>>>> The feature is always available since the 1st day, right? We just need the >>>>> user to opt-in, by specifying ON in the cmdline. >>>>> >>>>> That'll be my take on this that QEMU's default VM setup should be always >>>>> bootable, migratable, and so on. Then user opt-in on stuff like this one, >>>>> where there's implication on the ABIs. The "user" can also include >>>>> Libvirt. I mean when something is really important, Libvirt should, IMHO, >>>>> opt-in by treating that similarly like many cpu properties, and by probing >>>>> the host first. >>>>> >>>>> IIUC there aren't a lot of things like that (part of guest ABI & host >>>>> kernel / HW dependent), am I right? Otherwise I would expect more failures >>>>> like this one, but it isn't as much as that yet. IIUC it means the efforts >>>>> to make Libvirt get involved should be hopefully under control too. The >>>>> worst case is Libvirt doesn't auto-on it, but again the user should always >>>>> have the option to turn it on when it's necessary. 
>>>> >>>> If it is left to libvirt, then it would very likely end up being a user >>>> opt-in, not auto-enabled. >>> >>> Not sure whether there's other opinions, but that's definitely fine by me. >>> >>> I think it even makes more sense, as even if Libvirt probed the host and >>> auto-on the feature, it also means Libvirt made a decision for the user, >>> saying "having a better performance" is more important than "being able to >>> migrate this VM everywhere". >>> >>> I don't see a way that can make such fair decision besides requesting the >>> user to opt-in always for those, then the user is fully aware what is >>> enabled, with the hope that when a migration fails later with "target host >>> doesn't support feature XXX" the user is crystal clear on what happened. >> >> I think it is better to distinguish saying "having a better performance is >> more important than being able to migrate this VM everywhere" from >> explicitly selecting all available offload features; the latter is lot of >> chores. More importantly, users may not just know these features may prevent >> migration; they may just look like performance features nice to have at >> first glance. >> >> I don' think what a user would want are not individual performance knobs, >> but a user is more likely to need to express the platforms they would want >> to migrate VMs on. There are several possible scenarios in particular: >> 1) Migration everywhere >> 2) Migration on specific machines >> 3) Migration on some known platforms >> 4) No migration (migration on nowhere) >> >> If a user chooses 1-3), QEMU may reject platform-dependent features even if >> the user requests one; in this way, we don't need the users to make things >> crystal clear, but we can expect QEMU does so. >> >> If a user chooses 2-4), QEMU may enable all offloading features available on >> the specified platforms. Again, the user will no longer have to know each >> individual performance features. 
QEMU may also reject migration to platforms >> not specified to prevent misconfiguration. >> >> The -platform proposal earlier corresponds to 3). However it has a downside >> that QEMU needs to know about platforms, which may not be trivial. In that >> case, we can support 1), 2), and 4). > > I'm not sure if I read it right. Perhaps you meant something more generic > than -platform but similar? > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either > "perf" or "compat", while by default to "compat"? "perf" would cover 4) and "compat" will cover 1). However neither of them will cover 2) because an enum is not enough to know about all hosts. I presented a design that will cover 2) in: https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com I also want to point out that "perf" should be rather named like "nomigrate". In general, a program should expose a functional requirement on the interface. It can then do its best to achieve high performance under that requirement. > > If so, I think I get the idea, but it'll be challenging in at least these > aspects: > > - We already have (at least.. that I'm aware of) three layers of > specifying a property for a device, they are: > > (1) default value > (2) compat property (from machine type definitions) > (3) qemu cmdline (specify one property explicitly) > > So far, there's an order we apply these (1-3), while (3) has the top > priority to overwrite (1-2), and (2) to overwrite (1). > > The new "-profile", if I read it right, introduce (4), and it's already > unclear to me how that interacts with (3) when -profile says "turn > FEAT1 on" while cmdline says otherwise. > > It can make things very compilcated, IMHO. > > - This still will break the "QEMU cmdline defines the guest ABI", e.g., > consider this USO* thing that we boot an old machine type on a new > system that has QEMU+Linux USO* all enabled. We specify "-profile > perf" there. 
Then when we try to migrate to another older QEMU it'll > still fail the migration instead of any way telling us "migration is > not compatible". So even if it helps the user turning on knobs, it > doesn't sound like to fix the problem we're working on? When it is named nomigrate, it is obvious that migration does not work. > > For whatever profile setup, it sounds like more applicable to a Libvirt > option that user can choose. That may avoid above two concerns I have, > especially the latter. But I really don't know much on Libvirt, and this > can be some extra effort too on top of either QEMU / Libvirt, and we may > need to justify worthwhile. > > Do we really concern about users not enabling features that much? I > thought users always can manually change the XML and add whatever they > need, and device properties do not like too special here to me. I mean, we > have bunch of "features" exported as new "-devices" and users must opt-in > for them by changing the XML. We never worried on user not using them. I > doubt whether we worried too much on user not opt-in, especially for > performance features, because they're, IMHO, targeting advanced users. It is not about whether the user is knowledgeable or not, but it is about what the user wants. Migration is mandatory for a user who runs multi-tenant platforms, but it doesn't really matter for desktop users. Which are more knowledgeable? Personally, I want to have higher expectation for users running multi-tenant platforms, but it all depends. You asked for the next step in another email. My suggestion is to satisfy 1) first because it is the easiest and safest. In particular, I suggest disabling all platform-dependent features by default to satisfy 1). Combined with an existing option, -only-migratable, users will get the maximum assurance of migratability. 4) is the second easiest to implement, but the design of 4) will depend on whether we will satisfy 2) or 3). 
In the email I cited earlier, I suggested an option -use-platform to specify the expectation on the platform. If it is ever to be implemented, that option can take a special value, "host" to tell QEMU that it can use any features it finds on the current host. Regards, Akihiko Odaki
On Fri, Aug 02, 2024 at 01:30:51PM +0900, Akihiko Odaki wrote: > 4) is the second easiest to implement, but the design of 4) will depend on > whether we will satisfy 2) or 3). In the email I cited earlier, I suggested > an option -use-platform to specify the expectation on the platform. If it is > ever to be implemented, that option can take a special value, "host" to tell > QEMU that it can use any features it finds on the current host. In practice, lots of people would benefit from the ability to migrate using host features (checking that hosts are compatible, as they often are). If we are going to go to great lengths adding new interfaces, I think that would be a really useful thing to address.
On Fri, Aug 02, 2024 at 01:30:51PM +0900, Akihiko Odaki wrote: > On 2024/08/02 0:13, Peter Xu wrote: > > On Thu, Aug 01, 2024 at 02:05:54PM +0900, Akihiko Odaki wrote: > > > On 2024/07/31 4:11, Peter Xu wrote: > > > > On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote: > > > > > On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote: > > > > > > On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: > > > > > > > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: > > > > > > > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: > > > > > > > > > > > > > > > > > > We've got two mutually conflicting goals with the machine type > > > > > > > > > definitions. > > > > > > > > > > > > > > > > > > Primarily we use them to ensure stable ABI, but an important > > > > > > > > > secondary goal is to enable new tunables to have new defaults > > > > > > > > > set, without having to update every mgmt app. The latter > > > > > > > > > works very well when the defaults have no dependancy on the > > > > > > > > > platform kernel/OS, but breaks migration when they do have a > > > > > > > > > platform dependancy. > > > > > > > > > > > > > > > > > > > - Firstly, never quietly flipping any bit that affects the ABI... > > > > > > > > > > > > > > > > > > > > - Have a default value of off, then QEMU will always allow the VM to boot > > > > > > > > > > by default, while advanced users can opt-in on new features. We can't > > > > > > > > > > make this ON by default otherwise some VMs can already fail to boot, > > > > > > > > > > > > > > > > > > > > - If the host doesn't support the feature while the cmdline enabled it, > > > > > > > > > > it needs to fail QEMU boot rather than flipping, so that it says "hey, > > > > > > > > > > this host does not support running such VM specified, due to XXX > > > > > > > > > > feature missing". 
> > > > > > > > > > > > > > > > > > > > That's the only way an user could understand what happened, and IMHO that's > > > > > > > > > > a clean way that we stick with QEMU cmdline on defining the guest ABI, > > > > > > > > > > while in which the machine type is the fundation of such definition, as the > > > > > > > > > > machine type can decides many of the rest compat properties. And that's > > > > > > > > > > the whole point of the compat properties too (to make sure the guest ABI is > > > > > > > > > > stable). > > > > > > > > > > > > > > > > > > > > If kernel breaks it easily, all compat property things that we maintain can > > > > > > > > > > already stop making sense in general, because it didn't define the whole > > > > > > > > > > guest ABI.. > > > > > > > > > > > > > > > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook > > > > > > > > > > somehting. And maybe we don't yet need the "-platform" layer if we can > > > > > > > > > > keep up with this rule? > > > > > > > > > > > > > > > > > > We've failed at this for years wrt enabling use of new defaults that have > > > > > > > > > a platform depedancy, so historical practice isn't a good reference. > > > > > > > > > > > > > > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of > > > > > > > > > the machine type, and of those, libvirt likely only exposes a few 10's > > > > > > > > > of tunables. The vast majority are low level details that no mgmt app > > > > > > > > > wants to know about, they just want to accept QEMU's new defaults, > > > > > > > > > while preserving machine ABI. This is a good thing. No one wants the > > > > > > > > > burden of wiring up every single tunable into libvirt and mgmt apps. > > > > > > > > > > > > > > > > > > This is what the "-platform" concept would be intended to preserve. 
It > > > > > > > > > would allow a way to enable groups of settings that have a platform level > > > > > > > > > dependancy, without ever having to teach either libvirt or the mgmt apps > > > > > > > > > about the individual tunables. > > > > > > > > > > > > > > > > Do you think we can achieve similar goal by simply turning the feature to > > > > > > > > ON only after a few QEMU releases? I also mentioned that idea below. > > > > > > > > > > > > > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n > > > > > > > > > > > > > > > > So far it really sounds like the right thing to do to me to fix all similar > > > > > > > > issues, even without introducing anything new we need to maintain. > > > > > > > > > > > > > > Turning a feature with a platform dependency to "on" implies that > > > > > > > the machine type will cease to work out of the box for platforms > > > > > > > which lack the feature. IMHO that's not acceptable behaviour for > > > > > > > any of our supported platforms. > > > > > > > > > > > > Right, that's why I was thinking whether we should just always be on the > > > > > > safe side, even if I just replied in the other email to Akihiko, that we do > > > > > > have the option to make this more aggresive by turning those to ON after > > > > > > even 1-2 years or even less.. and we have control of how aggressive this > > > > > > can be. > > > > > > > > > > > > > > > > > > > > IOW, "after a few QEMU releases" implies a delay of as much as > > > > > > > 5 years, while we wait for platforms which don't support the > > > > > > > feature to drop out of our supported targets list. I don't > > > > > > > think that'll satisfy the desire to get the new feature > > > > > > > available to users as soon as practical for their particular > > > > > > > platform. > > > > > > > > > > > > The feature is always available since the 1st day, right? We just need the > > > > > > user to opt-in, by specifying ON in the cmdline. 
> > > > > > > > > > > > That'll be my take on this that QEMU's default VM setup should be always > > > > > > bootable, migratable, and so on. Then user opt-in on stuff like this one, > > > > > > where there's implication on the ABIs. The "user" can also include > > > > > > Libvirt. I mean when something is really important, Libvirt should, IMHO, > > > > > > opt-in by treating that similarly like many cpu properties, and by probing > > > > > > the host first. > > > > > > > > > > > > IIUC there aren't a lot of things like that (part of guest ABI & host > > > > > > kernel / HW dependent), am I right? Otherwise I would expect more failures > > > > > > like this one, but it isn't as much as that yet. IIUC it means the efforts > > > > > > to make Libvirt get involved should be hopefully under control too. The > > > > > > worst case is Libvirt doesn't auto-on it, but again the user should always > > > > > > have the option to turn it on when it's necessary. > > > > > > > > > > If it is left to libvirt, then it would very likely end up being a user > > > > > opt-in, not auto-enabled. > > > > > > > > Not sure whether there's other opinions, but that's definitely fine by me. > > > > > > > > I think it even makes more sense, as even if Libvirt probed the host and > > > > auto-on the feature, it also means Libvirt made a decision for the user, > > > > saying "having a better performance" is more important than "being able to > > > > migrate this VM everywhere". > > > > > > > > I don't see a way that can make such fair decision besides requesting the > > > > user to opt-in always for those, then the user is fully aware what is > > > > enabled, with the hope that when a migration fails later with "target host > > > > doesn't support feature XXX" the user is crystal clear on what happened. 
> > > > > > I think it is better to distinguish saying "having a better performance is > > > more important than being able to migrate this VM everywhere" from > > > explicitly selecting all available offload features; the latter is lot of > > > chores. More importantly, users may not just know these features may prevent > > > migration; they may just look like performance features nice to have at > > > first glance. > > > > > > I don' think what a user would want are not individual performance knobs, > > > but a user is more likely to need to express the platforms they would want > > > to migrate VMs on. There are several possible scenarios in particular: > > > 1) Migration everywhere > > > 2) Migration on specific machines > > > 3) Migration on some known platforms > > > 4) No migration (migration on nowhere) > > > > > > If a user chooses 1-3), QEMU may reject platform-dependent features even if > > > the user requests one; in this way, we don't need the users to make things > > > crystal clear, but we can expect QEMU does so. > > > > > > If a user chooses 2-4), QEMU may enable all offloading features available on > > > the specified platforms. Again, the user will no longer have to know each > > > individual performance features. QEMU may also reject migration to platforms > > > not specified to prevent misconfiguration. > > > > > > The -platform proposal earlier corresponds to 3). However it has a downside > > > that QEMU needs to know about platforms, which may not be trivial. In that > > > case, we can support 1), 2), and 4). > > > > I'm not sure if I read it right. Perhaps you meant something more generic > > than -platform but similar? > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either > > "perf" or "compat", while by default to "compat"? > > "perf" would cover 4) and "compat" will cover 1). However neither of them > will cover 2) because an enum is not enough to know about all hosts. 
I > presented a design that will cover 2) in: > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com "-merge-platform" shouldn't be a QEMU parameter, but should be something separate. Yes, as you mentioned there it could be a lot of work, we may need to think it through and collect enough input before working on something like that. > > I also want to point out that "perf" should be rather named like > "nomigrate". In general, a program should expose a functional requirement on > the interface. It can then do its best to achieve high performance under > that requirement. "nomigrate" may be inaccurate or even wrong in this case, because as long as the features are supported on both hosts it's migratable. > > > > > If so, I think I get the idea, but it'll be challenging in at least these > > aspects: > > > > - We already have (at least.. that I'm aware of) three layers of > > specifying a property for a device, they are: > > > > (1) default value > > (2) compat property (from machine type definitions) > > (3) qemu cmdline (specify one property explicitly) > > > > So far, there's an order we apply these (1-3), while (3) has the top > > priority to overwrite (1-2), and (2) to overwrite (1). > > > > The new "-profile", if I read it right, introduce (4), and it's already > > unclear to me how that interacts with (3) when -profile says "turn > > FEAT1 on" while cmdline says otherwise. > > > > It can make things very compilcated, IMHO. > > > > - This still will break the "QEMU cmdline defines the guest ABI", e.g., > > consider this USO* thing that we boot an old machine type on a new > > system that has QEMU+Linux USO* all enabled. We specify "-profile > > perf" there. Then when we try to migrate to another older QEMU it'll > > still fail the migration instead of any way telling us "migration is > > not compatible". So even if it helps the user turning on knobs, it > > doesn't sound like to fix the problem we're working on? 
> > When it is named nomigrate, it is obvious that migration does not work. I am not sure whether you meant to e.g. add a migration blocker in this case even if migration can be supported between some hosts. But if so it may not be wise either to block users trying to migrate where it is still applicable. So maybe I misunderstood. > > > > > For whatever profile setup, it sounds like more applicable to a Libvirt > > option that user can choose. That may avoid above two concerns I have, > > especially the latter. But I really don't know much on Libvirt, and this > > can be some extra effort too on top of either QEMU / Libvirt, and we may > > need to justify worthwhile. > > > > Do we really concern about users not enabling features that much? I > > thought users always can manually change the XML and add whatever they > > need, and device properties do not like too special here to me. I mean, we > > have bunch of "features" exported as new "-devices" and users must opt-in > > for them by changing the XML. We never worried on user not using them. I > > doubt whether we worried too much on user not opt-in, especially for > > performance features, because they're, IMHO, targeting advanced users. > > It is not about whether the user is knowledgeable or not, but it is about > what the user wants. Migration is mandatory for a user who runs multi-tenant > platforms, but it doesn't really matter for desktop users. Which are more > knowledgeable? Personally, I want to have higher expectation for users > running multi-tenant platforms, but it all depends. > > You asked for the next step in another email. My suggestion is to satisfy 1) > first because it is the easiest and safest. In particular, I suggest > disabling all platform-dependent features by default to satisfy 1). Combined > with an existing option, -only-migratable, users will get the maximum > assurance of migratability. 
> > 4) is the second easiest to implement, but the design of 4) will depend on > whether we will satisfy 2) or 3). In the email I cited earlier, I suggested > an option -use-platform to specify the expectation on the platform. If it is > ever to be implemented, that option can take a special value, "host" to tell > QEMU that it can use any features it finds on the current host. So I don't know what's the best plan yet for the longer term, but I'm completely with you on starting with 1). Thanks,
On 2024/08/03 0:05, Peter Xu wrote: > On Fri, Aug 02, 2024 at 01:30:51PM +0900, Akihiko Odaki wrote: >> On 2024/08/02 0:13, Peter Xu wrote: >>> On Thu, Aug 01, 2024 at 02:05:54PM +0900, Akihiko Odaki wrote: >>>> On 2024/07/31 4:11, Peter Xu wrote: >>>>> On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote: >>>>>> On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote: >>>>>>> On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote: >>>>>>>> On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote: >>>>>>>>> On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote: >>>>>>>>>> >>>>>>>>>> We've got two mutually conflicting goals with the machine type >>>>>>>>>> definitions. >>>>>>>>>> >>>>>>>>>> Primarily we use them to ensure stable ABI, but an important >>>>>>>>>> secondary goal is to enable new tunables to have new defaults >>>>>>>>>> set, without having to update every mgmt app. The latter >>>>>>>>>> works very well when the defaults have no dependancy on the >>>>>>>>>> platform kernel/OS, but breaks migration when they do have a >>>>>>>>>> platform dependancy. >>>>>>>>>> >>>>>>>>>>> - Firstly, never quietly flipping any bit that affects the ABI... >>>>>>>>>>> >>>>>>>>>>> - Have a default value of off, then QEMU will always allow the VM to boot >>>>>>>>>>> by default, while advanced users can opt-in on new features. We can't >>>>>>>>>>> make this ON by default otherwise some VMs can already fail to boot, >>>>>>>>>>> >>>>>>>>>>> - If the host doesn't support the feature while the cmdline enabled it, >>>>>>>>>>> it needs to fail QEMU boot rather than flipping, so that it says "hey, >>>>>>>>>>> this host does not support running such VM specified, due to XXX >>>>>>>>>>> feature missing". 
>>>>>>>>>>> >>>>>>>>>>> That's the only way an user could understand what happened, and IMHO that's >>>>>>>>>>> a clean way that we stick with QEMU cmdline on defining the guest ABI, >>>>>>>>>>> while in which the machine type is the fundation of such definition, as the >>>>>>>>>>> machine type can decides many of the rest compat properties. And that's >>>>>>>>>>> the whole point of the compat properties too (to make sure the guest ABI is >>>>>>>>>>> stable). >>>>>>>>>>> >>>>>>>>>>> If kernel breaks it easily, all compat property things that we maintain can >>>>>>>>>>> already stop making sense in general, because it didn't define the whole >>>>>>>>>>> guest ABI.. >>>>>>>>>>> >>>>>>>>>>> So AFAIU that's really what we used for years, I hope I didn't overlook >>>>>>>>>>> somehting. And maybe we don't yet need the "-platform" layer if we can >>>>>>>>>>> keep up with this rule? >>>>>>>>>> >>>>>>>>>> We've failed at this for years wrt enabling use of new defaults that have >>>>>>>>>> a platform depedancy, so historical practice isn't a good reference. >>>>>>>>>> >>>>>>>>>> There are 100's (possibly 1000's) of tunables set implicitly as part of >>>>>>>>>> the machine type, and of those, libvirt likely only exposes a few 10's >>>>>>>>>> of tunables. The vast majority are low level details that no mgmt app >>>>>>>>>> wants to know about, they just want to accept QEMU's new defaults, >>>>>>>>>> while preserving machine ABI. This is a good thing. No one wants the >>>>>>>>>> burden of wiring up every single tunable into libvirt and mgmt apps. >>>>>>>>>> >>>>>>>>>> This is what the "-platform" concept would be intended to preserve. It >>>>>>>>>> would allow a way to enable groups of settings that have a platform level >>>>>>>>>> dependancy, without ever having to teach either libvirt or the mgmt apps >>>>>>>>>> about the individual tunables. 
>>>>>>>>> >>>>>>>>> Do you think we can achieve similar goal by simply turning the feature to >>>>>>>>> ON only after a few QEMU releases? I also mentioned that idea below. >>>>>>>>> >>>>>>>>> https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n >>>>>>>>> >>>>>>>>> So far it really sounds like the right thing to do to me to fix all similar >>>>>>>>> issues, even without introducing anything new we need to maintain. >>>>>>>> >>>>>>>> Turning a feature with a platform dependency to "on" implies that >>>>>>>> the machine type will cease to work out of the box for platforms >>>>>>>> which lack the feature. IMHO that's not acceptable behaviour for >>>>>>>> any of our supported platforms. >>>>>>> >>>>>>> Right, that's why I was thinking whether we should just always be on the >>>>>>> safe side, even if I just replied in the other email to Akihiko, that we do >>>>>>> have the option to make this more aggresive by turning those to ON after >>>>>>> even 1-2 years or even less.. and we have control of how aggressive this >>>>>>> can be. >>>>>>> >>>>>>>> >>>>>>>> IOW, "after a few QEMU releases" implies a delay of as much as >>>>>>>> 5 years, while we wait for platforms which don't support the >>>>>>>> feature to drop out of our supported targets list. I don't >>>>>>>> think that'll satisfy the desire to get the new feature >>>>>>>> available to users as soon as practical for their particular >>>>>>>> platform. >>>>>>> >>>>>>> The feature is always available since the 1st day, right? We just need the >>>>>>> user to opt-in, by specifying ON in the cmdline. >>>>>>> >>>>>>> That'll be my take on this that QEMU's default VM setup should be always >>>>>>> bootable, migratable, and so on. Then user opt-in on stuff like this one, >>>>>>> where there's implication on the ABIs. The "user" can also include >>>>>>> Libvirt. 
I mean when something is really important, Libvirt should, IMHO, >>>>>>> opt-in by treating that similarly like many cpu properties, and by probing >>>>>>> the host first. >>>>>>> >>>>>>> IIUC there aren't a lot of things like that (part of guest ABI & host >>>>>>> kernel / HW dependent), am I right? Otherwise I would expect more failures >>>>>>> like this one, but it isn't as much as that yet. IIUC it means the efforts >>>>>>> to make Libvirt get involved should be hopefully under control too. The >>>>>>> worst case is Libvirt doesn't auto-on it, but again the user should always >>>>>>> have the option to turn it on when it's necessary. >>>>>> >>>>>> If it is left to libvirt, then it would very likely end up being a user >>>>>> opt-in, not auto-enabled. >>>>> >>>>> Not sure whether there's other opinions, but that's definitely fine by me. >>>>> >>>>> I think it even makes more sense, as even if Libvirt probed the host and >>>>> auto-on the feature, it also means Libvirt made a decision for the user, >>>>> saying "having a better performance" is more important than "being able to >>>>> migrate this VM everywhere". >>>>> >>>>> I don't see a way that can make such fair decision besides requesting the >>>>> user to opt-in always for those, then the user is fully aware what is >>>>> enabled, with the hope that when a migration fails later with "target host >>>>> doesn't support feature XXX" the user is crystal clear on what happened. >>>> >>>> I think it is better to distinguish saying "having a better performance is >>>> more important than being able to migrate this VM everywhere" from >>>> explicitly selecting all available offload features; the latter is lot of >>>> chores. More importantly, users may not just know these features may prevent >>>> migration; they may just look like performance features nice to have at >>>> first glance. 
>>>> >>>> I don' think what a user would want are not individual performance knobs, >>>> but a user is more likely to need to express the platforms they would want >>>> to migrate VMs on. There are several possible scenarios in particular: >>>> 1) Migration everywhere >>>> 2) Migration on specific machines >>>> 3) Migration on some known platforms >>>> 4) No migration (migration on nowhere) >>>> >>>> If a user chooses 1-3), QEMU may reject platform-dependent features even if >>>> the user requests one; in this way, we don't need the users to make things >>>> crystal clear, but we can expect QEMU does so. >>>> >>>> If a user chooses 2-4), QEMU may enable all offloading features available on >>>> the specified platforms. Again, the user will no longer have to know each >>>> individual performance features. QEMU may also reject migration to platforms >>>> not specified to prevent misconfiguration. >>>> >>>> The -platform proposal earlier corresponds to 3). However it has a downside >>>> that QEMU needs to know about platforms, which may not be trivial. In that >>>> case, we can support 1), 2), and 4). >>> >>> I'm not sure if I read it right. Perhaps you meant something more generic >>> than -platform but similar? >>> >>> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either >>> "perf" or "compat", while by default to "compat"? >> >> "perf" would cover 4) and "compat" will cover 1). However neither of them >> will cover 2) because an enum is not enough to know about all hosts. I >> presented a design that will cover 2) in: >> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com > > "-merge-platform" shouldn't be a QEMU parameter, but should be something > separate. Do you mean merging platform dumps should be done with another command? I think we will want to know the QOM tree is in use when implementing -merge-platform. 
For example, you cannot define a "platform" when e.g., you don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is connected to virtio-net devices. Of course we can include those information in dumps, but we don't do so for VMState. > > Yes, as you mentioned there it could be a lot of work, we may need to think > it through and collect enough input before working on something like that. > >> >> I also want to point out that "perf" should be rather named like >> "nomigrate". In general, a program should expose a functional requirement on >> the interface. It can then do its best to achieve high performance under >> that requirement. > > "nomigrate" may be inaccurate or even wrong in this case, because as long > as the features are supported on both hosts it's migratable. Perhaps it may be named no-cross-migrate or something. There are lots of details we need to figure out. > >> >>> >>> If so, I think I get the idea, but it'll be challenging in at least these >>> aspects: >>> >>> - We already have (at least.. that I'm aware of) three layers of >>> specifying a property for a device, they are: >>> >>> (1) default value >>> (2) compat property (from machine type definitions) >>> (3) qemu cmdline (specify one property explicitly) >>> >>> So far, there's an order we apply these (1-3), while (3) has the top >>> priority to overwrite (1-2), and (2) to overwrite (1). >>> >>> The new "-profile", if I read it right, introduce (4), and it's already >>> unclear to me how that interacts with (3) when -profile says "turn >>> FEAT1 on" while cmdline says otherwise. >>> >>> It can make things very compilcated, IMHO. >>> >>> - This still will break the "QEMU cmdline defines the guest ABI", e.g., >>> consider this USO* thing that we boot an old machine type on a new >>> system that has QEMU+Linux USO* all enabled. We specify "-profile >>> perf" there. 
Then when we try to migrate to another older QEMU it'll >>> still fail the migration instead of any way telling us "migration is >>> not compatible". So even if it helps the user turning on knobs, it >>> doesn't sound like to fix the problem we're working on? >> >> When it is named nomigrate, it is obvious that migration does not work. > > I am not sure whether you meant to e.g. add a migration blocker in this > case even if migration can be supported between some hosts. But if so it > may not be wise either to block users trying to migrate where it is still > applicable. So maybe I misunderstood. There are certainly downsides and upsides to adding a migration blocker and I don't have a strong opinion here. Regards, Akihiko Odaki
On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: > > > > I'm not sure if I read it right. Perhaps you meant something more generic > > > > than -platform but similar? > > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either > > > > "perf" or "compat", while by default to "compat"? > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them > > > will cover 2) because an enum is not enough to know about all hosts. I > > > presented a design that will cover 2) in: > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something > > separate. > > Do you mean merging platform dumps should be done with another command? I > think we will want to know the QOM tree is in use when implementing > -merge-platform. For example, you cannot define a "platform" when e.g., you > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is > connected to virtio-net devices. Of course we can include those information > in dumps, but we don't do so for VMState. What I was thinking is the generated platform dump shouldn't care about what is used as backend: it should try to probe whatever is specified in the qemu cmdline, and it's the user's job to make sure the exact same qemu cmdline is used in other hosts to dump this information. IOW, the dump will only contain the information that was based on the qemu cmdline. E.g., if it doesn't include virtio device at all, and if we only support such dump for virtio, it should dump nothing. Then the -merge-platform will expect all dumps to look the same too, merging them with AND on each field. Said that, I actually am still not clear on how / whether it should work at last. At least my previous concern (1) didn't have a good answer yet, on what we do when a profile collides with qemu cmdlines. 
So far I actually still think it more straightforward that in migration we handshake on these capabilities if possible. And that's why I was thinking (where I totally agree with you on this) whether we should settle a short term plan first to be on the safe side that we start with migration always being compatible, then we figure the other approach. That seems easier to me, and it's also a matter of whether we want to do something for 9.1, or leaving that for 9.2 for USO*. Thanks,
On Fri, Aug 02, 2024 at 12:26:22PM -0400, Peter Xu wrote: > And that's why I was thinking (where I totally agree with you on this) that > whether we should settle a short term plan first to be on the safe side > that we start with migration always being compatible, then we figure the > other approach. We have two big issues around migration compatibility we never solved: - some guest visible behaviour depends on a package outside of qemu: as that package can change, so can qemu behaviour - sometimes we change guest visible behaviour and only discover this after the release: fixing that breaks migration to one version, not fixing breaks migration to another These, to me, look similar enough that I feel we should look at them together from QAPI POV. Both issues sometimes can have work-arounds, enabling these would be nice. Also, both issues have a clean solution, which can come in two flavors: 1. basic: detecting incompatibility and not starting qemu on destination (or failing migration, possibly early, which I consider a less clean solution). 2. advanced: ability to go from a set of configurations to a flag making them compatible.
On Fri, Aug 02, 2024 at 12:40:33PM -0400, Michael S. Tsirkin wrote: > On Fri, Aug 02, 2024 at 12:26:22PM -0400, Peter Xu wrote: > > And that's why I was thinking (where I totally agree with you on this) that > > whether we should settle a short term plan first to be on the safe side > > that we start with migration always being compatible, then we figure the > > other approach. > > We have two big issues around migration compatibility we never solved: > > - some guest visible behaviour depends on a package outside of qemu: > as that package can change, so can qemu behaviour Any example, or bug link for this one? > > - sometimes we change guest visible behaviour and only > discover this after the release: fixing that breaks > migration to one version, not fixing breaks migration to another In this case it is a bug, IMHO, and not always fixable. It's like QEMU can crash and coredump, not fixable unless the user upgrades. Here "upgrades" for migration purpose means, the user should avoid migration with a broken QEMU version, and one needs to cold reboot into a new fixed binary rather than a live migration. The good thing is as long as the user doesn't trigger migration logically the bug can be avoided. The bad thing is since it's a migration bug it cannot be fixed by live migrating to a new QEMU.. AFAICT we did that before, for downstream we fix X.Y.0 with X.Y.1, then declare X.Y.0 broken, something like that. It's the same for downstream where we put into similar documentations. > > > These, to me, look similar enough that I feel we should look > at them together from QAPI POV. Or maybe I misunderstood here, in that case some elaboration of the QAPI that mentioned here could help on clarifying things. So far I don't see any QAPI command can fix a migration bug, for example, which falls into category 2 above. > > Both issues sometimes can have work-arounds, enabling these > would be nice. > Also, both issues have a clean solution, which can come in > two flavors: > 1. 
basic: detecting incompatibility > and not starting qemu on destination (or failing migration, > possibly early, which I consider a less clean solution). > 2. advanced: ability to go from a set of configurations to > a flag making them compatible. > > > -- > MST > Thanks,
On 2024/08/03 1:26, Peter Xu wrote: > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: >>>>> I'm not sure if I read it right. Perhaps you meant something more generic >>>>> than -platform but similar? >>>>> >>>>> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either >>>>> "perf" or "compat", while by default to "compat"? >>>> >>>> "perf" would cover 4) and "compat" will cover 1). However neither of them >>>> will cover 2) because an enum is not enough to know about all hosts. I >>>> presented a design that will cover 2) in: >>>> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com >>> >>> "-merge-platform" shouldn't be a QEMU parameter, but should be something >>> separate. >> >> Do you mean merging platform dumps should be done with another command? I >> think we will want to know the QOM tree is in use when implementing >> -merge-platform. For example, you cannot define a "platform" when e.g., you >> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is >> connected to virtio-net devices. Of course we can include those information >> in dumps, but we don't do so for VMState. > > What I was thinking is the generated platform dump shouldn't care about > what is used as backend: it should try to probe whatever is specified in > the qemu cmdline, and it's the user's job to make sure the exact same qemu > cmdline is used in other hosts to dump this information. > > IOW, the dump will only contain the information that was based on the qemu > cmdline. E.g., if it doesn't include virtio device at all, and if we only > support such dump for virtio, it should dump nothing. > > Then the -merge-platform will expect all dumps to look the same too, > merging them with AND on each field. I think we will still need the QOM tree in that case. I think the platform information will look somewhat similar to VMState, which requires the QOM tree to interpret. 
> > Said that, I actually am still not clear on how / whether it should work at > last. At least my previous concern (1) didn't has a good answer yet, on > what we do when profile collisions with qemu cmdlines. So far I actually > still think it more straightforward that in migration we handshake on these > capabilities if possible. > > And that's why I was thinking (where I totally agree with you on this) that > whether we should settle a short term plan first to be on the safe side > that we start with migration always being compatible, then we figure the > other approach. That seems easier to me, and it's also a matter of whether > we want to do something for 9.1, or leaving that for 9.2 for USO*. I suggest disabling all offload features of virtio-net with 9.2. I want to keep things consistent so I want to disable all at once. This change will be very uncomfortable for us, who are implementing offload features, but I hope it will motivate us to implement a proper solution. That said, it will be surely a breaking change so we should wait for 9.1 before making such a change. By the way, I am wondering perhaps the "no-cross-migrate" scenario can be implemented relatively easy in a way similar to compatibility properties. The idea is to add the "no-cross-migrate" property to machines. If the property is set to "on", all offload features of virtio-net will be set to "auto". virtio-net will then probe the offload features and enable available offloading features. Regards, Akihiko Odaki
On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: > On 2024/08/03 1:26, Peter Xu wrote: > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: > > > > > > I'm not sure if I read it right. Perhaps you meant something more generic > > > > > > than -platform but similar? > > > > > > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either > > > > > > "perf" or "compat", while by default to "compat"? > > > > > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them > > > > > will cover 2) because an enum is not enough to know about all hosts. I > > > > > presented a design that will cover 2) in: > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com > > > > > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something > > > > separate. > > > > > > Do you mean merging platform dumps should be done with another command? I > > > think we will want to know the QOM tree is in use when implementing > > > -merge-platform. For example, you cannot define a "platform" when e.g., you > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is > > > connected to virtio-net devices. Of course we can include those information > > > in dumps, but we don't do so for VMState. > > > > What I was thinking is the generated platform dump shouldn't care about > > what is used as backend: it should try to probe whatever is specified in > > the qemu cmdline, and it's the user's job to make sure the exact same qemu > > cmdline is used in other hosts to dump this information. > > > > IOW, the dump will only contain the information that was based on the qemu > > cmdline. E.g., if it doesn't include virtio device at all, and if we only > > support such dump for virtio, it should dump nothing. > > > > Then the -merge-platform will expect all dumps to look the same too, > > merging them with AND on each field. 
> > I think we will still need the QOM tree in that case. I think the platform > information will look somewhat similar to VMState, which requires the QOM > tree to interpret. Ah yes, I assume you meant when multiple devices can report different thing even if with the same frontend / device type. QOM should work, or anything that can identify a device, e.g. with id / instance_id attached along with the device class. One thing that I still don't know how it works is how it interacts with new hosts being added. This idea is based on the fact that the cluster is known before starting any VM. However in reality I think it can happen when VMs started with a small cluster but then cluster extended, when the -merge-platform has been done on the smaller set. > > > > > Said that, I actually am still not clear on how / whether it should work at > > last. At least my previous concern (1) didn't has a good answer yet, on > > what we do when profile collisions with qemu cmdlines. So far I actually > > still think it more straightforward that in migration we handshake on these > > capabilities if possible. > > > > And that's why I was thinking (where I totally agree with you on this) that > > whether we should settle a short term plan first to be on the safe side > > that we start with migration always being compatible, then we figure the > > other approach. That seems easier to me, and it's also a matter of whether > > we want to do something for 9.1, or leaving that for 9.2 for USO*. > > I suggest disabling all offload features of virtio-net with 9.2. > > I want to keep things consistent so I want to disable all at once. This > change will be very uncomfortable for us, who are implementing offload > features, but I hope it will motivate us to implement a proper solution. > > That said, it will be surely a breaking change so we should wait for 9.1 > before making such a change. 
Personally I don't worry too much on other offload bits besides USO* so far if we have them ON for longer time. My wish was that they're good old kernel features mostly supported everywhere QEMU runs, then we're good. And I definitely worry about future offload features, or any feature that may probe host like this and auto-OFF: I hope we can do them on the safe side starting from day1. So I don't know whether we should do that to USO* only or all. But I agree with you that'll definitely be cleaner. On the details of how to turn them off properly.. Taking an example if we want to turn off all the offload features by default (or simply we replace that with USO-only).. Upstream machine type is flexible to all kinds of kernels, so we may not want to regress anyone using an existing machine type even on perf, especially if we want to turn off all. In that case we may need one more knob (I'm assuming this is virtio-net specific issue, but maybe not; using it as an example) to make sure the old machine types perform as well, with: - x-virtio-net-offload-enforce When set, the offload features with value ON are enforced, so when the host doesn't support an offload feature it will fail to boot, showing the error that specific offload feature is not supported by the virtio backend. When clear, the offload features with value ON are not enforced, so these features can be automatically turned OFF when it's detected the backend doesn't support them. This may bring best perf but has the risk of breaking migration. With that, - On old machine types (compat properties): - set "x-virtio-net-offload-enforce" OFF - set all offload features ON - On new machine types (the default values): - set "x-virtio-net-offload-enforce" ON - set all offload features OFF And yes, we can do that until 9.2, but with above even 9.1 should be safe to do. 9.2 might be still easier just to think everything through again, after all at least USO was introduced in 8.2 so not a regress in 9.1. 
> > By the way, I am wondering perhaps the "no-cross-migrate" scenario can be > implemented relatively easy in a way similar to compatibility properties. > The idea is to add the "no-cross-migrate" property to machines. If the > property is set to "on", all offload features of virtio-net will be set to > "auto". virtio-net will then probe the offload features and enable available > offloading features. If it'll become a device property, there's still the trick / concern where no-cross-migrate could conflict with the other offload feature that was selected explicitly by a user (e.g. no-cross-migrate=ON + uso=OFF). Thanks,
On Sun, Aug 04, 2024 at 09:08:05AM -0400, Peter Xu wrote: > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: > > On 2024/08/03 1:26, Peter Xu wrote: > > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: > > > > > > > I'm not sure if I read it right. Perhaps you meant something more generic > > > > > > > than -platform but similar? > > > > > > > > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either > > > > > > > "perf" or "compat", while by default to "compat"? > > > > > > > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them > > > > > > will cover 2) because an enum is not enough to know about all hosts. I > > > > > > presented a design that will cover 2) in: > > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com > > > > > > > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something > > > > > separate. > > > > > > > > Do you mean merging platform dumps should be done with another command? I > > > > think we will want to know the QOM tree is in use when implementing > > > > -merge-platform. For example, you cannot define a "platform" when e.g., you > > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is > > > > connected to virtio-net devices. Of course we can include those information > > > > in dumps, but we don't do so for VMState. > > > > > > What I was thinking is the generated platform dump shouldn't care about > > > what is used as backend: it should try to probe whatever is specified in > > > the qemu cmdline, and it's the user's job to make sure the exact same qemu > > > cmdline is used in other hosts to dump this information. > > > > > > IOW, the dump will only contain the information that was based on the qemu > > > cmdline. E.g., if it doesn't include virtio device at all, and if we only > > > support such dump for virtio, it should dump nothing. 
> > > > > > Then the -merge-platform will expect all dumps to look the same too, > > > merging them with AND on each field. > > > > I think we will still need the QOM tree in that case. I think the platform > > information will look somewhat similar to VMState, which requires the QOM > > tree to interpret. > > Ah yes, I assume you meant when multiple devices can report different thing > even if with the same frontend / device type. QOM should work, or anything > that can identify a device, e.g. with id / instance_id attached along with > the device class. > > One thing that I still don't know how it works is how it interacts with new > hosts being added. > > This idea is based on the fact that the cluster is known before starting > any VM. However in reality I think it can happen when VMs started with a > small cluster but then cluster extended, when the -merge-platform has been > done on the smaller set. This is why I think we (also?) need a way to dump config and then give it to qemu on destination. To have a simple way to management to know whether migration has a chance of working. > > > > > > > > Said that, I actually am still not clear on how / whether it should work at > > > last. At least my previous concern (1) didn't has a good answer yet, on > > > what we do when profile collisions with qemu cmdlines. So far I actually > > > still think it more straightforward that in migration we handshake on these > > > capabilities if possible. > > > > > > And that's why I was thinking (where I totally agree with you on this) that > > > whether we should settle a short term plan first to be on the safe side > > > that we start with migration always being compatible, then we figure the > > > other approach. That seems easier to me, and it's also a matter of whether > > > we want to do something for 9.1, or leaving that for 9.2 for USO*. > > > > I suggest disabling all offload features of virtio-net with 9.2. 
> > > > I want to keep things consistent so I want to disable all at once. This > > change will be very uncomfortable for us, who are implementing offload > > features, but I hope it will motivate us to implement a proper solution. > > > > That said, it will be surely a breaking change so we should wait for 9.1 > > before making such a change. > > Personally I don't worry too much on other offload bits besides USO* so far > if we have them ON for longer time. My wish was that they're old good > kernel features mostly supported everywhere who runs QEMU, then we're good. > > And I definitely worry about future offload features, or any feature that > may probe host like this and auto-OFF: I hope we can do them on the safe > side starting from day1. > > So I don't know whether we should do that to USO* only or all. But I agree > with you that'll definitely be cleaner. > > On the details of how to turn them off properly.. Taking an example if we > want to turn off all the offload features by default (or simply we replace > that with USO-only).. > > Upstream machine type is flexible to all kinds of kernels, so we may not > want to regress anyone using an existing machine type even on perf, > especially if we want to turn off all. > > In that case we may need one more knob (I'm assuming this is virtio-net > specific issue, but maybe not; using it as an example) to make sure the old > machine types perfs as well, with: > > - x-virtio-net-offload-enforce > > When set, the offload features with value ON are enforced, so when > the host doesn't support a offload feature it will fail to boot, > showing the error that specific offload feature is not supported by the > virtio backend. > > When clear, the offload features with value ON are not enforced, so > these features can be automatically turned OFF when it's detected the > backend doesn't support them. This may bring best perf but has the > risk of breaking migration. 
> > With that, > > - On old machine types (compat properties): > > - set "x-virtio-net-offload-enforce" OFF > - set all offload features ON > > - On new machine types (the default values): > > - set "x-virtio-net-offload-enforce" ON > - set all offload features OFF > > And yes, we can do that until 9.2, but with above even 9.1 should be safe > to do. 9.2 might be still easier just to think everything through again, > after all at least USO was introduced in 8.2 so not a regress in 9.1. > > > > > By the way, I am wondering perhaps the "no-cross-migrate" scenario can be > > implemented relatively easy in a way similar to compatibility properties. > > The idea is to add the "no-cross-migrate" property to machines. If the > > property is set to "on", all offload features of virtio-net will be set to > > "auto". virtio-net will then probe the offload features and enable available > > offloading features. > > If it'll become a device property, there's still the trick / concern where > no-cross-migrate could conflict with the other offload feature that was > selected explicilty by an user (e.g. no-cross-migrate=ON + uso=OFF). > > Thanks, > > -- > Peter Xu
On 2024/08/04 22:08, Peter Xu wrote: > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: >> On 2024/08/03 1:26, Peter Xu wrote: >>> On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: >>>>>>> I'm not sure if I read it right. Perhaps you meant something more generic >>>>>>> than -platform but similar? >>>>>>> >>>>>>> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either >>>>>>> "perf" or "compat", while by default to "compat"? >>>>>> >>>>>> "perf" would cover 4) and "compat" will cover 1). However neither of them >>>>>> will cover 2) because an enum is not enough to know about all hosts. I >>>>>> presented a design that will cover 2) in: >>>>>> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com >>>>> >>>>> "-merge-platform" shouldn't be a QEMU parameter, but should be something >>>>> separate. >>>> >>>> Do you mean merging platform dumps should be done with another command? I >>>> think we will want to know the QOM tree is in use when implementing >>>> -merge-platform. For example, you cannot define a "platform" when e.g., you >>>> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is >>>> connected to virtio-net devices. Of course we can include those information >>>> in dumps, but we don't do so for VMState. >>> >>> What I was thinking is the generated platform dump shouldn't care about >>> what is used as backend: it should try to probe whatever is specified in >>> the qemu cmdline, and it's the user's job to make sure the exact same qemu >>> cmdline is used in other hosts to dump this information. >>> >>> IOW, the dump will only contain the information that was based on the qemu >>> cmdline. E.g., if it doesn't include virtio device at all, and if we only >>> support such dump for virtio, it should dump nothing. >>> >>> Then the -merge-platform will expect all dumps to look the same too, >>> merging them with AND on each field. 
>> >> I think we will still need the QOM tree in that case. I think the platform >> information will look somewhat similar to VMState, which requires the QOM >> tree to interpret. > > Ah yes, I assume you meant when multiple devices can report different thing > even if with the same frontend / device type. QOM should work, or anything > that can identify a device, e.g. with id / instance_id attached along with > the device class. > > One thing that I still don't know how it works is how it interacts with new > hosts being added. > > This idea is based on the fact that the cluster is known before starting > any VM. However in reality I think it can happen when VMs started with a > small cluster but then cluster extended, when the -merge-platform has been > done on the smaller set. > >> >>> >>> Said that, I actually am still not clear on how / whether it should work at >>> last. At least my previous concern (1) didn't has a good answer yet, on >>> what we do when profile collisions with qemu cmdlines. So far I actually >>> still think it more straightforward that in migration we handshake on these >>> capabilities if possible. >>> >>> And that's why I was thinking (where I totally agree with you on this) that >>> whether we should settle a short term plan first to be on the safe side >>> that we start with migration always being compatible, then we figure the >>> other approach. That seems easier to me, and it's also a matter of whether >>> we want to do something for 9.1, or leaving that for 9.2 for USO*. >> >> I suggest disabling all offload features of virtio-net with 9.2. >> >> I want to keep things consistent so I want to disable all at once. This >> change will be very uncomfortable for us, who are implementing offload >> features, but I hope it will motivate us to implement a proper solution. >> >> That said, it will be surely a breaking change so we should wait for 9.1 >> before making such a change. 
> > Personally I don't worry too much on other offload bits besides USO* so far > if we have them ON for longer time. My wish was that they're old good > kernel features mostly supported everywhere who runs QEMU, then we're good. Unfortunately, we cannot expect everyone runs Linux, and the offload features are provided by Linux. However, QEMU can run on other platforms, and offload features may be provided by vhost-user or vhost-vdpa. > > And I definitely worry about future offload features, or any feature that > may probe host like this and auto-OFF: I hope we can do them on the safe > side starting from day1. > > So I don't know whether we should do that to USO* only or all. But I agree > with you that'll definitely be cleaner. > > On the details of how to turn them off properly.. Taking an example if we > want to turn off all the offload features by default (or simply we replace > that with USO-only).. > > Upstream machine type is flexible to all kinds of kernels, so we may not > want to regress anyone using an existing machine type even on perf, > especially if we want to turn off all. > > In that case we may need one more knob (I'm assuming this is virtio-net > specific issue, but maybe not; using it as an example) to make sure the old > machine types perfs as well, with: > > - x-virtio-net-offload-enforce > > When set, the offload features with value ON are enforced, so when > the host doesn't support a offload feature it will fail to boot, > showing the error that specific offload feature is not supported by the > virtio backend. > > When clear, the offload features with value ON are not enforced, so > these features can be automatically turned OFF when it's detected the > backend doesn't support them. This may bring best perf but has the > risk of breaking migration. 
"[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds "x-force-features-auto" compatibility property to virtio-net for this purpose: https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com > > With that, > > - On old machine types (compat properties): > > - set "x-virtio-net-offload-enforce" OFF > - set all offload features ON > > - On new machine types (the default values): > > - set "x-virtio-net-offload-enforce" ON > - set all offload features OFF > > And yes, we can do that until 9.2, but with above even 9.1 should be safe > to do. 9.2 might be still easier just to think everything through again, > after all at least USO was introduced in 8.2 so not a regress in 9.1. > >> >> By the way, I am wondering perhaps the "no-cross-migrate" scenario can be >> implemented relatively easy in a way similar to compatibility properties. >> The idea is to add the "no-cross-migrate" property to machines. If the >> property is set to "on", all offload features of virtio-net will be set to >> "auto". virtio-net will then probe the offload features and enable available >> offloading features. > > If it'll become a device property, there's still the trick / concern where > no-cross-migrate could conflict with the other offload feature that was > selected explicilty by an user (e.g. no-cross-migrate=ON + uso=OFF). With no-cross-migrate=ON + uso=OFF, no-cross-migrate will set uso=auto, but the user overrides with uso=off. As a consequence, USO will be disabled but all other available offload features will be enabled. Regards, Akihiko Odaki
On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: > I suggest disabling all offload features of virtio-net with 9.2. Yea ... no. > I want to keep things consistent so I want to disable all at once. This > change will be very uncomfortable for us, who are implementing offload > features, but I hope it will motivate us to implement a proper solution. It's uncomfortable for users.
On 2024/08/05 16:30, Michael S. Tsirkin wrote: > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: >> I suggest disabling all offload features of virtio-net with 9.2. > > Yea ... no. > >> I want to keep things consistent so I want to disable all at once. This >> change will be very uncomfortable for us, who are implementing offload >> features, but I hope it will motivate us to implement a proper solution. > > It's uncomfortable for users. An obvious alternative is to set cross-migrate=off by default (I dropped the no- prefix because no-cross-migrate=off is confusing). I don't have a particular idea whether cross-migrate should be on or off by default. This is a trade-off between safety and performance. In general, I believe safety should come before performance. On the other hand, disabling offload features is a breaking change. QEMU also has -only-migratable option; it is more consistent to make the additional assurance for migration opt-in instead of opt-out. Finally, I see migration across hosts as an advanced feature, and perhaps it can be justified to make it more like an optional feature. Regards, Akihiko Odaki
On Mon, Aug 05, 2024 at 04:53:52PM +0900, Akihiko Odaki wrote: > On 2024/08/05 16:30, Michael S. Tsirkin wrote: > > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: > > > I suggest disabling all offload features of virtio-net with 9.2. > > > > Yea ... no. > > > > > I want to keep things consistent so I want to disable all at once. This > > > change will be very uncomfortable for us, who are implementing offload > > > features, but I hope it will motivate us to implement a proper solution. > > > > It's uncomfortable for users. > > An obvious alternative is to set cross-migrate=off by default (I dropped the > no- prefix because no-cross-migrate=off is confusing). I don't have a > particular idea whether cross-migrate should be on or off by default. > > This is a trade-off of safety and performance. In general, I believe safety > should come first before performance. There's no actual safety issue here. You can't migrate certain configurations. So I don't really understand what "cross-migrate" is supposed to do: fail migration in 100% of cases? I can see value in getting configuration from source and not starting qemu on destination if it can not be migrated. This is rather straight-forward, and seems directly useful for management. I also see value in getting configuration from destination and starting on source only if it can be migrated. As a variant of last one, I maybe see value in getting that info from multiple destinations. Using this last kind of thing would be trickier because it's not at the libvirt level, so we would need very good documentation. > On the other hand, disabling offload features is a breaking change. QEMU > also has -only-migratable option; it is more consistent to make the > additional assurance for migration opt-in instead of opt-out. Finally, I see > migration across hosts as an advanced feature, and perhaps it can be > justified to make it more like an optional feature. 
> > Regards, > Akihiko Odaki It's already an optional feature.
On 2024/08/05 17:23, Michael S. Tsirkin wrote: > On Mon, Aug 05, 2024 at 04:53:52PM +0900, Akihiko Odaki wrote: >> On 2024/08/05 16:30, Michael S. Tsirkin wrote: >>> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: >>>> I suggest disabling all offload features of virtio-net with 9.2. >>> >>> Yea ... no. >>> >>>> I want to keep things consistent so I want to disable all at once. This >>>> change will be very uncomfortable for us, who are implementing offload >>>> features, but I hope it will motivate us to implement a proper solution. >>> >>> It's uncomfortable for users. >> >> An obvious alternative is to set cross-migrate=off by default (I dropped the >> no- prefix because no-cross-migrate=off is confusing). I don't have a >> particular idea whether cross-migrate should be on or off by default. >> >> This is a trade-off of safety and performance. In general, I believe safety >> should come first before performance. > > There's no actual safety issue here. You can't migrate certain configurations. > So I don't really understand what "cross-migrate" is supposed to do: > fail migration in 100% of cases? "cross-migrate" means migration among hosts with different platforms (e.g., different kernels, vDPA devices). If cross-migrate=off, QEMU can still migrate on the same host (checkpoint and restart). QEMU can also migrate across hosts if the user ensures they are on the same platform.
On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint > and restart). QEMU can also migrate across hosts if the user ensures they > are on the same platform. What is so special about checkpoint/restart? I guess we hope that downgrades are uncommon, but they are possible...
On 2024/08/05 19:08, Michael S. Tsirkin wrote: > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: >> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint >> and restart). QEMU can also migrate across hosts if the user ensures they >> are on the same platform. > > What is so special about checkpoint/restart? I guess we hope that > downgrades are uncommon, but they are possible... Downgrades will not work with cross-migrate=off. Users who want downgrades should use cross-migrate=on.
On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote: > On 2024/08/05 19:08, Michael S. Tsirkin wrote: > > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: > > > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint > > > and restart). QEMU can also migrate across hosts if the user ensures they > > > are on the same platform. > > > > What is so special about checkpoint/restart? I guess we hope that > > downgrades are uncommon, but they are possible... > > Downgrades will not work with cross-migrate=off. Users who want downgrades > should use cross-migrate=on. We also don't know that upgrades do not disable a feature: can happen if e.g. there's a serious bug in the feature. Basically, this makes the feature too fragile, in my opinion.
On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote: > On 2024/08/04 22:08, Peter Xu wrote: > > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: > > > On 2024/08/03 1:26, Peter Xu wrote: > > > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: > > > > > > > > I'm not sure if I read it right. Perhaps you meant something more generic > > > > > > > > than -platform but similar? > > > > > > > > > > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either > > > > > > > > "perf" or "compat", while by default to "compat"? > > > > > > > > > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them > > > > > > > will cover 2) because an enum is not enough to know about all hosts. I > > > > > > > presented a design that will cover 2) in: > > > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com > > > > > > > > > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something > > > > > > separate. > > > > > > > > > > Do you mean merging platform dumps should be done with another command? I > > > > > think we will want to know the QOM tree is in use when implementing > > > > > -merge-platform. For example, you cannot define a "platform" when e.g., you > > > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is > > > > > connected to virtio-net devices. Of course we can include those information > > > > > in dumps, but we don't do so for VMState. > > > > > > > > What I was thinking is the generated platform dump shouldn't care about > > > > what is used as backend: it should try to probe whatever is specified in > > > > the qemu cmdline, and it's the user's job to make sure the exact same qemu > > > > cmdline is used in other hosts to dump this information. > > > > > > > > IOW, the dump will only contain the information that was based on the qemu > > > > cmdline. 
E.g., if it doesn't include virtio device at all, and if we only > > > > support such dump for virtio, it should dump nothing. > > > > > > > > Then the -merge-platform will expect all dumps to look the same too, > > > > merging them with AND on each field. > > > > > > I think we will still need the QOM tree in that case. I think the platform > > > information will look somewhat similar to VMState, which requires the QOM > > > tree to interpret. > > > > Ah yes, I assume you meant when multiple devices can report different thing > > even if with the same frontend / device type. QOM should work, or anything > > that can identify a device, e.g. with id / instance_id attached along with > > the device class. > > > > One thing that I still don't know how it works is how it interacts with new > > hosts being added. > > > > This idea is based on the fact that the cluster is known before starting > > any VM. However in reality I think it can happen when VMs started with a > > small cluster but then cluster extended, when the -merge-platform has been > > done on the smaller set. > > > > > > > > > > > > > Said that, I actually am still not clear on how / whether it should work at > > > > last. At least my previous concern (1) didn't has a good answer yet, on > > > > what we do when profile collisions with qemu cmdlines. So far I actually > > > > still think it more straightforward that in migration we handshake on these > > > > capabilities if possible. > > > > > > > > And that's why I was thinking (where I totally agree with you on this) that > > > > whether we should settle a short term plan first to be on the safe side > > > > that we start with migration always being compatible, then we figure the > > > > other approach. That seems easier to me, and it's also a matter of whether > > > > we want to do something for 9.1, or leaving that for 9.2 for USO*. > > > > > > I suggest disabling all offload features of virtio-net with 9.2. 
> > > > > > I want to keep things consistent so I want to disable all at once. This > > > change will be very uncomfortable for us, who are implementing offload > > > features, but I hope it will motivate us to implement a proper solution. > > > > > > That said, it will be surely a breaking change so we should wait for 9.1 > > > before making such a change. > > > > Personally I don't worry too much on other offload bits besides USO* so far > > if we have them ON for longer time. My wish was that they're old good > > kernel features mostly supported everywhere who runs QEMU, then we're good. > > Unfortunately, we cannot expect everyone runs Linux, and the offload > features are provided by Linux. However, QEMU can run on other platforms, > and offload features may be provided by vhost-user or vhost-vdpa. I see. I am not familiar with the status quo there, so I'll leave that to you and other experts that know better on this.. Personally I do care more on Linux, as that's what we ship within RH.. > > > > > And I definitely worry about future offload features, or any feature that > > may probe host like this and auto-OFF: I hope we can do them on the safe > > side starting from day1. > > > > So I don't know whether we should do that to USO* only or all. But I agree > > with you that'll definitely be cleaner. > > > > On the details of how to turn them off properly.. Taking an example if we > > want to turn off all the offload features by default (or simply we replace > > that with USO-only).. > > > > Upstream machine type is flexible to all kinds of kernels, so we may not > > want to regress anyone using an existing machine type even on perf, > > especially if we want to turn off all. 
> > > > In that case we may need one more knob (I'm assuming this is virtio-net > > specific issue, but maybe not; using it as an example) to make sure the old > > machine types perfs as well, with: > > > > - x-virtio-net-offload-enforce > > > > When set, the offload features with value ON are enforced, so when > > the host doesn't support a offload feature it will fail to boot, > > showing the error that specific offload feature is not supported by the > > virtio backend. > > > > When clear, the offload features with value ON are not enforced, so > > these features can be automatically turned OFF when it's detected the > > backend doesn't support them. This may bring best perf but has the > > risk of breaking migration. > > "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds > "x-force-features-auto" compatibility property to virtio-net for this > purpose: > https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com Ah ok. But note that there's still a slight difference: we need to avoid AUTO being an option, at all, IMHO. It's about making qemu cmdline the ABI: when with AUTO it's still possible the user uses AUTO on both sides, then ABI may not be guaranteed. AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2) the AUTO bit will always generate the same thing on both hosts. However USO* isn't such case.. so the AUTO option is IMHO not wanted. What I mentioned above "x-virtio-net-offload-enforce" shouldn't add anything new to "uso"; it still can only be ON/OFF. However it should affect "flip that to OFF automatically" or "fail the boot" behavior on missing features. 
> > > > > With that, > > > > - On old machine types (compat properties): > > > > - set "x-virtio-net-offload-enforce" OFF > > - set all offload features ON > > > > - On new machine types (the default values): > > > > - set "x-virtio-net-offload-enforce" ON > > - set all offload features OFF > > > > And yes, we can do that until 9.2, but with above even 9.1 should be safe > > to do. 9.2 might be still easier just to think everything through again, > > after all at least USO was introduced in 8.2 so not a regress in 9.1. > > > > > > > > By the way, I am wondering perhaps the "no-cross-migrate" scenario can be > > > implemented relatively easy in a way similar to compatibility properties. > > > The idea is to add the "no-cross-migrate" property to machines. If the > > > property is set to "on", all offload features of virtio-net will be set to > > > "auto". virtio-net will then probe the offload features and enable available > > > offloading features. > > > > If it'll become a device property, there's still the trick / concern where > > no-cross-migrate could conflict with the other offload feature that was > > selected explicilty by an user (e.g. no-cross-migrate=ON + uso=OFF). > With no-cross-migrate=ON + uso=OFF, no-cross-migrate will set uso=auto, but > the user overrides with uso=off. As the consequence, USO will be disabled > but all other available offload features will be enabled. Basically you're saying that no-cross-migrate has lower priority than specific feature bits. That's OK to me. Thanks,
On 2024/08/06 22:29, Michael S. Tsirkin wrote: > On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote: >> On 2024/08/05 19:08, Michael S. Tsirkin wrote: >>> On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: >>>> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint >>>> and restart). QEMU can also migrate across hosts if the user ensures they >>>> are on the same platform. >>> >>> What is so special about checkpoint/restart? I guess we hope that >>> downgrades are uncommon, but they are possible... >> >> Downgrades will not work with cross-migrate=off. Users who want downgrades >> should use cross-migrate=on. > > We also don't know that upgrades do not disable a feature: > can happen if e.g. there's a serious bug in the feature. > Basically, this makes the feature too fragile, in my opinion. We can do nothing in such a case. Whether it is on a single host or multiple hosts, we cannot support migration if features once enabled disappear. Regards, Akihiko Odaki
On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote: > On 2024/08/06 22:29, Michael S. Tsirkin wrote: > > On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote: > > > On 2024/08/05 19:08, Michael S. Tsirkin wrote: > > > > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: > > > > > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint > > > > > and restart). QEMU can also migrate across hosts if the user ensures they > > > > > are on the same platform. > > > > > > > > What is so special about checkpoint/restart? I guess we hope that > > > > downgrades are uncommon, but they are possible... > > > > > > Downgrades will not work with cross-migrate=off. Users who want downgrades > > > should use cross-migrate=on. > > > > We also don't know that upgrades do not disable a feature: > > can happen if e.g. there's a serious bug in the feature. > > Basically, this makes the feature too fragile, in my opinion. > > We can do nothing in such a case. Whether it is on a single host or multiple > hosts, we cannot support migration if features once enabled disappear. > > Regards, > Akihiko Odaki It does not follow that we have to do something, and this is something, therefore that we have to do this. This is just a reason not to handle checkpoint/restart any different than any other migration.
On 2024/08/08 19:54, Michael S. Tsirkin wrote: > On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote: >> On 2024/08/06 22:29, Michael S. Tsirkin wrote: >>> On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote: >>>> On 2024/08/05 19:08, Michael S. Tsirkin wrote: >>>>> On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: >>>>>> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint >>>>>> and restart). QEMU can also migrate across hosts if the user ensures they >>>>>> are on the same platform. >>>>> >>>>> What is so special about checkpoint/restart? I guess we hope that >>>>> downgrades are uncommon, but they are possible... >>>> >>>> Downgrades will not work with cross-migrate=off. Users who want downgrades >>>> should use cross-migrate=on. >>> >>> We also don't know that upgrades do not disable a feature: >>> can happen if e.g. there's a serious bug in the feature. >>> Basically, this makes the feature too fragile, in my opinion. >> >> We can do nothing in such a case. Whether it is on a single host or multiple >> hosts, we cannot support migration if features once enabled disappear. >> >> Regards, >> Akihiko Odaki > > It does not follow that we have to do something, and this is something, > therefore that we have to do this. > > This is just a reason not to handle checkpoint/restart any different > than any other migration. Whether it is checkpoint/restart or any other migration, I expect platform features won't disappear from the host(s); we can't readily support migration in such a situation. When platform features won't disappear, for checkpoint/restart, we can enable all available features without disrupting migration; cross-migrate=off will instruct that. 
However, if we are migrating a VM across hosts and the user doesn't ensure they are on the same platform, we cannot enable platform features even if we are sure that platform features already present on a host won't disappear because some hosts may not have features in the first place. We can set cross-migrate=on in such a case to disable optional platform features. Regards, Akihiko Odaki
On Thu, Aug 08, 2024 at 08:03:25PM +0900, Akihiko Odaki wrote: > On 2024/08/08 19:54, Michael S. Tsirkin wrote: > > On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote: > > > On 2024/08/06 22:29, Michael S. Tsirkin wrote: > > > > On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote: > > > > > On 2024/08/05 19:08, Michael S. Tsirkin wrote: > > > > > > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: > > > > > > > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint > > > > > > > and restart). QEMU can also migrate across hosts if the user ensures they > > > > > > > are on the same platform. > > > > > > > > > > > > What is so special about checkpoint/restart? I guess we hope that > > > > > > downgrades are uncommon, but they are possible... > > > > > > > > > > Downgrades will not work with cross-migrate=off. Users who want downgrades > > > > > should use cross-migrate=on. > > > > > > > > We also don't know that upgrades do not disable a feature: > > > > can happen if e.g. there's a serious bug in the feature. > > > > Basically, this makes the feature too fragile, in my opinion. > > > > > > We can do nothing in such a case. Whether it is on a single host or multiple > > > hosts, we cannot support migration if features once enabled disappear. > > > > > > Regards, > > > Akihiko Odaki > > > > It does not follow that we have to do something, and this is something, > > therefore that we have to do this. > > > > This is just a reason not to handle checkpoint/restart any different > > than any other migration. > > Whethere it is checkpoint/restart or any other migration, I expect platform > features won't disappear from the host(s); we can't readily support > migration in such a situation. We can if we mask the features from the guest before starting VM. Or if we didn't, we can fail gracefully. 
> When platform features won't disappear, for checkpoint/restart, we can > enable all available features without disrupting migration; > cross-migrate=off will instruct that. > > However, if we are migrating a VM across hosts and the user doesn't ensure > they are on the same platform, we cannot enable platform features even if we > are sure that platform features already present on a host won't disappear > because some hosts may not have features in the first place. We can set > cross-migrate=on in such a case to disable optional platform features. > > Regards, > Akihiko Odaki This is too big of a hammer. People already use what you call "cross migrate" and have for years. We are not going to stop developing features just because someone suddenly became aware of some such bit. If you care, you will have to work to solve the problem properly - nacking half baked hacks is the only tool maintainers have to make people work on hard problems.
On 2024/08/08 20:12, Michael S. Tsirkin wrote: > On Thu, Aug 08, 2024 at 08:03:25PM +0900, Akihiko Odaki wrote: >> On 2024/08/08 19:54, Michael S. Tsirkin wrote: >>> On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote: >>>> On 2024/08/06 22:29, Michael S. Tsirkin wrote: >>>>> On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote: >>>>>> On 2024/08/05 19:08, Michael S. Tsirkin wrote: >>>>>>> On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: >>>>>>>> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint >>>>>>>> and restart). QEMU can also migrate across hosts if the user ensures they >>>>>>>> are on the same platform. >>>>>>> >>>>>>> What is so special about checkpoint/restart? I guess we hope that >>>>>>> downgrades are uncommon, but they are possible... >>>>>> >>>>>> Downgrades will not work with cross-migrate=off. Users who want downgrades >>>>>> should use cross-migrate=on. >>>>> >>>>> We also don't know that upgrades do not disable a feature: >>>>> can happen if e.g. there's a serious bug in the feature. >>>>> Basically, this makes the feature too fragile, in my opinion. >>>> >>>> We can do nothing in such a case. Whether it is on a single host or multiple >>>> hosts, we cannot support migration if features once enabled disappear. >>>> >>>> Regards, >>>> Akihiko Odaki >>> >>> It does not follow that we have to do something, and this is something, >>> therefore that we have to do this. >>> >>> This is just a reason not to handle checkpoint/restart any different >>> than any other migration. >> >> Whethere it is checkpoint/restart or any other migration, I expect platform >> features won't disappear from the host(s); we can't readily support >> migration in such a situation. > > > We can if we mask the features from the guest before starting VM. > > Or if we didn't, we can fail gracefully. 
> >> When platform features won't disappear, for checkpoint/restart, we can >> enable all available features without disrupting migration; >> cross-migrate=off will instruct that. >> >> However, if we are migrating a VM across hosts and the user doesn't ensure >> they are on the same platform, we cannot enable platform features even if we >> are sure that platform features already present on a host won't disappear >> because some hosts may not have features in the first place. We can set >> cross-migrate=on in such a case to disable optional platform features. >> >> Regards, >> Akihiko Odaki > > > This is too big of a hammer. People already use what you call "cross > migrate" and have for years. We are not going to stop developing > features just because someone suddenly became aware of some such bit. > If you care, you will have to work to solve the problem properly - > nacking half baked hacks is the only tool maintainers have to make > people work on hard problems. I think you meant cross-migrate=off, which is the current behavior. I am not suggesting forcing cross-migrate=on or even making it default. I have shown four possible scenarios earlier[a]: 1) Migration everywhere 2) Migration on specific machines 3) Migration on some known platforms 4) No migration (migration on nowhere) Taking the discussion with Peter, I amend 4) as follows: 4*) Migration on one platform (checkpoint/restore) cross-migrate=on is a complete solution for 1). 2) is dealt with another proposal of mine.[b] 3) can be solved with the -platform proposal by Daniel.[c] 4*) is what QEMU currently implements. [a] https://lore.kernel.org/all/39a8bb8b-4191-4f41-aaf7-06df24bf3280@daynix.com/ [b] https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/ [c] https://lore.kernel.org/all/ZqO7cR-UiGpX2rk0@redhat.com/ Regards, Akihiko Odaki
On 2024/08/07 5:41, Peter Xu wrote: > On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote: >> On 2024/08/04 22:08, Peter Xu wrote: >>> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: >>>> On 2024/08/03 1:26, Peter Xu wrote: >>>>> On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: >>>>>>>>> I'm not sure if I read it right. Perhaps you meant something more generic >>>>>>>>> than -platform but similar? >>>>>>>>> >>>>>>>>> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either >>>>>>>>> "perf" or "compat", while by default to "compat"? >>>>>>>> >>>>>>>> "perf" would cover 4) and "compat" will cover 1). However neither of them >>>>>>>> will cover 2) because an enum is not enough to know about all hosts. I >>>>>>>> presented a design that will cover 2) in: >>>>>>>> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com >>>>>>> >>>>>>> "-merge-platform" shouldn't be a QEMU parameter, but should be something >>>>>>> separate. >>>>>> >>>>>> Do you mean merging platform dumps should be done with another command? I >>>>>> think we will want to know the QOM tree is in use when implementing >>>>>> -merge-platform. For example, you cannot define a "platform" when e.g., you >>>>>> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is >>>>>> connected to virtio-net devices. Of course we can include those information >>>>>> in dumps, but we don't do so for VMState. >>>>> >>>>> What I was thinking is the generated platform dump shouldn't care about >>>>> what is used as backend: it should try to probe whatever is specified in >>>>> the qemu cmdline, and it's the user's job to make sure the exact same qemu >>>>> cmdline is used in other hosts to dump this information. >>>>> >>>>> IOW, the dump will only contain the information that was based on the qemu >>>>> cmdline. E.g., if it doesn't include virtio device at all, and if we only >>>>> support such dump for virtio, it should dump nothing. 
>>>>> >>>>> Then the -merge-platform will expect all dumps to look the same too, >>>>> merging them with AND on each field. >>>> >>>> I think we will still need the QOM tree in that case. I think the platform >>>> information will look somewhat similar to VMState, which requires the QOM >>>> tree to interpret. >>> >>> Ah yes, I assume you meant when multiple devices can report different thing >>> even if with the same frontend / device type. QOM should work, or anything >>> that can identify a device, e.g. with id / instance_id attached along with >>> the device class. >>> >>> One thing that I still don't know how it works is how it interacts with new >>> hosts being added. >>> >>> This idea is based on the fact that the cluster is known before starting >>> any VM. However in reality I think it can happen when VMs started with a >>> small cluster but then cluster extended, when the -merge-platform has been >>> done on the smaller set. >>> >>>> >>>>> >>>>> Said that, I actually am still not clear on how / whether it should work at >>>>> last. At least my previous concern (1) didn't has a good answer yet, on >>>>> what we do when profile collisions with qemu cmdlines. So far I actually >>>>> still think it more straightforward that in migration we handshake on these >>>>> capabilities if possible. >>>>> >>>>> And that's why I was thinking (where I totally agree with you on this) that >>>>> whether we should settle a short term plan first to be on the safe side >>>>> that we start with migration always being compatible, then we figure the >>>>> other approach. That seems easier to me, and it's also a matter of whether >>>>> we want to do something for 9.1, or leaving that for 9.2 for USO*. >>>> >>>> I suggest disabling all offload features of virtio-net with 9.2. >>>> >>>> I want to keep things consistent so I want to disable all at once. 
This >>>> change will be very uncomfortable for us, who are implementing offload >>>> features, but I hope it will motivate us to implement a proper solution. >>>> >>>> That said, it will be surely a breaking change so we should wait for 9.1 >>>> before making such a change. >>> >>> Personally I don't worry too much on other offload bits besides USO* so far >>> if we have them ON for longer time. My wish was that they're old good >>> kernel features mostly supported everywhere who runs QEMU, then we're good. >> >> Unfortunately, we cannot expect everyone runs Linux, and the offload >> features are provided by Linux. However, QEMU can run on other platforms, >> and offload features may be provided by vhost-user or vhost-vdpa. > > I see. I am not familiar with the status quo there, so I'll leave that to > you and other experts that know better on this.. > > Personally I do care more on Linux, as that's what we ship within RH.. > >> >>> >>> And I definitely worry about future offload features, or any feature that >>> may probe host like this and auto-OFF: I hope we can do them on the safe >>> side starting from day1. >>> >>> So I don't know whether we should do that to USO* only or all. But I agree >>> with you that'll definitely be cleaner. >>> >>> On the details of how to turn them off properly.. Taking an example if we >>> want to turn off all the offload features by default (or simply we replace >>> that with USO-only).. >>> >>> Upstream machine type is flexible to all kinds of kernels, so we may not >>> want to regress anyone using an existing machine type even on perf, >>> especially if we want to turn off all. 
>>> >>> In that case we may need one more knob (I'm assuming this is virtio-net >>> specific issue, but maybe not; using it as an example) to make sure the old >>> machine types perfs as well, with: >>> >>> - x-virtio-net-offload-enforce >>> >>> When set, the offload features with value ON are enforced, so when >>> the host doesn't support a offload feature it will fail to boot, >>> showing the error that specific offload feature is not supported by the >>> virtio backend. >>> >>> When clear, the offload features with value ON are not enforced, so >>> these features can be automatically turned OFF when it's detected the >>> backend doesn't support them. This may bring best perf but has the >>> risk of breaking migration. >> >> "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds >> "x-force-features-auto" compatibility property to virtio-net for this >> purpose: >> https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com > > Ah ok. But note that there's still a slight difference: we need to avoid > AUTO being an option, at all, IMHO. > > It's about making qemu cmdline the ABI: when with AUTO it's still possible > the user uses AUTO on both sides, then ABI may not be guaranteed. > > AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2) > the AUTO bit will always generate the same thing on both hosts. However > USO* isn't such case.. so the AUTO option is IMHO not wanted. > > What I mentioned above "x-virtio-net-offload-enforce" shouldn't add > anything new to "uso"; it still can only be ON/OFF. However it should > affect "flip that to OFF automatically" or "fail the boot" behavior on > missing features. My rationale for the OnOffAuto change is that "flipping ON to OFF automatically" is more confusing than making users specify AUTO to allow QEMU making the feature OFF. "ON" will always make the boot fail. The ABI guarantee will be gone anyway if x-virtio-net-offload-enforce=off. AUTO is no different in that sense. 
Regards, Akihiko Odaki
On Thu, Aug 08, 2024 at 08:43:22PM +0900, Akihiko Odaki wrote: > On 2024/08/07 5:41, Peter Xu wrote: > > On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote: > > > On 2024/08/04 22:08, Peter Xu wrote: > > > > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: > > > > > On 2024/08/03 1:26, Peter Xu wrote: > > > > > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: > > > > > > > > > > I'm not sure if I read it right. Perhaps you meant something more generic > > > > > > > > > > than -platform but similar? > > > > > > > > > > > > > > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either > > > > > > > > > > "perf" or "compat", while by default to "compat"? > > > > > > > > > > > > > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them > > > > > > > > > will cover 2) because an enum is not enough to know about all hosts. I > > > > > > > > > presented a design that will cover 2) in: > > > > > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com > > > > > > > > > > > > > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something > > > > > > > > separate. > > > > > > > > > > > > > > Do you mean merging platform dumps should be done with another command? I > > > > > > > think we will want to know the QOM tree is in use when implementing > > > > > > > -merge-platform. For example, you cannot define a "platform" when e.g., you > > > > > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is > > > > > > > connected to virtio-net devices. Of course we can include those information > > > > > > > in dumps, but we don't do so for VMState. 
> > > > > > > > > > > > What I was thinking is the generated platform dump shouldn't care about > > > > > > what is used as backend: it should try to probe whatever is specified in > > > > > > the qemu cmdline, and it's the user's job to make sure the exact same qemu > > > > > > cmdline is used in other hosts to dump this information. > > > > > > > > > > > > IOW, the dump will only contain the information that was based on the qemu > > > > > > cmdline. E.g., if it doesn't include virtio device at all, and if we only > > > > > > support such dump for virtio, it should dump nothing. > > > > > > > > > > > > Then the -merge-platform will expect all dumps to look the same too, > > > > > > merging them with AND on each field. > > > > > > > > > > I think we will still need the QOM tree in that case. I think the platform > > > > > information will look somewhat similar to VMState, which requires the QOM > > > > > tree to interpret. > > > > > > > > Ah yes, I assume you meant when multiple devices can report different thing > > > > even if with the same frontend / device type. QOM should work, or anything > > > > that can identify a device, e.g. with id / instance_id attached along with > > > > the device class. > > > > > > > > One thing that I still don't know how it works is how it interacts with new > > > > hosts being added. > > > > > > > > This idea is based on the fact that the cluster is known before starting > > > > any VM. However in reality I think it can happen when VMs started with a > > > > small cluster but then cluster extended, when the -merge-platform has been > > > > done on the smaller set. > > > > > > > > > > > > > > > > > > > > > Said that, I actually am still not clear on how / whether it should work at > > > > > > last. At least my previous concern (1) didn't has a good answer yet, on > > > > > > what we do when profile collisions with qemu cmdlines. 
So far I actually > > > > > > still think it more straightforward that in migration we handshake on these > > > > > > capabilities if possible. > > > > > > > > > > > > And that's why I was thinking (where I totally agree with you on this) that > > > > > > whether we should settle a short term plan first to be on the safe side > > > > > > that we start with migration always being compatible, then we figure the > > > > > > other approach. That seems easier to me, and it's also a matter of whether > > > > > > we want to do something for 9.1, or leaving that for 9.2 for USO*. > > > > > > > > > > I suggest disabling all offload features of virtio-net with 9.2. > > > > > > > > > > I want to keep things consistent so I want to disable all at once. This > > > > > change will be very uncomfortable for us, who are implementing offload > > > > > features, but I hope it will motivate us to implement a proper solution. > > > > > > > > > > That said, it will be surely a breaking change so we should wait for 9.1 > > > > > before making such a change. > > > > > > > > Personally I don't worry too much on other offload bits besides USO* so far > > > > if we have them ON for longer time. My wish was that they're old good > > > > kernel features mostly supported everywhere who runs QEMU, then we're good. > > > > > > Unfortunately, we cannot expect everyone runs Linux, and the offload > > > features are provided by Linux. However, QEMU can run on other platforms, > > > and offload features may be provided by vhost-user or vhost-vdpa. > > > > I see. I am not familiar with the status quo there, so I'll leave that to > > you and other experts that know better on this.. > > > > Personally I do care more on Linux, as that's what we ship within RH.. > > > > > > > > > > > > > And I definitely worry about future offload features, or any feature that > > > > may probe host like this and auto-OFF: I hope we can do them on the safe > > > > side starting from day1. 
> > > > > > > > So I don't know whether we should do that to USO* only or all. But I agree > > > > with you that'll definitely be cleaner. > > > > > > > > On the details of how to turn them off properly.. Taking an example if we > > > > want to turn off all the offload features by default (or simply we replace > > > > that with USO-only).. > > > > > > > > Upstream machine type is flexible to all kinds of kernels, so we may not > > > > want to regress anyone using an existing machine type even on perf, > > > > especially if we want to turn off all. > > > > > > > > In that case we may need one more knob (I'm assuming this is virtio-net > > > > specific issue, but maybe not; using it as an example) to make sure the old > > > > machine types perfs as well, with: > > > > > > > > - x-virtio-net-offload-enforce > > > > > > > > When set, the offload features with value ON are enforced, so when > > > > the host doesn't support a offload feature it will fail to boot, > > > > showing the error that specific offload feature is not supported by the > > > > virtio backend. > > > > > > > > When clear, the offload features with value ON are not enforced, so > > > > these features can be automatically turned OFF when it's detected the > > > > backend doesn't support them. This may bring best perf but has the > > > > risk of breaking migration. > > > > > > "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds > > > "x-force-features-auto" compatibility property to virtio-net for this > > > purpose: > > > https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com > > > > Ah ok. But note that there's still a slight difference: we need to avoid > > AUTO being an option, at all, IMHO. > > > > It's about making qemu cmdline the ABI: when with AUTO it's still possible > > the user uses AUTO on both sides, then ABI may not be guaranteed. 
> > > > AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2) > > the AUTO bit will always generate the same thing on both hosts. However > > USO* isn't such case.. so the AUTO option is IMHO not wanted. > > > > What I mentioned above "x-virtio-net-offload-enforce" shouldn't add > > anything new to "uso"; it still can only be ON/OFF. However it should > > affect "flip that to OFF automatically" or "fail the boot" behavior on > > missing features. > > My rationale for the OnOffAuto change is that "flipping ON to OFF > automatically" is more confusing than making users specify AUTO to allow > QEMU making the feature OFF. "ON" will always make the boot fail. > > The ABI guarantee will be gone anyway if x-virtio-net-offload-enforce=off. > AUTO is no different in that sense. Hmm yes; I wished we can have device properties that user can never specify, but only set from internals. It's just that applying a compat property so far requires a generic device property. Or say, it'll be nice that compat property can tweak a class variable too then no property to introduce. We could even add a migration blocker for x-virtio-net-offload-enforce=ON, but again it could be too aggressive. I think it might be better we bet nobody will even know there's the parameter, so it won't be used in manual setup. OTOH, "guest_uso4" can be too easy to find there's the AUTO option: I normally use ",guest_uso4=?" to just dump the possible values. Thanks,
On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: > This is too big of a hammer. People already use what you call "cross > migrate" and have for years. We are not going to stop developing > features just because someone suddenly became aware of some such bit. > If you care, you will have to work to solve the problem properly - > nacking half baked hacks is the only tool maintainers have to make > people work on hard problems. IMHO this is totally different thing. It's not about proposing a new feature yet so far, it's about how we should fix a breakage first. And that's why I think we should fix it even in the simple way first, then we consider anything more beneficial from perf side without breaking anything, which should be on top of that. Thanks,
On Thu, Aug 08, 2024 at 08:32:58PM +0900, Akihiko Odaki wrote: > On 2024/08/08 20:12, Michael S. Tsirkin wrote: > > On Thu, Aug 08, 2024 at 08:03:25PM +0900, Akihiko Odaki wrote: > > > On 2024/08/08 19:54, Michael S. Tsirkin wrote: > > > > On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote: > > > > > On 2024/08/06 22:29, Michael S. Tsirkin wrote: > > > > > > On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote: > > > > > > > On 2024/08/05 19:08, Michael S. Tsirkin wrote: > > > > > > > > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote: > > > > > > > > > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint > > > > > > > > > and restart). QEMU can also migrate across hosts if the user ensures they > > > > > > > > > are on the same platform. > > > > > > > > > > > > > > > > What is so special about checkpoint/restart? I guess we hope that > > > > > > > > downgrades are uncommon, but they are possible... > > > > > > > > > > > > > > Downgrades will not work with cross-migrate=off. Users who want downgrades > > > > > > > should use cross-migrate=on. > > > > > > > > > > > > We also don't know that upgrades do not disable a feature: > > > > > > can happen if e.g. there's a serious bug in the feature. > > > > > > Basically, this makes the feature too fragile, in my opinion. > > > > > > > > > > We can do nothing in such a case. Whether it is on a single host or multiple > > > > > hosts, we cannot support migration if features once enabled disappear. > > > > > > > > > > Regards, > > > > > Akihiko Odaki > > > > > > > > It does not follow that we have to do something, and this is something, > > > > therefore that we have to do this. > > > > > > > > This is just a reason not to handle checkpoint/restart any different > > > > than any other migration. 
> > > > > > Whethere it is checkpoint/restart or any other migration, I expect platform > > > features won't disappear from the host(s); we can't readily support > > > migration in such a situation. > > > > > > We can if we mask the features from the guest before starting VM. > > > > Or if we didn't, we can fail gracefully. > > > > > When platform features won't disappear, for checkpoint/restart, we can > > > enable all available features without disrupting migration; > > > cross-migrate=off will instruct that. > > > > > > However, if we are migrating a VM across hosts and the user doesn't ensure > > > they are on the same platform, we cannot enable platform features even if we > > > are sure that platform features already present on a host won't disappear > > > because some hosts may not have features in the first place. We can set > > > cross-migrate=on in such a case to disable optional platform features. > > > > > > Regards, > > > Akihiko Odaki > > > > > > This is too big of a hammer. People already use what you call "cross > > migrate" and have for years. We are not going to stop developing > > features just because someone suddenly became aware of some such bit. > > If you care, you will have to work to solve the problem properly - > > nacking half baked hacks is the only tool maintainers have to make > > people work on hard problems. > > I think you meant cross-migrate=off, which is the current behavior. > > I am not suggesting forcing cross-migrate=on or even making it default. I > have shown four possible scenarios earlier[a]: > > 1) Migration everywhere > 2) Migration on specific machines > 3) Migration on some known platforms > 4) No migration (migration on nowhere) > > Taking the discussion with Peter, I amend 4) as follows: > 4*) Migration on one platform (checkpoint/restore) Maybe we can avoid calling out "checkpoint/restore", but something like "migration on identical hosts" or something. 
AFAIU that's what we do with many arm64 systems on the vcpu models with KVM (IIRC it's still about using "virt" machines), where we simply mostly require it's the identical bare metal host or weird things can happen when migration happens. > > cross-migrate=on is a complete solution for 1). > 2) is dealt with another proposal of mine.[b] > 3) can be solved with the -platform proposal by Daniel.[c] > 4*) is what QEMU currently implements. > > [a] > https://lore.kernel.org/all/39a8bb8b-4191-4f41-aaf7-06df24bf3280@daynix.com/ > [b] > https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/ > [c] https://lore.kernel.org/all/ZqO7cR-UiGpX2rk0@redhat.com/ > > Regards, > Akihiko Odaki > Thanks,
On Thu, Aug 08, 2024 at 09:55:49AM -0400, Peter Xu wrote: > On Thu, Aug 08, 2024 at 08:43:22PM +0900, Akihiko Odaki wrote: > > On 2024/08/07 5:41, Peter Xu wrote: > > > On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote: > > > > On 2024/08/04 22:08, Peter Xu wrote: > > > > > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: > > > > > > On 2024/08/03 1:26, Peter Xu wrote: > > > > > > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: > > > > > > > > > > > I'm not sure if I read it right. Perhaps you meant something more generic > > > > > > > > > > > than -platform but similar? > > > > > > > > > > > > > > > > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either > > > > > > > > > > > "perf" or "compat", while by default to "compat"? > > > > > > > > > > > > > > > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them > > > > > > > > > > will cover 2) because an enum is not enough to know about all hosts. I > > > > > > > > > > presented a design that will cover 2) in: > > > > > > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com > > > > > > > > > > > > > > > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something > > > > > > > > > separate. > > > > > > > > > > > > > > > > Do you mean merging platform dumps should be done with another command? I > > > > > > > > think we will want to know the QOM tree is in use when implementing > > > > > > > > -merge-platform. For example, you cannot define a "platform" when e.g., you > > > > > > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is > > > > > > > > connected to virtio-net devices. Of course we can include those information > > > > > > > > in dumps, but we don't do so for VMState. 
> > > > > > > > > > > > > > What I was thinking is the generated platform dump shouldn't care about > > > > > > > what is used as backend: it should try to probe whatever is specified in > > > > > > > the qemu cmdline, and it's the user's job to make sure the exact same qemu > > > > > > > cmdline is used in other hosts to dump this information. > > > > > > > > > > > > > > IOW, the dump will only contain the information that was based on the qemu > > > > > > > cmdline. E.g., if it doesn't include virtio device at all, and if we only > > > > > > > support such dump for virtio, it should dump nothing. > > > > > > > > > > > > > > Then the -merge-platform will expect all dumps to look the same too, > > > > > > > merging them with AND on each field. > > > > > > > > > > > > I think we will still need the QOM tree in that case. I think the platform > > > > > > information will look somewhat similar to VMState, which requires the QOM > > > > > > tree to interpret. > > > > > > > > > > Ah yes, I assume you meant when multiple devices can report different thing > > > > > even if with the same frontend / device type. QOM should work, or anything > > > > > that can identify a device, e.g. with id / instance_id attached along with > > > > > the device class. > > > > > > > > > > One thing that I still don't know how it works is how it interacts with new > > > > > hosts being added. > > > > > > > > > > This idea is based on the fact that the cluster is known before starting > > > > > any VM. However in reality I think it can happen when VMs started with a > > > > > small cluster but then cluster extended, when the -merge-platform has been > > > > > done on the smaller set. > > > > > > > > > > > > > > > > > > > > > > > > > Said that, I actually am still not clear on how / whether it should work at > > > > > > > last. At least my previous concern (1) didn't has a good answer yet, on > > > > > > > what we do when profile collisions with qemu cmdlines. 
So far I actually > > > > > > > still think it more straightforward that in migration we handshake on these > > > > > > > capabilities if possible. > > > > > > > > > > > > > > And that's why I was thinking (where I totally agree with you on this) that > > > > > > > whether we should settle a short term plan first to be on the safe side > > > > > > > that we start with migration always being compatible, then we figure the > > > > > > > other approach. That seems easier to me, and it's also a matter of whether > > > > > > > we want to do something for 9.1, or leaving that for 9.2 for USO*. > > > > > > > > > > > > I suggest disabling all offload features of virtio-net with 9.2. > > > > > > > > > > > > I want to keep things consistent so I want to disable all at once. This > > > > > > change will be very uncomfortable for us, who are implementing offload > > > > > > features, but I hope it will motivate us to implement a proper solution. > > > > > > > > > > > > That said, it will be surely a breaking change so we should wait for 9.1 > > > > > > before making such a change. > > > > > > > > > > Personally I don't worry too much on other offload bits besides USO* so far > > > > > if we have them ON for longer time. My wish was that they're old good > > > > > kernel features mostly supported everywhere who runs QEMU, then we're good. > > > > > > > > Unfortunately, we cannot expect everyone runs Linux, and the offload > > > > features are provided by Linux. However, QEMU can run on other platforms, > > > > and offload features may be provided by vhost-user or vhost-vdpa. > > > > > > I see. I am not familiar with the status quo there, so I'll leave that to > > > you and other experts that know better on this.. > > > > > > Personally I do care more on Linux, as that's what we ship within RH.. 
> > > > > > > > > > > > > > > > > And I definitely worry about future offload features, or any feature that > > > > > may probe host like this and auto-OFF: I hope we can do them on the safe > > > > > side starting from day1. > > > > > > > > > > So I don't know whether we should do that to USO* only or all. But I agree > > > > > with you that'll definitely be cleaner. > > > > > > > > > > On the details of how to turn them off properly.. Taking an example if we > > > > > want to turn off all the offload features by default (or simply we replace > > > > > that with USO-only).. > > > > > > > > > > Upstream machine type is flexible to all kinds of kernels, so we may not > > > > > want to regress anyone using an existing machine type even on perf, > > > > > especially if we want to turn off all. > > > > > > > > > > In that case we may need one more knob (I'm assuming this is virtio-net > > > > > specific issue, but maybe not; using it as an example) to make sure the old > > > > > machine types perfs as well, with: > > > > > > > > > > - x-virtio-net-offload-enforce > > > > > > > > > > When set, the offload features with value ON are enforced, so when > > > > > the host doesn't support a offload feature it will fail to boot, > > > > > showing the error that specific offload feature is not supported by the > > > > > virtio backend. > > > > > > > > > > When clear, the offload features with value ON are not enforced, so > > > > > these features can be automatically turned OFF when it's detected the > > > > > backend doesn't support them. This may bring best perf but has the > > > > > risk of breaking migration. > > > > > > > > "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds > > > > "x-force-features-auto" compatibility property to virtio-net for this > > > > purpose: > > > > https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com > > > > > > Ah ok. 
But note that there's still a slight difference: we need to avoid > > > AUTO being an option, at all, IMHO. > > > > > > It's about making qemu cmdline the ABI: when with AUTO it's still possible > > > the user uses AUTO on both sides, then ABI may not be guaranteed. > > > > > > AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2) > > > the AUTO bit will always generate the same thing on both hosts. However > > > USO* isn't such case.. so the AUTO option is IMHO not wanted. > > > > > > What I mentioned above "x-virtio-net-offload-enforce" shouldn't add > > > anything new to "uso"; it still can only be ON/OFF. However it should > > > affect "flip that to OFF automatically" or "fail the boot" behavior on > > > missing features. > > > > My rationale for the OnOffAuto change is that "flipping ON to OFF > > automatically" is more confusing than making users specify AUTO to allow > > QEMU making the feature OFF. "ON" will always make the boot fail. > > > > The ABI guarantee will be gone anyway if x-virtio-net-offload-enforce=off. > > AUTO is no different in that sense. > > Hmm yes; I wished we can have device properties that user can never > specify, but only set from internals. I think prefixing with "x-" is good enough. > It's just that applying a compat > property so far require a generic device property. Or say, it'll be nice > that compat property can tweak a class variable too then no property to > introduce. > > We could even add a migration blocker for x-virtio-net-offload-enforce=ON, > but again it could be too aggressive. I think it might be better we bet > nobody will even know there's the parameter, so it won't be used in manual > setup. OTOH, "guest_uso4" can be too easy to find there's the AUTO > option: I normally use ",guest_uso4=?" to just dump the possible values. > > Thanks, > > -- > Peter Xu
On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote: > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: > > This is too big of a hammer. People already use what you call "cross > > migrate" and have for years. We are not going to stop developing > > features just because someone suddenly became aware of some such bit. > > If you care, you will have to work to solve the problem properly - > > nacking half baked hacks is the only tool maintainers have to make > > people work on hard problems. > > IMHO this is totally different thing. It's not about proposing a new > feature yet so far, it's about how we should fix a breakage first. > > And that's why I think we should fix it even in the simple way first, then > we consider anything more beneficial from perf side without breaking > anything, which should be on top of that. > > Thanks, As I said, once the quick hack is merged people stop caring. Mixing different kernel versions in migration is esoteric enough for this not to matter to most people. There's no rush I think, address it properly.
On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote: > On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote: > > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: > > > This is too big of a hammer. People already use what you call "cross > > > migrate" and have for years. We are not going to stop developing > > > features just because someone suddenly became aware of some such bit. > > > If you care, you will have to work to solve the problem properly - > > > nacking half baked hacks is the only tool maintainers have to make > > > people work on hard problems. > > > > IMHO this is totally different thing. It's not about proposing a new > > feature yet so far, it's about how we should fix a breakage first. > > > > And that's why I think we should fix it even in the simple way first, then > > we consider anything more benefitial from perf side without breaking > > anything, which should be on top of that. > > > > Thanks, > > As I said, once the quick hack is merged people stop caring. IMHO it's not a hack. It's a proper fix to me to disable it by default for now. OTOH, having it ON always even knowing it can break migration is a hack to me, when we don't have anything else to guard the migration. > Mixing different kernel versions in migration is esoteric enough for > this not to matter to most people. There's no rush I think, address > it properly. Exactly mixing kernel versions will be tricky to users to identify, but that's, AFAICT, exactly happening everywhere. We can't urge user to always use the exact same kernels when we're talking about a VM cluster. That's why I think allowing migration to work across those kernels matter. I will agree there's no rush iff RHEL9 kernel won't backport TAP at all, otherwise this will trigger between y-stream after people upgrades partial of the clusters. Thanks,
On 2024/08/08 22:55, Peter Xu wrote: > On Thu, Aug 08, 2024 at 08:43:22PM +0900, Akihiko Odaki wrote: >> On 2024/08/07 5:41, Peter Xu wrote: >>> On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote: >>>> On 2024/08/04 22:08, Peter Xu wrote: >>>>> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote: >>>>>> On 2024/08/03 1:26, Peter Xu wrote: >>>>>>> On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote: >>>>>>>>>>> I'm not sure if I read it right. Perhaps you meant something more generic >>>>>>>>>>> than -platform but similar? >>>>>>>>>>> >>>>>>>>>>> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either >>>>>>>>>>> "perf" or "compat", while by default to "compat"? >>>>>>>>>> >>>>>>>>>> "perf" would cover 4) and "compat" will cover 1). However neither of them >>>>>>>>>> will cover 2) because an enum is not enough to know about all hosts. I >>>>>>>>>> presented a design that will cover 2) in: >>>>>>>>>> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com >>>>>>>>> >>>>>>>>> "-merge-platform" shouldn't be a QEMU parameter, but should be something >>>>>>>>> separate. >>>>>>>> >>>>>>>> Do you mean merging platform dumps should be done with another command? I >>>>>>>> think we will want to know the QOM tree is in use when implementing >>>>>>>> -merge-platform. For example, you cannot define a "platform" when e.g., you >>>>>>>> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is >>>>>>>> connected to virtio-net devices. Of course we can include those information >>>>>>>> in dumps, but we don't do so for VMState. >>>>>>> >>>>>>> What I was thinking is the generated platform dump shouldn't care about >>>>>>> what is used as backend: it should try to probe whatever is specified in >>>>>>> the qemu cmdline, and it's the user's job to make sure the exact same qemu >>>>>>> cmdline is used in other hosts to dump this information. 
>>>>>>> >>>>>>> IOW, the dump will only contain the information that was based on the qemu >>>>>>> cmdline. E.g., if it doesn't include virtio device at all, and if we only >>>>>>> support such dump for virtio, it should dump nothing. >>>>>>> >>>>>>> Then the -merge-platform will expect all dumps to look the same too, >>>>>>> merging them with AND on each field. >>>>>> >>>>>> I think we will still need the QOM tree in that case. I think the platform >>>>>> information will look somewhat similar to VMState, which requires the QOM >>>>>> tree to interpret. >>>>> >>>>> Ah yes, I assume you meant when multiple devices can report different thing >>>>> even if with the same frontend / device type. QOM should work, or anything >>>>> that can identify a device, e.g. with id / instance_id attached along with >>>>> the device class. >>>>> >>>>> One thing that I still don't know how it works is how it interacts with new >>>>> hosts being added. >>>>> >>>>> This idea is based on the fact that the cluster is known before starting >>>>> any VM. However in reality I think it can happen when VMs started with a >>>>> small cluster but then cluster extended, when the -merge-platform has been >>>>> done on the smaller set. >>>>> >>>>>> >>>>>>> >>>>>>> Said that, I actually am still not clear on how / whether it should work at >>>>>>> last. At least my previous concern (1) didn't has a good answer yet, on >>>>>>> what we do when profile collisions with qemu cmdlines. So far I actually >>>>>>> still think it more straightforward that in migration we handshake on these >>>>>>> capabilities if possible. >>>>>>> >>>>>>> And that's why I was thinking (where I totally agree with you on this) that >>>>>>> whether we should settle a short term plan first to be on the safe side >>>>>>> that we start with migration always being compatible, then we figure the >>>>>>> other approach. 
That seems easier to me, and it's also a matter of whether >>>>>>> we want to do something for 9.1, or leaving that for 9.2 for USO*. >>>>>> >>>>>> I suggest disabling all offload features of virtio-net with 9.2. >>>>>> >>>>>> I want to keep things consistent so I want to disable all at once. This >>>>>> change will be very uncomfortable for us, who are implementing offload >>>>>> features, but I hope it will motivate us to implement a proper solution. >>>>>> >>>>>> That said, it will be surely a breaking change so we should wait for 9.1 >>>>>> before making such a change. >>>>> >>>>> Personally I don't worry too much on other offload bits besides USO* so far >>>>> if we have them ON for longer time. My wish was that they're old good >>>>> kernel features mostly supported everywhere who runs QEMU, then we're good. >>>> >>>> Unfortunately, we cannot expect everyone runs Linux, and the offload >>>> features are provided by Linux. However, QEMU can run on other platforms, >>>> and offload features may be provided by vhost-user or vhost-vdpa. >>> >>> I see. I am not familiar with the status quo there, so I'll leave that to >>> you and other experts that know better on this.. >>> >>> Personally I do care more on Linux, as that's what we ship within RH.. >>> >>>> >>>>> >>>>> And I definitely worry about future offload features, or any feature that >>>>> may probe host like this and auto-OFF: I hope we can do them on the safe >>>>> side starting from day1. >>>>> >>>>> So I don't know whether we should do that to USO* only or all. But I agree >>>>> with you that'll definitely be cleaner. >>>>> >>>>> On the details of how to turn them off properly.. Taking an example if we >>>>> want to turn off all the offload features by default (or simply we replace >>>>> that with USO-only).. 
>>>>> >>>>> Upstream machine type is flexible to all kinds of kernels, so we may not >>>>> want to regress anyone using an existing machine type even on perf, >>>>> especially if we want to turn off all. >>>>> >>>>> In that case we may need one more knob (I'm assuming this is virtio-net >>>>> specific issue, but maybe not; using it as an example) to make sure the old >>>>> machine types perfs as well, with: >>>>> >>>>> - x-virtio-net-offload-enforce >>>>> >>>>> When set, the offload features with value ON are enforced, so when >>>>> the host doesn't support a offload feature it will fail to boot, >>>>> showing the error that specific offload feature is not supported by the >>>>> virtio backend. >>>>> >>>>> When clear, the offload features with value ON are not enforced, so >>>>> these features can be automatically turned OFF when it's detected the >>>>> backend doesn't support them. This may bring best perf but has the >>>>> risk of breaking migration. >>>> >>>> "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds >>>> "x-force-features-auto" compatibility property to virtio-net for this >>>> purpose: >>>> https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com >>> >>> Ah ok. But note that there's still a slight difference: we need to avoid >>> AUTO being an option, at all, IMHO. >>> >>> It's about making qemu cmdline the ABI: when with AUTO it's still possible >>> the user uses AUTO on both sides, then ABI may not be guaranteed. >>> >>> AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2) >>> the AUTO bit will always generate the same thing on both hosts. However >>> USO* isn't such case.. so the AUTO option is IMHO not wanted. >>> >>> What I mentioned above "x-virtio-net-offload-enforce" shouldn't add >>> anything new to "uso"; it still can only be ON/OFF. However it should >>> affect "flip that to OFF automatically" or "fail the boot" behavior on >>> missing features. 
>> >> My rationale for the OnOffAuto change is that "flipping ON to OFF >> automatically" is more confusing than making users specify AUTO to allow >> QEMU making the feature OFF. "ON" will always make the boot fail. >> >> The ABI guarantee will be gone anyway if x-virtio-net-offload-enforce=off. >> AUTO is no different in that sense. > > Hmm yes; I wished we can have device properties that user can never > specify, but only set from internals. It's just that applying a compat > property so far require a generic device property. Or say, it'll be nice > that compat property can tweak a class variable too then no property to > introduce. > > We could even add a migration blocker for x-virtio-net-offload-enforce=ON, > but again it could be too aggressive. I think it might be better we bet > nobody will even know there's the parameter, so it won't be used in manual > setup. OTOH, "guest_uso4" can be too easy to find there's the AUTO > option: I normally use ",guest_uso4=?" to just dump the possible values. We can detect and reject AUTO when cross-migrate=on if desired, but I'm not sure it's worthwhile. Regards, Akihiko Odaki
Peter Xu <peterx@redhat.com> writes: > On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote: >> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote: >> > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: >> > > This is too big of a hammer. People already use what you call "cross >> > > migrate" and have for years. We are not going to stop developing >> > > features just because someone suddenly became aware of some such bit. >> > > If you care, you will have to work to solve the problem properly - >> > > nacking half baked hacks is the only tool maintainers have to make >> > > people work on hard problems. >> > >> > IMHO this is totally different thing. It's not about proposing a new >> > feature yet so far, it's about how we should fix a breakage first. >> > >> > And that's why I think we should fix it even in the simple way first, then >> > we consider anything more benefitial from perf side without breaking >> > anything, which should be on top of that. >> > >> > Thanks, >> >> As I said, once the quick hack is merged people stop caring. > > IMHO it's not a hack. It's a proper fix to me to disable it by default for > now. > > OTOH, having it ON always even knowing it can break migration is a hack to > me, when we don't have anything else to guard the migration. > >> Mixing different kernel versions in migration is esoteric enough for >> this not to matter to most people. There's no rush I think, address >> it properly. > > Exactly mixing kernel versions will be tricky to users to identify, but > that's, AFAICT, exactly happening everywhere. We can't urge user to always > use the exact same kernels when we're talking about a VM cluster. That's > why I think allowing migration to work across those kernels matter. I also worry a bit about the scenario where the cluster changes slightly and now all VMs are already restricted by some option that requires the exact same kernel. 
Specifically, kernel changes in a cloud environment also happen due to factors completely unrelated to migration. I'm not sure the people managing the infra (who care about migration) will be gating kernel changes just because QEMU has been configured in a specific manner.
On Thu, Aug 08, 2024 at 11:25:29AM -0400, Peter Xu wrote: > On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote: > > On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote: > > > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: > > > > This is too big of a hammer. People already use what you call "cross > > > > migrate" and have for years. We are not going to stop developing > > > > features just because someone suddenly became aware of some such bit. > > > > If you care, you will have to work to solve the problem properly - > > > > nacking half baked hacks is the only tool maintainers have to make > > > > people work on hard problems. > > > > > > IMHO this is totally different thing. It's not about proposing a new > > > feature yet so far, it's about how we should fix a breakage first. > > > > > > And that's why I think we should fix it even in the simple way first, then > > > we consider anything more benefitial from perf side without breaking > > > anything, which should be on top of that. > > > > > > Thanks, > > > > As I said, once the quick hack is merged people stop caring. > > IMHO it's not a hack. It's a proper fix to me to disable it by default for > now. > > OTOH, having it ON always even knowing it can break migration is a hack to > me, when we don't have anything else to guard the migration. It's a hack in the sense that it's specific to this option. But hack or not, it's the only way I have to make people work on a full solution. > > Mixing different kernel versions in migration is esoteric enough for > > this not to matter to most people. There's no rush I think, address > > it properly. > > Exactly mixing kernel versions will be tricky to users to identify, but > that's, AFAICT, exactly happening everywhere. We can't urge user to always > use the exact same kernels when we're talking about a VM cluster. That's > why I think allowing migration to work across those kernels matter. 
> > I will agree there's no rush iff RHEL9 kernel won't backport TAP at all, > otherwise this will trigger between y-stream after people upgrade part > of the clusters. > > Thanks, > > -- > Peter Xu
On 2024/08/09 21:50, Fabiano Rosas wrote: > Peter Xu <peterx@redhat.com> writes: > >> On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote: >>> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote: >>>> On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: >>>>> This is too big of a hammer. People already use what you call "cross >>>>> migrate" and have for years. We are not going to stop developing >>>>> features just because someone suddenly became aware of some such bit. >>>>> If you care, you will have to work to solve the problem properly - >>>>> nacking half baked hacks is the only tool maintainers have to make >>>>> people work on hard problems. >>>> >>>> IMHO this is totally different thing. It's not about proposing a new >>>> feature yet so far, it's about how we should fix a breakage first. >>>> >>>> And that's why I think we should fix it even in the simple way first, then >>>> we consider anything more benefitial from perf side without breaking >>>> anything, which should be on top of that. >>>> >>>> Thanks, >>> >>> As I said, once the quick hack is merged people stop caring. >> >> IMHO it's not a hack. It's a proper fix to me to disable it by default for >> now. >> >> OTOH, having it ON always even knowing it can break migration is a hack to >> me, when we don't have anything else to guard the migration. >> >>> Mixing different kernel versions in migration is esoteric enough for >>> this not to matter to most people. There's no rush I think, address >>> it properly. >> >> Exactly mixing kernel versions will be tricky to users to identify, but >> that's, AFAICT, exactly happening everywhere. We can't urge user to always >> use the exact same kernels when we're talking about a VM cluster. That's >> why I think allowing migration to work across those kernels matter. 
> > I also worry a bit about the scenario where the cluster changes slightly > and now all VMs are already restricted by some option that requires the > exact same kernel. Specifically, kernel changes in a cloud environment > also happen due to factors completely unrelated to migration. I'm not > sure the people managing the infra (who care about migration) will be > gating kernel changes just because QEMU has been configured in a > specific manner. I have written a bit about the expectation on the platform earlier[1], but let me summarize it here. 1. I expect the user will not downgrade the platform of hosts after setting up a VM. This is essential to enable any platform feature. 2. The user is allowed to upgrade the platform of hosts gradually. This results in a situation with mixed platforms. The oldest platform is still not older than the platform the VM is set up for. This enables the gradual deployment strategy. 3. The user is allowed to downgrade the platform of hosts to the version used when setting up the VM. This enables rollbacks in case of regression. With these expectations, we can ensure migratability by a) enabling platform features available on all hosts when setting up the VM and b) saving the enabled features. This is covered with my -dump-platform/-merge-platform/-use-platform proposal[2]. Regards, Akihiko Odaki [1] https://lore.kernel.org/r/2b62780c-a6cb-4262-beb5-81d54c14f545@daynix.com [2] https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/
On 2024/08/09 0:25, Peter Xu wrote: > On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote: >> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote: >>> On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: >>>> This is too big of a hammer. People already use what you call "cross >>>> migrate" and have for years. We are not going to stop developing >>>> features just because someone suddenly became aware of some such bit. >>>> If you care, you will have to work to solve the problem properly - >>>> nacking half baked hacks is the only tool maintainers have to make >>>> people work on hard problems. >>> >>> IMHO this is totally different thing. It's not about proposing a new >>> feature yet so far, it's about how we should fix a breakage first. >>> >>> And that's why I think we should fix it even in the simple way first, then >>> we consider anything more benefitial from perf side without breaking >>> anything, which should be on top of that. >>> >>> Thanks, >> >> As I said, once the quick hack is merged people stop caring. > > IMHO it's not a hack. It's a proper fix to me to disable it by default for > now. > > OTOH, having it ON always even knowing it can break migration is a hack to > me, when we don't have anything else to guard the migration. I think neither of them is a hack; they just deal with different scenarios summarized in [1]. We need apply a solution appropriate for each scenario, or we will end up with a broken system. Regards, Akihiko Odaki [1] https://lore.kernel.org/r/770300ac-7ed3-4aba-addb-b3f987cc6376@daynix.com/
On Sun, Aug 18, 2024 at 02:04:29PM +0900, Akihiko Odaki wrote: > On 2024/08/09 21:50, Fabiano Rosas wrote: > > Peter Xu <peterx@redhat.com> writes: > > > > > On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote: > > > > On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote: > > > > > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: > > > > > > This is too big of a hammer. People already use what you call "cross > > > > > > migrate" and have for years. We are not going to stop developing > > > > > > features just because someone suddenly became aware of some such bit. > > > > > > If you care, you will have to work to solve the problem properly - > > > > > > nacking half baked hacks is the only tool maintainers have to make > > > > > > people work on hard problems. > > > > > > > > > > IMHO this is totally different thing. It's not about proposing a new > > > > > feature yet so far, it's about how we should fix a breakage first. > > > > > > > > > > And that's why I think we should fix it even in the simple way first, then > > > > > we consider anything more benefitial from perf side without breaking > > > > > anything, which should be on top of that. > > > > > > > > > > Thanks, > > > > > > > > As I said, once the quick hack is merged people stop caring. > > > > > > IMHO it's not a hack. It's a proper fix to me to disable it by default for > > > now. > > > > > > OTOH, having it ON always even knowing it can break migration is a hack to > > > me, when we don't have anything else to guard the migration. > > > > > > > Mixing different kernel versions in migration is esoteric enough for > > > > this not to matter to most people. There's no rush I think, address > > > > it properly. > > > > > > Exactly mixing kernel versions will be tricky to users to identify, but > > > that's, AFAICT, exactly happening everywhere. We can't urge user to always > > > use the exact same kernels when we're talking about a VM cluster. 
That's > > > why I think allowing migration to work across those kernels matter. > > > > I also worry a bit about the scenario where the cluster changes slightly > > and now all VMs are already restricted by some option that requires the > > exact same kernel. Specifically, kernel changes in a cloud environment > > also happen due to factors completely unrelated to migration. I'm not > > sure the people managing the infra (who care about migration) will be > > gating kernel changes just because QEMU has been configured in a > > specific manner. > > I have wrote a bit about the expectation on the platform earlier[1], but let > me summarize it here. > > 1. I expect the user will not downgrade the platform of hosts after setting > up a VM. This is essential to enable any platform feature. > > 2. The user is allowed to upgrade the platform of hosts gradually. This > results in a situation with mixed platforms. The oldest platform is still > not older than the platform the VM is set up for. This enables the gradual > deployment strategy. > > 3. the user is allowed to downgrade the platform of hosts to the version > used when setting up the VM. This enables rollbacks in case of regression. > > With these expectations, we can ensure migratability by a) enabling platform > features available on all hosts when setting up the VM and b) saving the > enabled features. This is covered with my > -dump-platform/-merge-platform/-use-platform proposal[2]. I really like [2]. Do you plan to work on it? Does anyone else? > Regards, > Akihiko Odaki > > [1] > https://lore.kernel.org/r/2b62780c-a6cb-4262-beb5-81d54c14f545@daynix.com > [2] > https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/
On 2024/08/18 16:03, Michael S. Tsirkin wrote: > On Sun, Aug 18, 2024 at 02:04:29PM +0900, Akihiko Odaki wrote: >> On 2024/08/09 21:50, Fabiano Rosas wrote: >>> Peter Xu <peterx@redhat.com> writes: >>> >>>> On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote: >>>>> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote: >>>>>> On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote: >>>>>>> This is too big of a hammer. People already use what you call "cross >>>>>>> migrate" and have for years. We are not going to stop developing >>>>>>> features just because someone suddenly became aware of some such bit. >>>>>>> If you care, you will have to work to solve the problem properly - >>>>>>> nacking half baked hacks is the only tool maintainers have to make >>>>>>> people work on hard problems. >>>>>> >>>>>> IMHO this is totally different thing. It's not about proposing a new >>>>>> feature yet so far, it's about how we should fix a breakage first. >>>>>> >>>>>> And that's why I think we should fix it even in the simple way first, then >>>>>> we consider anything more benefitial from perf side without breaking >>>>>> anything, which should be on top of that. >>>>>> >>>>>> Thanks, >>>>> >>>>> As I said, once the quick hack is merged people stop caring. >>>> >>>> IMHO it's not a hack. It's a proper fix to me to disable it by default for >>>> now. >>>> >>>> OTOH, having it ON always even knowing it can break migration is a hack to >>>> me, when we don't have anything else to guard the migration. >>>> >>>>> Mixing different kernel versions in migration is esoteric enough for >>>>> this not to matter to most people. There's no rush I think, address >>>>> it properly. >>>> >>>> Exactly mixing kernel versions will be tricky to users to identify, but >>>> that's, AFAICT, exactly happening everywhere. We can't urge user to always >>>> use the exact same kernels when we're talking about a VM cluster. 
That's >>>> why I think allowing migration to work across those kernels matter. >>> >>> I also worry a bit about the scenario where the cluster changes slightly >>> and now all VMs are already restricted by some option that requires the >>> exact same kernel. Specifically, kernel changes in a cloud environment >>> also happen due to factors completely unrelated to migration. I'm not >>> sure the people managing the infra (who care about migration) will be >>> gating kernel changes just because QEMU has been configured in a >>> specific manner. >> >> I have wrote a bit about the expectation on the platform earlier[1], but let >> me summarize it here. >> >> 1. I expect the user will not downgrade the platform of hosts after setting >> up a VM. This is essential to enable any platform feature. >> >> 2. The user is allowed to upgrade the platform of hosts gradually. This >> results in a situation with mixed platforms. The oldest platform is still >> not older than the platform the VM is set up for. This enables the gradual >> deployment strategy. >> >> 3. the user is allowed to downgrade the platform of hosts to the version >> used when setting up the VM. This enables rollbacks in case of regression. >> >> With these expectations, we can ensure migratability by a) enabling platform >> features available on all hosts when setting up the VM and b) saving the >> enabled features. This is covered with my >> -dump-platform/-merge-platform/-use-platform proposal[2]. > > I really like [2]. Do you plan to work on it? Does anyone else? No, but I want to move "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" forward: https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/ This will clarify the existence of the "auto" semantics, which is to enable a platform feature based on availability. [2] will be regarded as a feature to improve the handling of the "auto" semantics once this change lands. 
Regards, Akihiko Odaki > >> Regards, >> Akihiko Odaki >> >> [1] >> https://lore.kernel.org/r/2b62780c-a6cb-4262-beb5-81d54c14f545@daynix.com >> [2] >> https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/ >
diff --git a/hw/core/machine.c b/hw/core/machine.c index f0d35c6401..a725e76738 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -38,10 +38,14 @@ #include "exec/confidential-guest-support.h" #include "hw/virtio/virtio.h" #include "hw/virtio/virtio-pci.h" +#include "hw/virtio/virtio-net.h" GlobalProperty hw_compat_8_0[] = { { "migration", "multifd-flush-after-each-section", "on"}, { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" }, + { TYPE_VIRTIO_NET, "host_uso", "off"}, + { TYPE_VIRTIO_NET, "guest_uso4", "off"}, + { TYPE_VIRTIO_NET, "guest_uso6", "off"}, }; const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0); diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index d2311e7d6e..bd0ead94fe 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n) return n->has_ufo; } +static int peer_has_uso(VirtIONet *n) +{ + if (!peer_has_vnet_hdr(n)) { + return 0; + } + + return qemu_has_uso(qemu_get_queue(n->nic)->peer); +} + static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, int version_1, int hash_report) { @@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6); virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN); + virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); + virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); } @@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO); } + if (!peer_has_uso(n)) { + virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); + virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); + } + if (!get_vhost_net(nc->peer)) { return features; } @@ 
-864,14 +883,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n) !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6))); } -static uint64_t virtio_net_guest_offloads_by_features(uint32_t features) +static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) { static const uint64_t guest_offloads_mask = (1ULL << VIRTIO_NET_F_GUEST_CSUM) | (1ULL << VIRTIO_NET_F_GUEST_TSO4) | (1ULL << VIRTIO_NET_F_GUEST_TSO6) | (1ULL << VIRTIO_NET_F_GUEST_ECN) | - (1ULL << VIRTIO_NET_F_GUEST_UFO); + (1ULL << VIRTIO_NET_F_GUEST_UFO) | + (1ULL << VIRTIO_NET_F_GUEST_USO4) | + (1ULL << VIRTIO_NET_F_GUEST_USO6); return guest_offloads_mask & features; } @@ -3924,6 +3945,12 @@ static Property virtio_net_properties[] = { DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN), DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str), DEFINE_PROP_BOOL("failover", VirtIONet, failover, false), + DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features, + VIRTIO_NET_F_GUEST_USO4, true), + DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features, + VIRTIO_NET_F_GUEST_USO6, true), + DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features, + VIRTIO_NET_F_HOST_USO, true), DEFINE_PROP_END_OF_LIST(), };