diff mbox series

[1/4] vhost: Re-enable vrings after setting features

Message ID 20230411150515.14020-2-hreitz@redhat.com (mailing list archive)
State New, archived
Headers show
Series vhost-user-fs: Internal migration | expand

Commit Message

Hanna Czenczek April 11, 2023, 3:05 p.m. UTC
If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
setting the vhost features will set this feature, too.  Doing so
disables all vrings, which may not be intended.

For example, enabling or disabling logging during migration requires
setting those features (to set or unset VHOST_F_LOG_ALL), which will
automatically disable all vrings.  In either case, the VM is running
(disabling logging is done after a failed or cancelled migration, and
only once the VM is running again, see comment in
memory_global_dirty_log_stop()), so the vrings should really be enabled.
As a result, the back-end seems to hang.

To fix this, we must remember whether the vrings are supposed to be
enabled, and, if so, re-enable them after a SET_FEATURES call that set
VHOST_USER_F_PROTOCOL_FEATURES.

It seems less than ideal that there is a short period in which the VM is
running but the vrings will be stopped (between SET_FEATURES and
SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
e.g. by introducing a new flag or vhost-user protocol feature to disable
disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
new functions for setting/clearing singular feature bits (so that
F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).

Even with such a potential addition to the protocol, we still need this
fix here, because we cannot expect that back-ends will implement this
addition.

Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
---
 include/hw/virtio/vhost.h | 10 ++++++++++
 hw/virtio/vhost.c         | 13 +++++++++++++
 2 files changed, 23 insertions(+)

Comments

German Maglione April 12, 2023, 10:55 a.m. UTC | #1
On Tue, Apr 11, 2023 at 5:05 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>
> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
> setting the vhost features will set this feature, too.  Doing so
> disables all vrings, which may not be intended.
>
> For example, enabling or disabling logging during migration requires
> setting those features (to set or unset VHOST_F_LOG_ALL), which will
> automatically disable all vrings.  In either case, the VM is running
> (disabling logging is done after a failed or cancelled migration, and
> only once the VM is running again, see comment in
> memory_global_dirty_log_stop()), so the vrings should really be enabled.
> As a result, the back-end seems to hang.
>
> To fix this, we must remember whether the vrings are supposed to be
> enabled, and, if so, re-enable them after a SET_FEATURES call that set
> VHOST_USER_F_PROTOCOL_FEATURES.
>
> It seems less than ideal that there is a short period in which the VM is
> running but the vrings will be stopped (between SET_FEATURES and
> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
> e.g. by introducing a new flag or vhost-user protocol feature to disable
> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
> new functions for setting/clearing singular feature bits (so that
> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
>

Could be the other way around?, I mean before commit
02b61f38d3574900fb4cc4c450b17c75956a6a04

so until v7.2rc0 we didn't have this problem with
VHOST_USER_F_PROTOCOL_FEATURES,
so "it seems" it's fine to start with the vrings enabled, could be
possible to go back to that
behavior (reflecting that in the spec) and add a new flag to start
with vrings disabled?

> Even with such a potential addition to the protocol, we still need this
> fix here, because we cannot expect that back-ends will implement this
> addition.
>
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>  include/hw/virtio/vhost.h | 10 ++++++++++
>  hw/virtio/vhost.c         | 13 +++++++++++++
>  2 files changed, 23 insertions(+)
>
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index a52f273347..2fe02ed5d4 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -90,6 +90,16 @@ struct vhost_dev {
>      int vq_index_end;
>      /* if non-zero, minimum required value for max_queues */
>      int num_queues;
> +
> +    /*
> +     * Whether the virtqueues are supposed to be enabled (via
> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
> +     * enabling/disabling logging) will disable all virtqueues if
> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
> +     * re-enable them if this field is set.
> +     */
> +    bool enable_vqs;
> +
>      /**
>       * vhost feature handling requires matching the feature set
>       * offered by a backend which may be a subset of the total
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index a266396576..cbff589efa 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
>  static QLIST_HEAD(, vhost_dev) vhost_devices =
>      QLIST_HEAD_INITIALIZER(vhost_devices);
>
> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
> +
>  bool vhost_has_free_slot(void)
>  {
>      unsigned int slots_limit = ~0U;
> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>          }
>      }
>
> +    if (dev->enable_vqs) {
> +        /*
> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
> +         * virtqueues, even if that was not intended; re-enable them if
> +         * necessary.
> +         */
> +        vhost_dev_set_vring_enable(dev, true);
> +    }
> +
>  out:
>      return r;
>  }
> @@ -1896,6 +1907,8 @@ int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
>
>  static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
>  {
> +    hdev->enable_vqs = enable;
> +
>      if (!hdev->vhost_ops->vhost_set_vring_enable) {
>          return 0;
>      }
> --
> 2.39.1
>
Hanna Czenczek April 12, 2023, 12:18 p.m. UTC | #2
On 12.04.23 12:55, German Maglione wrote:
> On Tue, Apr 11, 2023 at 5:05 PM Hanna Czenczek <hreitz@redhat.com> wrote:
>> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
>> setting the vhost features will set this feature, too.  Doing so
>> disables all vrings, which may not be intended.
>>
>> For example, enabling or disabling logging during migration requires
>> setting those features (to set or unset VHOST_F_LOG_ALL), which will
>> automatically disable all vrings.  In either case, the VM is running
>> (disabling logging is done after a failed or cancelled migration, and
>> only once the VM is running again, see comment in
>> memory_global_dirty_log_stop()), so the vrings should really be enabled.
>> As a result, the back-end seems to hang.
>>
>> To fix this, we must remember whether the vrings are supposed to be
>> enabled, and, if so, re-enable them after a SET_FEATURES call that set
>> VHOST_USER_F_PROTOCOL_FEATURES.
>>
>> It seems less than ideal that there is a short period in which the VM is
>> running but the vrings will be stopped (between SET_FEATURES and
>> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
>> e.g. by introducing a new flag or vhost-user protocol feature to disable
>> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
>> new functions for setting/clearing singular feature bits (so that
>> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
>>
> Could be the other way around?, I mean before commit
> 02b61f38d3574900fb4cc4c450b17c75956a6a04
>
> so until v7.2rc0 we didn't have this problem with
> VHOST_USER_F_PROTOCOL_FEATURES,
> so "it seems" it's fine to start with the vrings enabled, could be
> possible to go back to that
> behavior (reflecting that in the spec) and add a new flag to start
> with vrings disabled?

I’m not a fan of retroactively changing a public specification in an 
incompatible manner.  Also, “seems fine” isn’t enough of an argument to 
do so. :)  I’m not sure whether finding out if it’s actually fine is 
easy.  But in general, I try to abstain from retroactive spec changes...

I see the problem of qemu apparently not really caring for the specified 
meaning of the flag, indicating that this specified behavior is not 
optimal.  But the ideal way to fix this to me seems to add new flags to 
change the meaning to something more broadly useful.

But I’m not convinced either way.

Hanna
Stefan Hajnoczi April 12, 2023, 8:51 p.m. UTC | #3
On Tue, Apr 11, 2023 at 05:05:12PM +0200, Hanna Czenczek wrote:
> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
> setting the vhost features will set this feature, too.  Doing so
> disables all vrings, which may not be intended.
> 
> For example, enabling or disabling logging during migration requires
> setting those features (to set or unset VHOST_F_LOG_ALL), which will
> automatically disable all vrings.  In either case, the VM is running
> (disabling logging is done after a failed or cancelled migration, and
> only once the VM is running again, see comment in
> memory_global_dirty_log_stop()), so the vrings should really be enabled.
> As a result, the back-end seems to hang.
> 
> To fix this, we must remember whether the vrings are supposed to be
> enabled, and, if so, re-enable them after a SET_FEATURES call that set
> VHOST_USER_F_PROTOCOL_FEATURES.
> 
> It seems less than ideal that there is a short period in which the VM is
> running but the vrings will be stopped (between SET_FEATURES and
> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
> e.g. by introducing a new flag or vhost-user protocol feature to disable
> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
> new functions for setting/clearing singular feature bits (so that
> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
> 
> Even with such a potential addition to the protocol, we still need this
> fix here, because we cannot expect that back-ends will implement this
> addition.
> 
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>  include/hw/virtio/vhost.h | 10 ++++++++++
>  hw/virtio/vhost.c         | 13 +++++++++++++
>  2 files changed, 23 insertions(+)
> 
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index a52f273347..2fe02ed5d4 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -90,6 +90,16 @@ struct vhost_dev {
>      int vq_index_end;
>      /* if non-zero, minimum required value for max_queues */
>      int num_queues;
> +
> +    /*
> +     * Whether the virtqueues are supposed to be enabled (via
> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
> +     * enabling/disabling logging) will disable all virtqueues if
> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
> +     * re-enable them if this field is set.
> +     */
> +    bool enable_vqs;
> +
>      /**
>       * vhost feature handling requires matching the feature set
>       * offered by a backend which may be a subset of the total
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index a266396576..cbff589efa 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
>  static QLIST_HEAD(, vhost_dev) vhost_devices =
>      QLIST_HEAD_INITIALIZER(vhost_devices);
>  
> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
> +
>  bool vhost_has_free_slot(void)
>  {
>      unsigned int slots_limit = ~0U;
> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>          }
>      }
>  
> +    if (dev->enable_vqs) {
> +        /*
> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
> +         * virtqueues, even if that was not intended; re-enable them if
> +         * necessary.
> +         */
> +        vhost_dev_set_vring_enable(dev, true);
> +    }
> +
>  out:
>      return r;
>  }
> @@ -1896,6 +1907,8 @@ int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
>  
>  static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
>  {
> +    hdev->enable_vqs = enable;
> +
>      if (!hdev->vhost_ops->vhost_set_vring_enable) {
>          return 0;
>      }

The vhost-user spec doesn't say that VHOST_F_LOG_ALL needs to be toggled
at runtime and I don't think VHOST_USER_SET_PROTOCOL_FEATURES is
intended to be used like that. This issue shows why doing so is a bad
idea.

VHOST_F_LOG_ALL does not need to be toggled to control logging. Logging
is controlled at runtime by the presence of the dirty log
(VHOST_USER_SET_LOG_BASE) and the per-vring logging flag
(VHOST_VRING_F_LOG).

I suggest permanently enabling VHOST_F_LOG_ALL upon connection when the
the backend supports it. No spec changes are required.

libvhost-user looks like it will work. I didn't look at DPDK/SPDK, but
checking that it works there is important too.

I have CCed people who may be interested in this issue. This is the
first time I've looked at vhost-user logging, so this idea may not work.

Stefan
Maxime Coquelin April 13, 2023, 7:17 a.m. UTC | #4
On 4/12/23 22:51, Stefan Hajnoczi wrote:
> On Tue, Apr 11, 2023 at 05:05:12PM +0200, Hanna Czenczek wrote:
>> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
>> setting the vhost features will set this feature, too.  Doing so
>> disables all vrings, which may not be intended.
>>
>> For example, enabling or disabling logging during migration requires
>> setting those features (to set or unset VHOST_F_LOG_ALL), which will
>> automatically disable all vrings.  In either case, the VM is running
>> (disabling logging is done after a failed or cancelled migration, and
>> only once the VM is running again, see comment in
>> memory_global_dirty_log_stop()), so the vrings should really be enabled.
>> As a result, the back-end seems to hang.
>>
>> To fix this, we must remember whether the vrings are supposed to be
>> enabled, and, if so, re-enable them after a SET_FEATURES call that set
>> VHOST_USER_F_PROTOCOL_FEATURES.
>>
>> It seems less than ideal that there is a short period in which the VM is
>> running but the vrings will be stopped (between SET_FEATURES and
>> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
>> e.g. by introducing a new flag or vhost-user protocol feature to disable
>> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
>> new functions for setting/clearing singular feature bits (so that
>> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
>>
>> Even with such a potential addition to the protocol, we still need this
>> fix here, because we cannot expect that back-ends will implement this
>> addition.
>>
>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>> ---
>>   include/hw/virtio/vhost.h | 10 ++++++++++
>>   hw/virtio/vhost.c         | 13 +++++++++++++
>>   2 files changed, 23 insertions(+)
>>
>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>> index a52f273347..2fe02ed5d4 100644
>> --- a/include/hw/virtio/vhost.h
>> +++ b/include/hw/virtio/vhost.h
>> @@ -90,6 +90,16 @@ struct vhost_dev {
>>       int vq_index_end;
>>       /* if non-zero, minimum required value for max_queues */
>>       int num_queues;
>> +
>> +    /*
>> +     * Whether the virtqueues are supposed to be enabled (via
>> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
>> +     * enabling/disabling logging) will disable all virtqueues if
>> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
>> +     * re-enable them if this field is set.
>> +     */
>> +    bool enable_vqs;
>> +
>>       /**
>>        * vhost feature handling requires matching the feature set
>>        * offered by a backend which may be a subset of the total
>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>> index a266396576..cbff589efa 100644
>> --- a/hw/virtio/vhost.c
>> +++ b/hw/virtio/vhost.c
>> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
>>   static QLIST_HEAD(, vhost_dev) vhost_devices =
>>       QLIST_HEAD_INITIALIZER(vhost_devices);
>>   
>> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
>> +
>>   bool vhost_has_free_slot(void)
>>   {
>>       unsigned int slots_limit = ~0U;
>> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>>           }
>>       }
>>   
>> +    if (dev->enable_vqs) {
>> +        /*
>> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
>> +         * virtqueues, even if that was not intended; re-enable them if
>> +         * necessary.
>> +         */
>> +        vhost_dev_set_vring_enable(dev, true);
>> +    }
>> +
>>   out:
>>       return r;
>>   }
>> @@ -1896,6 +1907,8 @@ int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
>>   
>>   static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
>>   {
>> +    hdev->enable_vqs = enable;
>> +
>>       if (!hdev->vhost_ops->vhost_set_vring_enable) {
>>           return 0;
>>       }
> 
> The vhost-user spec doesn't say that VHOST_F_LOG_ALL needs to be toggled
> at runtime and I don't think VHOST_USER_SET_PROTOCOL_FEATURES is
> intended to be used like that. This issue shows why doing so is a bad
> idea.
> 
> VHOST_F_LOG_ALL does not need to be toggled to control logging. Logging
> is controlled at runtime by the presence of the dirty log
> (VHOST_USER_SET_LOG_BASE) and the per-vring logging flag
> (VHOST_VRING_F_LOG).
> 
> I suggest permanently enabling VHOST_F_LOG_ALL upon connection when the
> the backend supports it. No spec changes are required.
> 
> libvhost-user looks like it will work. I didn't look at DPDK/SPDK, but
> checking that it works there is important too.
> 
> I have CCed people who may be interested in this issue. This is the
> first time I've looked at vhost-user logging, so this idea may not work.

In the case of DPDK, we rely on VHOST_F_LOG_ALL to be set to know
whether we should do dirty pages logging or not [0], so setting this
feature at init time will cause performance degradation. The check on
whether the log base address has been set is done afterwards.

Regards,
Maxime

> Stefan

[0]: https://git.dpdk.org/dpdk/tree/lib/vhost/vhost.h#n594
Hanna Czenczek April 13, 2023, 8:19 a.m. UTC | #5
On 12.04.23 22:51, Stefan Hajnoczi wrote:
> On Tue, Apr 11, 2023 at 05:05:12PM +0200, Hanna Czenczek wrote:
>> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
>> setting the vhost features will set this feature, too.  Doing so
>> disables all vrings, which may not be intended.
>>
>> For example, enabling or disabling logging during migration requires
>> setting those features (to set or unset VHOST_F_LOG_ALL), which will
>> automatically disable all vrings.  In either case, the VM is running
>> (disabling logging is done after a failed or cancelled migration, and
>> only once the VM is running again, see comment in
>> memory_global_dirty_log_stop()), so the vrings should really be enabled.
>> As a result, the back-end seems to hang.
>>
>> To fix this, we must remember whether the vrings are supposed to be
>> enabled, and, if so, re-enable them after a SET_FEATURES call that set
>> VHOST_USER_F_PROTOCOL_FEATURES.
>>
>> It seems less than ideal that there is a short period in which the VM is
>> running but the vrings will be stopped (between SET_FEATURES and
>> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
>> e.g. by introducing a new flag or vhost-user protocol feature to disable
>> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
>> new functions for setting/clearing singular feature bits (so that
>> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
>>
>> Even with such a potential addition to the protocol, we still need this
>> fix here, because we cannot expect that back-ends will implement this
>> addition.
>>
>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>> ---
>>   include/hw/virtio/vhost.h | 10 ++++++++++
>>   hw/virtio/vhost.c         | 13 +++++++++++++
>>   2 files changed, 23 insertions(+)
>>
>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>> index a52f273347..2fe02ed5d4 100644
>> --- a/include/hw/virtio/vhost.h
>> +++ b/include/hw/virtio/vhost.h
>> @@ -90,6 +90,16 @@ struct vhost_dev {
>>       int vq_index_end;
>>       /* if non-zero, minimum required value for max_queues */
>>       int num_queues;
>> +
>> +    /*
>> +     * Whether the virtqueues are supposed to be enabled (via
>> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
>> +     * enabling/disabling logging) will disable all virtqueues if
>> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
>> +     * re-enable them if this field is set.
>> +     */
>> +    bool enable_vqs;
>> +
>>       /**
>>        * vhost feature handling requires matching the feature set
>>        * offered by a backend which may be a subset of the total
>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>> index a266396576..cbff589efa 100644
>> --- a/hw/virtio/vhost.c
>> +++ b/hw/virtio/vhost.c
>> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
>>   static QLIST_HEAD(, vhost_dev) vhost_devices =
>>       QLIST_HEAD_INITIALIZER(vhost_devices);
>>   
>> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
>> +
>>   bool vhost_has_free_slot(void)
>>   {
>>       unsigned int slots_limit = ~0U;
>> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>>           }
>>       }
>>   
>> +    if (dev->enable_vqs) {
>> +        /*
>> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
>> +         * virtqueues, even if that was not intended; re-enable them if
>> +         * necessary.
>> +         */
>> +        vhost_dev_set_vring_enable(dev, true);
>> +    }
>> +
>>   out:
>>       return r;
>>   }
>> @@ -1896,6 +1907,8 @@ int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
>>   
>>   static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
>>   {
>> +    hdev->enable_vqs = enable;
>> +
>>       if (!hdev->vhost_ops->vhost_set_vring_enable) {
>>           return 0;
>>       }
> The vhost-user spec doesn't say that VHOST_F_LOG_ALL needs to be toggled
> at runtime and I don't think VHOST_USER_SET_PROTOCOL_FEATURES is
> intended to be used like that. This issue shows why doing so is a bad
> idea.
>
> VHOST_F_LOG_ALL does not need to be toggled to control logging. Logging
> is controlled at runtime by the presence of the dirty log
> (VHOST_USER_SET_LOG_BASE) and the per-vring logging flag
> (VHOST_VRING_F_LOG).

Technically, the spec doesn’t say that SET_LOG_BASE is required.  It says:

“To start/stop logging of data/used ring writes, the front-end may send 
messages VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and 
VHOST_USER_SET_VRING_ADDR with VHOST_VRING_F_LOG in ring’s flags set to 
1/0, respectively.”

(So the spec also very much does imply that toggling F_LOG_ALL at 
runtime is a valid way to enable/disable logging.  If we were to no 
longer do that, we should clarify it there.)

I mean, naturally, logging without a shared memory area to log in to 
isn’t much fun, so we could clarify that SET_LOG_BASE is also a 
requirement, but it looks to me as if we can’t use SET_LOG_BASE to 
disable logging, because it’s supposed to always pass a valid FD (at 
least libvhost-user expects this: 
https://gitlab.com/qemu-project/qemu/-/blob/master/subprojects/libvhost-user/libvhost-user.c#L1044). 
So after a cancelled migration, the dirty bitmap SHM will stay around 
indefinitely (which is already not great, but if we were to use the 
presence of that bitmap as an indicator as to whether we should log or 
not, it would be worse).

So the VRING_F_LOG flag remains.

> I suggest permanently enabling VHOST_F_LOG_ALL upon connection when the
> the backend supports it. No spec changes are required.
>
> libvhost-user looks like it will work. I didn't look at DPDK/SPDK, but
> checking that it works there is important too.

I can’t find VRING_F_LOG in libvhost-user, so what protocol do you mean 
exactly that would work in libvhost-user?  Because SET_LOG_BASE won’t 
work, as you can’t use it to disable logging.

(For DPDK, I’m not sure.  It looks like it sometimes takes VRING_F_LOG 
into account, but I think only when it comes to logging accesses to the 
vring specifically, i.e. not DMA to other areas of guest memory?  I 
think only places that use vq->log_guest_addr implicitly check it, 
others don’t.  So for example, 
vhost_log_write_iova()/vhost_log_cache_write_iova() don’t seem to check 
VRING_F_LOG, which seem to be the functions generally used for writes to 
memory outside of the vrings.  So here, even if VRING_F_LOG is disabled 
for all vrings, as long as a log SHM is set, all writes to memory 
outside of the immediate vrings seem to cause logging (as long as 
F_LOG_ALL is set).)

Hanna

> I have CCed people who may be interested in this issue. This is the
> first time I've looked at vhost-user logging, so this idea may not work.
>
> Stefan
Stefan Hajnoczi April 13, 2023, 11:03 a.m. UTC | #6
On Tue, 11 Apr 2023 at 11:05, Hanna Czenczek <hreitz@redhat.com> wrote:
>
> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
> setting the vhost features will set this feature, too.  Doing so
> disables all vrings, which may not be intended.
>
> For example, enabling or disabling logging during migration requires
> setting those features (to set or unset VHOST_F_LOG_ALL), which will
> automatically disable all vrings.  In either case, the VM is running
> (disabling logging is done after a failed or cancelled migration, and
> only once the VM is running again, see comment in
> memory_global_dirty_log_stop()), so the vrings should really be enabled.
> As a result, the back-end seems to hang.
>
> To fix this, we must remember whether the vrings are supposed to be
> enabled, and, if so, re-enable them after a SET_FEATURES call that set
> VHOST_USER_F_PROTOCOL_FEATURES.
>
> It seems less than ideal that there is a short period in which the VM is
> running but the vrings will be stopped (between SET_FEATURES and
> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
> e.g. by introducing a new flag or vhost-user protocol feature to disable
> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
> new functions for setting/clearing singular feature bits (so that
> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
>
> Even with such a potential addition to the protocol, we still need this
> fix here, because we cannot expect that back-ends will implement this
> addition.
>
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>  include/hw/virtio/vhost.h | 10 ++++++++++
>  hw/virtio/vhost.c         | 13 +++++++++++++
>  2 files changed, 23 insertions(+)
>
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index a52f273347..2fe02ed5d4 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -90,6 +90,16 @@ struct vhost_dev {
>      int vq_index_end;
>      /* if non-zero, minimum required value for max_queues */
>      int num_queues;
> +
> +    /*
> +     * Whether the virtqueues are supposed to be enabled (via
> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
> +     * enabling/disabling logging) will disable all virtqueues if
> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
> +     * re-enable them if this field is set.
> +     */
> +    bool enable_vqs;
> +
>      /**
>       * vhost feature handling requires matching the feature set
>       * offered by a backend which may be a subset of the total
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index a266396576..cbff589efa 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
>  static QLIST_HEAD(, vhost_dev) vhost_devices =
>      QLIST_HEAD_INITIALIZER(vhost_devices);
>
> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
> +
>  bool vhost_has_free_slot(void)
>  {
>      unsigned int slots_limit = ~0U;
> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>          }
>      }
>
> +    if (dev->enable_vqs) {
> +        /*
> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all

Is there a reason to put this vhost-user-specific workaround in
vhost.c instead of vhost-user.c?

Stefan
Stefan Hajnoczi April 13, 2023, 11:03 a.m. UTC | #7
On Thu, 13 Apr 2023 at 04:20, Hanna Czenczek <hreitz@redhat.com> wrote:
>
> On 12.04.23 22:51, Stefan Hajnoczi wrote:
> > On Tue, Apr 11, 2023 at 05:05:12PM +0200, Hanna Czenczek wrote:
> >> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
> >> setting the vhost features will set this feature, too.  Doing so
> >> disables all vrings, which may not be intended.
> >>
> >> For example, enabling or disabling logging during migration requires
> >> setting those features (to set or unset VHOST_F_LOG_ALL), which will
> >> automatically disable all vrings.  In either case, the VM is running
> >> (disabling logging is done after a failed or cancelled migration, and
> >> only once the VM is running again, see comment in
> >> memory_global_dirty_log_stop()), so the vrings should really be enabled.
> >> As a result, the back-end seems to hang.
> >>
> >> To fix this, we must remember whether the vrings are supposed to be
> >> enabled, and, if so, re-enable them after a SET_FEATURES call that set
> >> VHOST_USER_F_PROTOCOL_FEATURES.
> >>
> >> It seems less than ideal that there is a short period in which the VM is
> >> running but the vrings will be stopped (between SET_FEATURES and
> >> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
> >> e.g. by introducing a new flag or vhost-user protocol feature to disable
> >> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
> >> new functions for setting/clearing singular feature bits (so that
> >> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
> >>
> >> Even with such a potential addition to the protocol, we still need this
> >> fix here, because we cannot expect that back-ends will implement this
> >> addition.
> >>
> >> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> >> ---
> >>   include/hw/virtio/vhost.h | 10 ++++++++++
> >>   hw/virtio/vhost.c         | 13 +++++++++++++
> >>   2 files changed, 23 insertions(+)
> >>
> >> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> >> index a52f273347..2fe02ed5d4 100644
> >> --- a/include/hw/virtio/vhost.h
> >> +++ b/include/hw/virtio/vhost.h
> >> @@ -90,6 +90,16 @@ struct vhost_dev {
> >>       int vq_index_end;
> >>       /* if non-zero, minimum required value for max_queues */
> >>       int num_queues;
> >> +
> >> +    /*
> >> +     * Whether the virtqueues are supposed to be enabled (via
> >> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
> >> +     * enabling/disabling logging) will disable all virtqueues if
> >> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
> >> +     * re-enable them if this field is set.
> >> +     */
> >> +    bool enable_vqs;
> >> +
> >>       /**
> >>        * vhost feature handling requires matching the feature set
> >>        * offered by a backend which may be a subset of the total
> >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >> index a266396576..cbff589efa 100644
> >> --- a/hw/virtio/vhost.c
> >> +++ b/hw/virtio/vhost.c
> >> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
> >>   static QLIST_HEAD(, vhost_dev) vhost_devices =
> >>       QLIST_HEAD_INITIALIZER(vhost_devices);
> >>
> >> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
> >> +
> >>   bool vhost_has_free_slot(void)
> >>   {
> >>       unsigned int slots_limit = ~0U;
> >> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
> >>           }
> >>       }
> >>
> >> +    if (dev->enable_vqs) {
> >> +        /*
> >> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
> >> +         * virtqueues, even if that was not intended; re-enable them if
> >> +         * necessary.
> >> +         */
> >> +        vhost_dev_set_vring_enable(dev, true);
> >> +    }
> >> +
> >>   out:
> >>       return r;
> >>   }
> >> @@ -1896,6 +1907,8 @@ int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
> >>
> >>   static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
> >>   {
> >> +    hdev->enable_vqs = enable;
> >> +
> >>       if (!hdev->vhost_ops->vhost_set_vring_enable) {
> >>           return 0;
> >>       }
> > The vhost-user spec doesn't say that VHOST_F_LOG_ALL needs to be toggled
> > at runtime and I don't think VHOST_USER_SET_PROTOCOL_FEATURES is
> > intended to be used like that. This issue shows why doing so is a bad
> > idea.
> >
> > VHOST_F_LOG_ALL does not need to be toggled to control logging. Logging
> > is controlled at runtime by the presence of the dirty log
> > (VHOST_USER_SET_LOG_BASE) and the per-vring logging flag
> > (VHOST_VRING_F_LOG).
>
> Technically, the spec doesn’t say that SET_LOG_BASE is required.  It says:
>
> “To start/stop logging of data/used ring writes, the front-end may send
> messages VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and
> VHOST_USER_SET_VRING_ADDR with VHOST_VRING_F_LOG in ring’s flags set to
> 1/0, respectively.”
>
> (So the spec also very much does imply that toggling F_LOG_ALL at
> runtime is a valid way to enable/disable logging.  If we were to no
> longer do that, we should clarify it there.)

I missed that VHOST_VRING_F_LOG only controls logging used ring writes
while writes to descriptors are always logged when VHOST_F_LOG_ALL is
set. I agree that the spec does require VHOST_F_LOG_ALL to be toggled
at runtime.

What I suggested won't work.

> I mean, naturally, logging without a shared memory area to log in to
> isn’t much fun, so we could clarify that SET_LOG_BASE is also a
> requirement, but it looks to me as if we can’t use SET_LOG_BASE to
> disable logging, because it’s supposed to always pass a valid FD (at
> least libvhost-user expects this:
> https://gitlab.com/qemu-project/qemu/-/blob/master/subprojects/libvhost-user/libvhost-user.c#L1044).

As an aside: I don't understand how logging without an fd is supposed
to work in QEMU's code or in the vhost-user spec. QEMU does not
support that case even though it's written as if shmfd were optional.

> So after a cancelled migration, the dirty bitmap SHM will stay around
> indefinitely (which is already not great, but if we were to use the
> presence of that bitmap as an indicator as to whether we should log or
> not, it would be worse).

Yes, continuing to log forever is worse.

>
> So the VRING_F_LOG flag remains.
>
> > I suggest permanently enabling VHOST_F_LOG_ALL upon connection when the
> > the backend supports it. No spec changes are required.
> >
> > libvhost-user looks like it will work. I didn't look at DPDK/SPDK, but
> > checking that it works there is important too.
>
> I can’t find VRING_F_LOG in libvhost-user, so what protocol do you mean
> exactly that would work in libvhost-user?  Because SET_LOG_BASE won’t
> work, as you can’t use it to disable logging.

That's true. There is no way to disable logging.

> (For DPDK, I’m not sure.  It looks like it sometimes takes VRING_F_LOG
> into account, but I think only when it comes to logging accesses to the
> vring specifically, i.e. not DMA to other areas of guest memory?  I
> think only places that use vq->log_guest_addr implicitly check it,
> others don’t.  So for example,
> vhost_log_write_iova()/vhost_log_cache_write_iova() don’t seem to check
> VRING_F_LOG, which seem to be the functions generally used for writes to
> memory outside of the vrings.  So here, even if VRING_F_LOG is disabled
> for all vrings, as long as a log SHM is set, all writes to memory
> outside of the immediate vrings seem to cause logging (as long as
> F_LOG_ALL is set).)
>
> Hanna
>
> > I have CCed people who may be interested in this issue. This is the
> > first time I've looked at vhost-user logging, so this idea may not work.
> >
> > Stefan
>
>
Michael S. Tsirkin April 13, 2023, 1:19 p.m. UTC | #8
On Tue, Apr 11, 2023 at 05:05:12PM +0200, Hanna Czenczek wrote:
> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
> setting the vhost features will set this feature, too.  Doing so
> disables all vrings, which may not be intended.

Hmm not sure I understand: why does it disable vrings?

> For example, enabling or disabling logging during migration requires
> setting those features (to set or unset VHOST_F_LOG_ALL), which will
> automatically disable all vrings.  In either case, the VM is running
> (disabling logging is done after a failed or cancelled migration, and
> only once the VM is running again, see comment in
> memory_global_dirty_log_stop()), so the vrings should really be enabled.
> As a result, the back-end seems to hang.
> 
> To fix this, we must remember whether the vrings are supposed to be
> enabled, and, if so, re-enable them after a SET_FEATURES call that set
> VHOST_USER_F_PROTOCOL_FEATURES.
> 
> It seems less than ideal that there is a short period in which the VM is
> running but the vrings will be stopped (between SET_FEATURES and
> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
> e.g. by introducing a new flag or vhost-user protocol feature to disable
> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
> new functions for setting/clearing singular feature bits (so that
> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
> 
> Even with such a potential addition to the protocol, we still need this
> fix here, because we cannot expect that back-ends will implement this
> addition.
> 
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>  include/hw/virtio/vhost.h | 10 ++++++++++
>  hw/virtio/vhost.c         | 13 +++++++++++++
>  2 files changed, 23 insertions(+)
> 
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index a52f273347..2fe02ed5d4 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -90,6 +90,16 @@ struct vhost_dev {
>      int vq_index_end;
>      /* if non-zero, minimum required value for max_queues */
>      int num_queues;
> +
> +    /*
> +     * Whether the virtqueues are supposed to be enabled (via
> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
> +     * enabling/disabling logging) will disable all virtqueues if
> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
> +     * re-enable them if this field is set.
> +     */
> +    bool enable_vqs;
> +
>      /**
>       * vhost feature handling requires matching the feature set
>       * offered by a backend which may be a subset of the total
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index a266396576..cbff589efa 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
>  static QLIST_HEAD(, vhost_dev) vhost_devices =
>      QLIST_HEAD_INITIALIZER(vhost_devices);
>  
> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
> +
>  bool vhost_has_free_slot(void)
>  {
>      unsigned int slots_limit = ~0U;
> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>          }
>      }
>  
> +    if (dev->enable_vqs) {
> +        /*
> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
> +         * virtqueues, even if that was not intended; re-enable them if
> +         * necessary.
> +         */
> +        vhost_dev_set_vring_enable(dev, true);
> +    }
> +
>  out:
>      return r;
>  }
> @@ -1896,6 +1907,8 @@ int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
>  
>  static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
>  {
> +    hdev->enable_vqs = enable;
> +
>      if (!hdev->vhost_ops->vhost_set_vring_enable) {
>          return 0;
>      }
> -- 
> 2.39.1
Anton Kuchin April 13, 2023, 2:24 p.m. UTC | #9
On 13/04/2023 14:03, Stefan Hajnoczi wrote:
> On Thu, 13 Apr 2023 at 04:20, Hanna Czenczek <hreitz@redhat.com> wrote:
>> On 12.04.23 22:51, Stefan Hajnoczi wrote:
>>> On Tue, Apr 11, 2023 at 05:05:12PM +0200, Hanna Czenczek wrote:
>>>> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
>>>> setting the vhost features will set this feature, too.  Doing so
>>>> disables all vrings, which may not be intended.
>>>>
>>>> For example, enabling or disabling logging during migration requires
>>>> setting those features (to set or unset VHOST_F_LOG_ALL), which will
>>>> automatically disable all vrings.  In either case, the VM is running
>>>> (disabling logging is done after a failed or cancelled migration, and
>>>> only once the VM is running again, see comment in
>>>> memory_global_dirty_log_stop()), so the vrings should really be enabled.
>>>> As a result, the back-end seems to hang.
>>>>
>>>> To fix this, we must remember whether the vrings are supposed to be
>>>> enabled, and, if so, re-enable them after a SET_FEATURES call that set
>>>> VHOST_USER_F_PROTOCOL_FEATURES.
>>>>
>>>> It seems less than ideal that there is a short period in which the VM is
>>>> running but the vrings will be stopped (between SET_FEATURES and
>>>> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
>>>> e.g. by introducing a new flag or vhost-user protocol feature to disable
>>>> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
>>>> new functions for setting/clearing singular feature bits (so that
>>>> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
>>>>
>>>> Even with such a potential addition to the protocol, we still need this
>>>> fix here, because we cannot expect that back-ends will implement this
>>>> addition.
>>>>
>>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>>>> ---
>>>>    include/hw/virtio/vhost.h | 10 ++++++++++
>>>>    hw/virtio/vhost.c         | 13 +++++++++++++
>>>>    2 files changed, 23 insertions(+)
>>>>
>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>> index a52f273347..2fe02ed5d4 100644
>>>> --- a/include/hw/virtio/vhost.h
>>>> +++ b/include/hw/virtio/vhost.h
>>>> @@ -90,6 +90,16 @@ struct vhost_dev {
>>>>        int vq_index_end;
>>>>        /* if non-zero, minimum required value for max_queues */
>>>>        int num_queues;
>>>> +
>>>> +    /*
>>>> +     * Whether the virtqueues are supposed to be enabled (via
>>>> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
>>>> +     * enabling/disabling logging) will disable all virtqueues if
>>>> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
>>>> +     * re-enable them if this field is set.
>>>> +     */
>>>> +    bool enable_vqs;
>>>> +
>>>>        /**
>>>>         * vhost feature handling requires matching the feature set
>>>>         * offered by a backend which may be a subset of the total
>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>> index a266396576..cbff589efa 100644
>>>> --- a/hw/virtio/vhost.c
>>>> +++ b/hw/virtio/vhost.c
>>>> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
>>>>    static QLIST_HEAD(, vhost_dev) vhost_devices =
>>>>        QLIST_HEAD_INITIALIZER(vhost_devices);
>>>>
>>>> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
>>>> +
>>>>    bool vhost_has_free_slot(void)
>>>>    {
>>>>        unsigned int slots_limit = ~0U;
>>>> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>>>>            }
>>>>        }
>>>>
>>>> +    if (dev->enable_vqs) {
>>>> +        /*
>>>> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
>>>> +         * virtqueues, even if that was not intended; re-enable them if
>>>> +         * necessary.
>>>> +         */
>>>> +        vhost_dev_set_vring_enable(dev, true);
>>>> +    }
>>>> +
>>>>    out:
>>>>        return r;
>>>>    }
>>>> @@ -1896,6 +1907,8 @@ int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
>>>>
>>>>    static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
>>>>    {
>>>> +    hdev->enable_vqs = enable;
>>>> +
>>>>        if (!hdev->vhost_ops->vhost_set_vring_enable) {
>>>>            return 0;
>>>>        }
>>> The vhost-user spec doesn't say that VHOST_F_LOG_ALL needs to be toggled
>>> at runtime and I don't think VHOST_USER_SET_PROTOCOL_FEATURES is
>>> intended to be used like that. This issue shows why doing so is a bad
>>> idea.
>>>
>>> VHOST_F_LOG_ALL does not need to be toggled to control logging. Logging
>>> is controlled at runtime by the presence of the dirty log
>>> (VHOST_USER_SET_LOG_BASE) and the per-vring logging flag
>>> (VHOST_VRING_F_LOG).
>> Technically, the spec doesn’t say that SET_LOG_BASE is required.  It says:
>>
>> “To start/stop logging of data/used ring writes, the front-end may send
>> messages VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and
>> VHOST_USER_SET_VRING_ADDR with VHOST_VRING_F_LOG in ring’s flags set to
>> 1/0, respectively.”
>>
>> (So the spec also very much does imply that toggling F_LOG_ALL at
>> runtime is a valid way to enable/disable logging.  If we were to no
>> longer do that, we should clarify it there.)
> I missed that VHOST_VRING_F_LOG only controls logging used ring writes
> while writes to descriptors are always logged when VHOST_F_LOG_ALL is
> set. I agree that the spec does require VHOST_F_LOG_ALL to be toggled
> at runtime.
>
> What I suggested won't work.

But is there a valid use-case for logging some dirty memory but not all?
I can't understand if this is a feature or a just flaw in specification.

>
>> I mean, naturally, logging without a shared memory area to log in to
>> isn’t much fun, so we could clarify that SET_LOG_BASE is also a
>> requirement, but it looks to me as if we can’t use SET_LOG_BASE to
>> disable logging, because it’s supposed to always pass a valid FD (at
>> least libvhost-user expects this:
>> https://gitlab.com/qemu-project/qemu/-/blob/master/subprojects/libvhost-user/libvhost-user.c#L1044).
> As an aside: I don't understand how logging without an fd is supposed
> to work in QEMU's code or in the vhost-user spec. QEMU does not
> support that case even though it's written as if shmfd were optional.
>
>> So after a cancelled migration, the dirty bitmap SHM will stay around
>> indefinitely (which is already not great, but if we were to use the
>> presence of that bitmap as an indicator as to whether we should log or
>> not, it would be worse).
> Yes, continuing to log forever is worse.
>
>> So the VRING_F_LOG flag remains.
>>
>>> I suggest permanently enabling VHOST_F_LOG_ALL upon connection when the
>>> the backend supports it. No spec changes are required.
>>>
>>> libvhost-user looks like it will work. I didn't look at DPDK/SPDK, but
>>> checking that it works there is important too.
>> I can’t find VRING_F_LOG in libvhost-user, so what protocol do you mean
>> exactly that would work in libvhost-user?  Because SET_LOG_BASE won’t
>> work, as you can’t use it to disable logging.
> That's true. There is no way to disable logging.
>
>> (For DPDK, I’m not sure.  It looks like it sometimes takes VRING_F_LOG
>> into account, but I think only when it comes to logging accesses to the
>> vring specifically, i.e. not DMA to other areas of guest memory?  I
>> think only places that use vq->log_guest_addr implicitly check it,
>> others don’t.  So for example,
>> vhost_log_write_iova()/vhost_log_cache_write_iova() don’t seem to check
>> VRING_F_LOG, which seem to be the functions generally used for writes to
>> memory outside of the vrings.  So here, even if VRING_F_LOG is disabled
>> for all vrings, as long as a log SHM is set, all writes to memory
>> outside of the immediate vrings seem to cause logging (as long as
>> F_LOG_ALL is set).)
>>
>> Hanna
>>
>>> I have CCed people who may be interested in this issue. This is the
>>> first time I've looked at vhost-user logging, so this idea may not work.
>>>
>>> Stefan
>>
Michael S. Tsirkin April 13, 2023, 3:48 p.m. UTC | #10
On Thu, Apr 13, 2023 at 05:24:36PM +0300, Anton Kuchin wrote:
> But is there a valid use-case for logging some dirty memory but not all?
> I can't understand if this is a feature or a just flaw in specification.

IRC the use-case originally conceived was for shadow VQs.  If you use
shadow VQs the VQ access by backend does not change memory since shadow
VQ is not in memory. Not a practical concern right now but there you
have it.
Hanna Czenczek April 13, 2023, 5:32 p.m. UTC | #11
On 13.04.23 13:03, Stefan Hajnoczi wrote:
> On Tue, 11 Apr 2023 at 11:05, Hanna Czenczek <hreitz@redhat.com> wrote:
>> If the back-end supports the VHOST_USER_F_PROTOCOL_FEATURES feature,
>> setting the vhost features will set this feature, too.  Doing so
>> disables all vrings, which may not be intended.
>>
>> For example, enabling or disabling logging during migration requires
>> setting those features (to set or unset VHOST_F_LOG_ALL), which will
>> automatically disable all vrings.  In either case, the VM is running
>> (disabling logging is done after a failed or cancelled migration, and
>> only once the VM is running again, see comment in
>> memory_global_dirty_log_stop()), so the vrings should really be enabled.
>> As a result, the back-end seems to hang.
>>
>> To fix this, we must remember whether the vrings are supposed to be
>> enabled, and, if so, re-enable them after a SET_FEATURES call that set
>> VHOST_USER_F_PROTOCOL_FEATURES.
>>
>> It seems less than ideal that there is a short period in which the VM is
>> running but the vrings will be stopped (between SET_FEATURES and
>> SET_VRING_ENABLE).  To fix this, we would need to change the protocol,
>> e.g. by introducing a new flag or vhost-user protocol feature to disable
>> disabling vrings whenever VHOST_USER_F_PROTOCOL_FEATURES is set, or add
>> new functions for setting/clearing singular feature bits (so that
>> F_LOG_ALL can be set/cleared without touching F_PROTOCOL_FEATURES).
>>
>> Even with such a potential addition to the protocol, we still need this
>> fix here, because we cannot expect that back-ends will implement this
>> addition.
>>
>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>> ---
>>   include/hw/virtio/vhost.h | 10 ++++++++++
>>   hw/virtio/vhost.c         | 13 +++++++++++++
>>   2 files changed, 23 insertions(+)
>>
>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>> index a52f273347..2fe02ed5d4 100644
>> --- a/include/hw/virtio/vhost.h
>> +++ b/include/hw/virtio/vhost.h
>> @@ -90,6 +90,16 @@ struct vhost_dev {
>>       int vq_index_end;
>>       /* if non-zero, minimum required value for max_queues */
>>       int num_queues;
>> +
>> +    /*
>> +     * Whether the virtqueues are supposed to be enabled (via
>> +     * SET_VRING_ENABLE).  Setting the features (e.g. for
>> +     * enabling/disabling logging) will disable all virtqueues if
>> +     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
>> +     * re-enable them if this field is set.
>> +     */
>> +    bool enable_vqs;
>> +
>>       /**
>>        * vhost feature handling requires matching the feature set
>>        * offered by a backend which may be a subset of the total
>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>> index a266396576..cbff589efa 100644
>> --- a/hw/virtio/vhost.c
>> +++ b/hw/virtio/vhost.c
>> @@ -50,6 +50,8 @@ static unsigned int used_memslots;
>>   static QLIST_HEAD(, vhost_dev) vhost_devices =
>>       QLIST_HEAD_INITIALIZER(vhost_devices);
>>
>> +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
>> +
>>   bool vhost_has_free_slot(void)
>>   {
>>       unsigned int slots_limit = ~0U;
>> @@ -899,6 +901,15 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>>           }
>>       }
>>
>> +    if (dev->enable_vqs) {
>> +        /*
>> +         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
> Is there a reason to put this vhost-user-specific workaround in
> vhost.c instead of vhost-user.c?

My feeling was that this isn’t really vhost-user-specific.  It just so
happens that vhost-user is the only implementation that has a special
feature that disables all vrings, but I can’t see a reason why that
would be vhost-user-specific.

I mean, vhost_dev_set_vring_enable() looks like it can work for any
vhost implementation, but:
- .vhost_set_vring_enable() is indeed only implemented for vhost-user
- vhost_dev_set_vring_enable() won’t do anything if (for a vhost-user
   back-end) the F_PROTOCOL_FEATURES feature hasn’t been negotiated.

So this looked to me like if .vhost_set_vring_enable() were ever
implemented for anything but vhost-user, it’s quite likely that this too
would be gated behind some feature that auto-disables all vrings.

So this, plus the fact that vhost_dev_set_vring_enable() already has
vhost-user-specific code (while being a vhost.c function), I thought
it’d be best to put this into generic vhost.c code, simply because in
the worst case, it’ll be a no-op on other vhost implementations.

But as it is, functionally, of course I can just put it into
vhost_user_set_features().  (It would be a tiny bit more complicated,
because then I’d have to use vhost_user_set_vring_enable(), which
returns an error if F_PROTOCOL_FEATURES isn’t there, which we’d have to
ignore – vhost_dev_set_vring_enable() does that already for me.)

Hanna
diff mbox series

Patch

diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index a52f273347..2fe02ed5d4 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -90,6 +90,16 @@  struct vhost_dev {
     int vq_index_end;
     /* if non-zero, minimum required value for max_queues */
     int num_queues;
+
+    /*
+     * Whether the virtqueues are supposed to be enabled (via
+     * SET_VRING_ENABLE).  Setting the features (e.g. for
+     * enabling/disabling logging) will disable all virtqueues if
+     * VHOST_USER_F_PROTOCOL_FEATURES is set, so then we need to
+     * re-enable them if this field is set.
+     */
+    bool enable_vqs;
+
     /**
      * vhost feature handling requires matching the feature set
      * offered by a backend which may be a subset of the total
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index a266396576..cbff589efa 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -50,6 +50,8 @@  static unsigned int used_memslots;
 static QLIST_HEAD(, vhost_dev) vhost_devices =
     QLIST_HEAD_INITIALIZER(vhost_devices);
 
+static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable);
+
 bool vhost_has_free_slot(void)
 {
     unsigned int slots_limit = ~0U;
@@ -899,6 +901,15 @@  static int vhost_dev_set_features(struct vhost_dev *dev,
         }
     }
 
+    if (dev->enable_vqs) {
+        /*
+         * Setting VHOST_USER_F_PROTOCOL_FEATURES would have disabled all
+         * virtqueues, even if that was not intended; re-enable them if
+         * necessary.
+         */
+        vhost_dev_set_vring_enable(dev, true);
+    }
+
 out:
     return r;
 }
@@ -1896,6 +1907,8 @@  int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
 
 static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
 {
+    hdev->enable_vqs = enable;
+
     if (!hdev->vhost_ops->vhost_set_vring_enable) {
         return 0;
     }