Message ID | 20221226074908.8154-4-jasowang@redhat.com (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Series | virtio-net: don't busy poll for cvq command | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Guessed tree name to be net-next |
netdev/fixes_present | success | Fixes tag not required for -next series |
netdev/subject_prefix | success | Link |
netdev/cover_letter | success | Series has a cover letter |
netdev/patch_count | success | Link |
netdev/header_inline | success | No static functions without inline keyword in header files |
netdev/build_32bit | success | Errors and warnings before: 21 this patch: 21 |
netdev/cc_maintainers | success | CCed 3 of 3 maintainers |
netdev/build_clang | success | Errors and warnings before: 10 this patch: 10 |
netdev/module_param | success | Was 0 now: 0 |
netdev/verify_signedoff | success | Signed-off-by tag matches author and committer |
netdev/check_selftest | success | No net selftest shell script |
netdev/verify_fixes | success | No Fixes tag |
netdev/build_allmodconfig_warn | success | Errors and warnings before: 22 this patch: 22 |
netdev/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 78 lines checked |
netdev/kdoc | success | Errors and warnings before: 2 this patch: 2 |
netdev/source_inline | success | Was 0 now: 0 |
On Mon, Dec 26, 2022 at 03:49:07PM +0800, Jason Wang wrote: > This patch introduces a per virtqueue waitqueue to allow driver to > sleep and wait for more used. Two new helpers are introduced to allow > driver to sleep and wake up. > > Signed-off-by: Jason Wang <jasowang@redhat.com> > --- > Changes since V1: > - check virtqueue_is_broken() as well > - use more_used() instead of virtqueue_get_buf() to allow caller to > get buffers afterwards > --- > drivers/virtio/virtio_ring.c | 29 +++++++++++++++++++++++++++++ > include/linux/virtio.h | 3 +++ > 2 files changed, 32 insertions(+) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 5cfb2fa8abee..9c83eb945493 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -13,6 +13,7 @@ > #include <linux/dma-mapping.h> > #include <linux/kmsan.h> > #include <linux/spinlock.h> > +#include <linux/wait.h> > #include <xen/xen.h> > > #ifdef DEBUG > @@ -60,6 +61,7 @@ > "%s:"fmt, (_vq)->vq.name, ##args); \ > /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ \ > WRITE_ONCE((_vq)->broken, true); \ > + wake_up_interruptible(&(_vq)->wq); \ > } while (0) > #define START_USE(vq) > #define END_USE(vq) > @@ -203,6 +205,9 @@ struct vring_virtqueue { > /* DMA, allocation, and size information */ > bool we_own_ring; > > + /* Wait for buffer to be used */ > + wait_queue_head_t wq; > + > #ifdef DEBUG > /* They're supposed to lock for us. */ > unsigned int in_use; > @@ -2024,6 +2029,8 @@ static struct virtqueue *vring_create_virtqueue_packed( > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > vq->weak_barriers = false; > > + init_waitqueue_head(&vq->wq); > + > err = vring_alloc_state_extra_packed(&vring_packed); > if (err) > goto err_state_extra; > @@ -2517,6 +2524,8 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > vq->weak_barriers = false; > > + init_waitqueue_head(&vq->wq); > + > err = vring_alloc_state_extra_split(vring_split); > if (err) { > kfree(vq); > @@ -2654,6 +2663,8 @@ static void vring_free(struct virtqueue *_vq) > { > struct vring_virtqueue *vq = to_vvq(_vq); > > + wake_up_interruptible(&vq->wq); > + > if (vq->we_own_ring) { > if (vq->packed_ring) { > vring_free_queue(vq->vq.vdev, > @@ -2863,4 +2874,22 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) > } > EXPORT_SYMBOL_GPL(virtqueue_get_vring); > > +int virtqueue_wait_for_used(struct virtqueue *_vq) > +{ > + struct vring_virtqueue *vq = to_vvq(_vq); > + > + /* TODO: Tweak the timeout. */ > + return wait_event_interruptible_timeout(vq->wq, > + virtqueue_is_broken(_vq) || more_used(vq), HZ); There's no good timeout. Let's not even go there, if device goes bad it should set the need reset bit. > +} > +EXPORT_SYMBOL_GPL(virtqueue_wait_for_used); > + > +void virtqueue_wake_up(struct virtqueue *_vq) > +{ > + struct vring_virtqueue *vq = to_vvq(_vq); > + > + wake_up_interruptible(&vq->wq); > +} > +EXPORT_SYMBOL_GPL(virtqueue_wake_up); > + > MODULE_LICENSE("GPL"); > diff --git a/include/linux/virtio.h b/include/linux/virtio.h > index dcab9c7e8784..2eb62c774895 100644 > --- a/include/linux/virtio.h > +++ b/include/linux/virtio.h > @@ -72,6 +72,9 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); > void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, > void **ctx); > > +int virtqueue_wait_for_used(struct virtqueue *vq); > +void virtqueue_wake_up(struct virtqueue *vq); > + > void virtqueue_disable_cb(struct virtqueue *vq); > > bool virtqueue_enable_cb(struct virtqueue *vq); > -- > 2.25.1
On Mon, Dec 26, 2022 at 03:49:07PM +0800, Jason Wang wrote: > This patch introduces a per virtqueue waitqueue to allow driver to > sleep and wait for more used. Two new helpers are introduced to allow > driver to sleep and wake up. > > Signed-off-by: Jason Wang <jasowang@redhat.com> > --- > Changes since V1: > - check virtqueue_is_broken() as well > - use more_used() instead of virtqueue_get_buf() to allow caller to > get buffers afterwards > --- > drivers/virtio/virtio_ring.c | 29 +++++++++++++++++++++++++++++ > include/linux/virtio.h | 3 +++ > 2 files changed, 32 insertions(+) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 5cfb2fa8abee..9c83eb945493 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -13,6 +13,7 @@ > #include <linux/dma-mapping.h> > #include <linux/kmsan.h> > #include <linux/spinlock.h> > +#include <linux/wait.h> > #include <xen/xen.h> > > #ifdef DEBUG > @@ -60,6 +61,7 @@ > "%s:"fmt, (_vq)->vq.name, ##args); \ > /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ \ > WRITE_ONCE((_vq)->broken, true); \ > + wake_up_interruptible(&(_vq)->wq); \ > } while (0) > #define START_USE(vq) > #define END_USE(vq) > @@ -203,6 +205,9 @@ struct vring_virtqueue { > /* DMA, allocation, and size information */ > bool we_own_ring; > > + /* Wait for buffer to be used */ > + wait_queue_head_t wq; > + > #ifdef DEBUG > /* They're supposed to lock for us. */ > unsigned int in_use; > @@ -2024,6 +2029,8 @@ static struct virtqueue *vring_create_virtqueue_packed( > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > vq->weak_barriers = false; > > + init_waitqueue_head(&vq->wq); > + > err = vring_alloc_state_extra_packed(&vring_packed); > if (err) > goto err_state_extra; > @@ -2517,6 +2524,8 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > vq->weak_barriers = false; > > + init_waitqueue_head(&vq->wq); > + > err = vring_alloc_state_extra_split(vring_split); > if (err) { > kfree(vq); > @@ -2654,6 +2663,8 @@ static void vring_free(struct virtqueue *_vq) > { > struct vring_virtqueue *vq = to_vvq(_vq); > > + wake_up_interruptible(&vq->wq); > + > if (vq->we_own_ring) { > if (vq->packed_ring) { > vring_free_queue(vq->vq.vdev, > @@ -2863,4 +2874,22 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) > } > EXPORT_SYMBOL_GPL(virtqueue_get_vring); > > +int virtqueue_wait_for_used(struct virtqueue *_vq) > +{ > + struct vring_virtqueue *vq = to_vvq(_vq); > + > + /* TODO: Tweak the timeout. */ > + return wait_event_interruptible_timeout(vq->wq, > + virtqueue_is_broken(_vq) || more_used(vq), HZ); BTW undocumented that you also make it interruptible. So if we get an interrupt then this will fail. But device is still going and will later use the buffers. Same for timeout really. > +} > +EXPORT_SYMBOL_GPL(virtqueue_wait_for_used); > + > +void virtqueue_wake_up(struct virtqueue *_vq) > +{ > + struct vring_virtqueue *vq = to_vvq(_vq); > + > + wake_up_interruptible(&vq->wq); > +} > +EXPORT_SYMBOL_GPL(virtqueue_wake_up); > + > MODULE_LICENSE("GPL"); > diff --git a/include/linux/virtio.h b/include/linux/virtio.h > index dcab9c7e8784..2eb62c774895 100644 > --- a/include/linux/virtio.h > +++ b/include/linux/virtio.h > @@ -72,6 +72,9 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); > void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, > void **ctx); > > +int virtqueue_wait_for_used(struct virtqueue *vq); > +void virtqueue_wake_up(struct virtqueue *vq); > + > void virtqueue_disable_cb(struct virtqueue *vq); > > bool virtqueue_enable_cb(struct virtqueue *vq); > -- > 2.25.1
On Tue, Dec 27, 2022 at 7:34 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Dec 26, 2022 at 03:49:07PM +0800, Jason Wang wrote: > > This patch introduces a per virtqueue waitqueue to allow driver to > > sleep and wait for more used. Two new helpers are introduced to allow > > driver to sleep and wake up. > > > > Signed-off-by: Jason Wang <jasowang@redhat.com> > > --- > > Changes since V1: > > - check virtqueue_is_broken() as well > > - use more_used() instead of virtqueue_get_buf() to allow caller to > > get buffers afterwards > > --- > > drivers/virtio/virtio_ring.c | 29 +++++++++++++++++++++++++++++ > > include/linux/virtio.h | 3 +++ > > 2 files changed, 32 insertions(+) > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > > index 5cfb2fa8abee..9c83eb945493 100644 > > --- a/drivers/virtio/virtio_ring.c > > +++ b/drivers/virtio/virtio_ring.c > > @@ -13,6 +13,7 @@ > > #include <linux/dma-mapping.h> > > #include <linux/kmsan.h> > > #include <linux/spinlock.h> > > +#include <linux/wait.h> > > #include <xen/xen.h> > > > > #ifdef DEBUG > > @@ -60,6 +61,7 @@ > > "%s:"fmt, (_vq)->vq.name, ##args); \ > > /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ \ > > WRITE_ONCE((_vq)->broken, true); \ > > + wake_up_interruptible(&(_vq)->wq); \ > > } while (0) > > #define START_USE(vq) > > #define END_USE(vq) > > @@ -203,6 +205,9 @@ struct vring_virtqueue { > > /* DMA, allocation, and size information */ > > bool we_own_ring; > > > > + /* Wait for buffer to be used */ > > + wait_queue_head_t wq; > > + > > #ifdef DEBUG > > /* They're supposed to lock for us. */ > > unsigned int in_use; > > @@ -2024,6 +2029,8 @@ static struct virtqueue *vring_create_virtqueue_packed( > > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > > vq->weak_barriers = false; > > > > + init_waitqueue_head(&vq->wq); > > + > > err = vring_alloc_state_extra_packed(&vring_packed); > > if (err) > > goto err_state_extra; > > @@ -2517,6 +2524,8 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, > > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > > vq->weak_barriers = false; > > > > + init_waitqueue_head(&vq->wq); > > + > > err = vring_alloc_state_extra_split(vring_split); > > if (err) { > > kfree(vq); > > @@ -2654,6 +2663,8 @@ static void vring_free(struct virtqueue *_vq) > > { > > struct vring_virtqueue *vq = to_vvq(_vq); > > > > + wake_up_interruptible(&vq->wq); > > + > > if (vq->we_own_ring) { > > if (vq->packed_ring) { > > vring_free_queue(vq->vq.vdev, > > @@ -2863,4 +2874,22 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) > > } > > EXPORT_SYMBOL_GPL(virtqueue_get_vring); > > > > +int virtqueue_wait_for_used(struct virtqueue *_vq) > > +{ > > + struct vring_virtqueue *vq = to_vvq(_vq); > > + > > + /* TODO: Tweak the timeout. */ > > + return wait_event_interruptible_timeout(vq->wq, > > + virtqueue_is_broken(_vq) || more_used(vq), HZ); > > There's no good timeout. Let's not even go there, if device goes > bad it should set the need reset bit. The problem is that we can't depend on the device. If it takes too long for the device to respond to cvq, there's a high possibility that the device is buggy or even malicious. We can have a higher timeout here and it should be still better than waiting forever (the cvq commands need to be serialized so it needs to hold a lock anyway (RTNL) ). Thanks > > > > +} > > +EXPORT_SYMBOL_GPL(virtqueue_wait_for_used); > > + > > +void virtqueue_wake_up(struct virtqueue *_vq) > > +{ > > + struct vring_virtqueue *vq = to_vvq(_vq); > > + > > + wake_up_interruptible(&vq->wq); > > +} > > +EXPORT_SYMBOL_GPL(virtqueue_wake_up); > > + > > MODULE_LICENSE("GPL"); > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h > > index dcab9c7e8784..2eb62c774895 100644 > > --- a/include/linux/virtio.h > > +++ b/include/linux/virtio.h > > @@ -72,6 +72,9 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); > > void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, > > void **ctx); > > > > +int virtqueue_wait_for_used(struct virtqueue *vq); > > +void virtqueue_wake_up(struct virtqueue *vq); > > + > > void virtqueue_disable_cb(struct virtqueue *vq); > > > > bool virtqueue_enable_cb(struct virtqueue *vq); > > -- > > 2.25.1 >
On Tue, Dec 27, 2022 at 7:38 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Dec 26, 2022 at 03:49:07PM +0800, Jason Wang wrote: > > This patch introduces a per virtqueue waitqueue to allow driver to > > sleep and wait for more used. Two new helpers are introduced to allow > > driver to sleep and wake up. > > > > Signed-off-by: Jason Wang <jasowang@redhat.com> > > --- > > Changes since V1: > > - check virtqueue_is_broken() as well > > - use more_used() instead of virtqueue_get_buf() to allow caller to > > get buffers afterwards > > --- > > drivers/virtio/virtio_ring.c | 29 +++++++++++++++++++++++++++++ > > include/linux/virtio.h | 3 +++ > > 2 files changed, 32 insertions(+) > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > > index 5cfb2fa8abee..9c83eb945493 100644 > > --- a/drivers/virtio/virtio_ring.c > > +++ b/drivers/virtio/virtio_ring.c > > @@ -13,6 +13,7 @@ > > #include <linux/dma-mapping.h> > > #include <linux/kmsan.h> > > #include <linux/spinlock.h> > > +#include <linux/wait.h> > > #include <xen/xen.h> > > > > #ifdef DEBUG > > @@ -60,6 +61,7 @@ > > "%s:"fmt, (_vq)->vq.name, ##args); \ > > /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ \ > > WRITE_ONCE((_vq)->broken, true); \ > > + wake_up_interruptible(&(_vq)->wq); \ > > } while (0) > > #define START_USE(vq) > > #define END_USE(vq) > > @@ -203,6 +205,9 @@ struct vring_virtqueue { > > /* DMA, allocation, and size information */ > > bool we_own_ring; > > > > + /* Wait for buffer to be used */ > > + wait_queue_head_t wq; > > + > > #ifdef DEBUG > > /* They're supposed to lock for us. */ > > unsigned int in_use; > > @@ -2024,6 +2029,8 @@ static struct virtqueue *vring_create_virtqueue_packed( > > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > > vq->weak_barriers = false; > > > > + init_waitqueue_head(&vq->wq); > > + > > err = vring_alloc_state_extra_packed(&vring_packed); > > if (err) > > goto err_state_extra; > > @@ -2517,6 +2524,8 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, > > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > > vq->weak_barriers = false; > > > > + init_waitqueue_head(&vq->wq); > > + > > err = vring_alloc_state_extra_split(vring_split); > > if (err) { > > kfree(vq); > > @@ -2654,6 +2663,8 @@ static void vring_free(struct virtqueue *_vq) > > { > > struct vring_virtqueue *vq = to_vvq(_vq); > > > > + wake_up_interruptible(&vq->wq); > > + > > if (vq->we_own_ring) { > > if (vq->packed_ring) { > > vring_free_queue(vq->vq.vdev, > > @@ -2863,4 +2874,22 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) > > } > > EXPORT_SYMBOL_GPL(virtqueue_get_vring); > > > > +int virtqueue_wait_for_used(struct virtqueue *_vq) > > +{ > > + struct vring_virtqueue *vq = to_vvq(_vq); > > + > > + /* TODO: Tweak the timeout. */ > > + return wait_event_interruptible_timeout(vq->wq, > > + virtqueue_is_broken(_vq) || more_used(vq), HZ); > > BTW undocumented that you also make it interruptible. > So if we get an interrupt then this will fail. Yes, this is sub-optimal. > But device is still going and will later use the buffers. > > Same for timeout really. Avoiding infinite wait/poll is one of the goals, another is to sleep. If we think the timeout is hard, we can start from the wait. Thanks > > > > > +} > > +EXPORT_SYMBOL_GPL(virtqueue_wait_for_used); > > + > > +void virtqueue_wake_up(struct virtqueue *_vq) > > +{ > > + struct vring_virtqueue *vq = to_vvq(_vq); > > + > > + wake_up_interruptible(&vq->wq); > > +} > > +EXPORT_SYMBOL_GPL(virtqueue_wake_up); > > + > > MODULE_LICENSE("GPL"); > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h > > index dcab9c7e8784..2eb62c774895 100644 > > --- a/include/linux/virtio.h > > +++ b/include/linux/virtio.h > > @@ -72,6 +72,9 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); > > void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, > > void **ctx); > > > > +int virtqueue_wait_for_used(struct virtqueue *vq); > > +void virtqueue_wake_up(struct virtqueue *vq); > > + > > void virtqueue_disable_cb(struct virtqueue *vq); > > > > bool virtqueue_enable_cb(struct virtqueue *vq); > > -- > > 2.25.1 >
On Tue, Dec 27, 2022 at 11:47:34AM +0800, Jason Wang wrote: > On Tue, Dec 27, 2022 at 7:34 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Mon, Dec 26, 2022 at 03:49:07PM +0800, Jason Wang wrote: > > > This patch introduces a per virtqueue waitqueue to allow driver to > > > sleep and wait for more used. Two new helpers are introduced to allow > > > driver to sleep and wake up. > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com> > > > --- > > > Changes since V1: > > > - check virtqueue_is_broken() as well > > > - use more_used() instead of virtqueue_get_buf() to allow caller to > > > get buffers afterwards > > > --- > > > drivers/virtio/virtio_ring.c | 29 +++++++++++++++++++++++++++++ > > > include/linux/virtio.h | 3 +++ > > > 2 files changed, 32 insertions(+) > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > > > index 5cfb2fa8abee..9c83eb945493 100644 > > > --- a/drivers/virtio/virtio_ring.c > > > +++ b/drivers/virtio/virtio_ring.c > > > @@ -13,6 +13,7 @@ > > > #include <linux/dma-mapping.h> > > > #include <linux/kmsan.h> > > > #include <linux/spinlock.h> > > > +#include <linux/wait.h> > > > #include <xen/xen.h> > > > > > > #ifdef DEBUG > > > @@ -60,6 +61,7 @@ > > > "%s:"fmt, (_vq)->vq.name, ##args); \ > > > /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ \ > > > WRITE_ONCE((_vq)->broken, true); \ > > > + wake_up_interruptible(&(_vq)->wq); \ > > > } while (0) > > > #define START_USE(vq) > > > #define END_USE(vq) > > > @@ -203,6 +205,9 @@ struct vring_virtqueue { > > > /* DMA, allocation, and size information */ > > > bool we_own_ring; > > > > > > + /* Wait for buffer to be used */ > > > + wait_queue_head_t wq; > > > + > > > #ifdef DEBUG > > > /* They're supposed to lock for us. */ > > > unsigned int in_use; > > > @@ -2024,6 +2029,8 @@ static struct virtqueue *vring_create_virtqueue_packed( > > > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > > > vq->weak_barriers = false; > > > > > > + init_waitqueue_head(&vq->wq); > > > + > > > err = vring_alloc_state_extra_packed(&vring_packed); > > > if (err) > > > goto err_state_extra; > > > @@ -2517,6 +2524,8 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, > > > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) > > > vq->weak_barriers = false; > > > > > > + init_waitqueue_head(&vq->wq); > > > + > > > err = vring_alloc_state_extra_split(vring_split); > > > if (err) { > > > kfree(vq); > > > @@ -2654,6 +2663,8 @@ static void vring_free(struct virtqueue *_vq) > > > { > > > struct vring_virtqueue *vq = to_vvq(_vq); > > > > > > + wake_up_interruptible(&vq->wq); > > > + > > > if (vq->we_own_ring) { > > > if (vq->packed_ring) { > > > vring_free_queue(vq->vq.vdev, > > > @@ -2863,4 +2874,22 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) > > > } > > > EXPORT_SYMBOL_GPL(virtqueue_get_vring); > > > > > > +int virtqueue_wait_for_used(struct virtqueue *_vq) > > > +{ > > > + struct vring_virtqueue *vq = to_vvq(_vq); > > > + > > > + /* TODO: Tweak the timeout. */ > > > + return wait_event_interruptible_timeout(vq->wq, > > > + virtqueue_is_broken(_vq) || more_used(vq), HZ); > > > > There's no good timeout. Let's not even go there, if device goes > > bad it should set the need reset bit. > > The problem is that we can't depend on the device. If it takes too > long for the device to respond to cvq, there's a high possibility that > the device is buggy or even malicious. We can have a higher timeout > here and it should be still better than waiting forever (the cvq > commands need to be serialized so it needs to hold a lock anyway > (RTNL) ). > > Thanks With a TODO item like this I'd expect this to be an RFC. Here's why: Making driver more robust from device failures is a laudable goal but it's really hard to be 100% foolproof here. E.g. device can just block pci reads and it would be very hard to recover. So I'm going to only merge patches like this if they at least theoretically have very little chance of breaking existing users. And note that in most setups, CVQ is only used at startup and then left mostly alone. Finally, note that lots of guests need virtio to do anything useful at all. So just failing commands is not enough to recover - you need to try harder maybe by attempting to reset device. Could be a question of policy - might need to make this guest configurable. > > > > > > > +} > > > +EXPORT_SYMBOL_GPL(virtqueue_wait_for_used); > > > + > > > +void virtqueue_wake_up(struct virtqueue *_vq) > > > +{ > > > + struct vring_virtqueue *vq = to_vvq(_vq); > > > + > > > + wake_up_interruptible(&vq->wq); > > > +} > > > +EXPORT_SYMBOL_GPL(virtqueue_wake_up); > > > + > > > MODULE_LICENSE("GPL"); > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h > > > index dcab9c7e8784..2eb62c774895 100644 > > > --- a/include/linux/virtio.h > > > +++ b/include/linux/virtio.h > > > @@ -72,6 +72,9 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); > > > void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, > > > void **ctx); > > > > > > +int virtqueue_wait_for_used(struct virtqueue *vq); > > > +void virtqueue_wake_up(struct virtqueue *vq); > > > + > > > void virtqueue_disable_cb(struct virtqueue *vq); > > > > > > bool virtqueue_enable_cb(struct virtqueue *vq); > > > -- > > > 2.25.1 > >
On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > But device is still going and will later use the buffers. > > > > Same for timeout really. > > Avoiding infinite wait/poll is one of the goals, another is to sleep. > If we think the timeout is hard, we can start from the wait. > > Thanks If the goal is to avoid disrupting traffic while CVQ is in use, that sounds more reasonable. E.g. someone is turning on promisc, a spike in CPU usage might be unwelcome. things we should be careful to address then: 1- debugging. Currently it's easy to see a warning if CPU is stuck in a loop for a while, and we also get a backtrace. E.g. with this - how do we know who has the RTNL? We need to integrate with kernel/watchdog.c for good results and to make sure policy is consistent. 2- overhead. In a very common scenario when device is in hypervisor, programming timers etc has a very high overhead, at bootup lots of CVQ commands are run and slowing boot down is not nice. let's poll for a bit before waiting? 3- suprise removal. need to wake up thread in some way. what about other cases of device breakage - is there a chance this introduces new bugs around that? at least enumerate them please.
在 2022/12/27 15:19, Michael S. Tsirkin 写道: > On Tue, Dec 27, 2022 at 11:47:34AM +0800, Jason Wang wrote: >> On Tue, Dec 27, 2022 at 7:34 AM Michael S. Tsirkin <mst@redhat.com> wrote: >>> On Mon, Dec 26, 2022 at 03:49:07PM +0800, Jason Wang wrote: >>>> This patch introduces a per virtqueue waitqueue to allow driver to >>>> sleep and wait for more used. Two new helpers are introduced to allow >>>> driver to sleep and wake up. >>>> >>>> Signed-off-by: Jason Wang <jasowang@redhat.com> >>>> --- >>>> Changes since V1: >>>> - check virtqueue_is_broken() as well >>>> - use more_used() instead of virtqueue_get_buf() to allow caller to >>>> get buffers afterwards >>>> --- >>>> drivers/virtio/virtio_ring.c | 29 +++++++++++++++++++++++++++++ >>>> include/linux/virtio.h | 3 +++ >>>> 2 files changed, 32 insertions(+) >>>> >>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c >>>> index 5cfb2fa8abee..9c83eb945493 100644 >>>> --- a/drivers/virtio/virtio_ring.c >>>> +++ b/drivers/virtio/virtio_ring.c >>>> @@ -13,6 +13,7 @@ >>>> #include <linux/dma-mapping.h> >>>> #include <linux/kmsan.h> >>>> #include <linux/spinlock.h> >>>> +#include <linux/wait.h> >>>> #include <xen/xen.h> >>>> >>>> #ifdef DEBUG >>>> @@ -60,6 +61,7 @@ >>>> "%s:"fmt, (_vq)->vq.name, ##args); \ >>>> /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ \ >>>> WRITE_ONCE((_vq)->broken, true); \ >>>> + wake_up_interruptible(&(_vq)->wq); \ >>>> } while (0) >>>> #define START_USE(vq) >>>> #define END_USE(vq) >>>> @@ -203,6 +205,9 @@ struct vring_virtqueue { >>>> /* DMA, allocation, and size information */ >>>> bool we_own_ring; >>>> >>>> + /* Wait for buffer to be used */ >>>> + wait_queue_head_t wq; >>>> + >>>> #ifdef DEBUG >>>> /* They're supposed to lock for us. */ >>>> unsigned int in_use; >>>> @@ -2024,6 +2029,8 @@ static struct virtqueue *vring_create_virtqueue_packed( >>>> if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) >>>> vq->weak_barriers = false; >>>> >>>> + init_waitqueue_head(&vq->wq); >>>> + >>>> err = vring_alloc_state_extra_packed(&vring_packed); >>>> if (err) >>>> goto err_state_extra; >>>> @@ -2517,6 +2524,8 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, >>>> if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) >>>> vq->weak_barriers = false; >>>> >>>> + init_waitqueue_head(&vq->wq); >>>> + >>>> err = vring_alloc_state_extra_split(vring_split); >>>> if (err) { >>>> kfree(vq); >>>> @@ -2654,6 +2663,8 @@ static void vring_free(struct virtqueue *_vq) >>>> { >>>> struct vring_virtqueue *vq = to_vvq(_vq); >>>> >>>> + wake_up_interruptible(&vq->wq); >>>> + >>>> if (vq->we_own_ring) { >>>> if (vq->packed_ring) { >>>> vring_free_queue(vq->vq.vdev, >>>> @@ -2863,4 +2874,22 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) >>>> } >>>> EXPORT_SYMBOL_GPL(virtqueue_get_vring); >>>> >>>> +int virtqueue_wait_for_used(struct virtqueue *_vq) >>>> +{ >>>> + struct vring_virtqueue *vq = to_vvq(_vq); >>>> + >>>> + /* TODO: Tweak the timeout. */ >>>> + return wait_event_interruptible_timeout(vq->wq, >>>> + virtqueue_is_broken(_vq) || more_used(vq), HZ); >>> There's no good timeout. Let's not even go there, if device goes >>> bad it should set the need reset bit. >> The problem is that we can't depend on the device. If it takes too >> long for the device to respond to cvq, there's a high possibility that >> the device is buggy or even malicious. We can have a higher timeout >> here and it should be still better than waiting forever (the cvq >> commands need to be serialized so it needs to hold a lock anyway >> (RTNL) ). >> >> Thanks > With a TODO item like this I'd expect this to be an RFC. > Here's why: > > Making driver more robust from device failures is a laudable goal but it's really > hard to be 100% foolproof here. E.g. device can just block pci reads and > it would be very hard to recover. Yes. > So I'm going to only merge patches > like this if they at least theoretically have very little chance > of breaking existing users. AFAIK, this is not theoretical, consider: 1) DPU may implement virtio-net CVQ with codes running in CPU 2) VDUSE may want to support CVQ in the future > > And note that in most setups, CVQ is only used at startup and then left mostly alone. > > Finally, note that lots of guests need virtio to do anything useful at all. > So just failing commands is not enough to recover - you need to try > harder maybe by attempting to reset device. This requires upper layer support which seems not existed in the networking subsystem. > Could be a question of > policy - might need to make this guest configurable. Yes. Thanks > > > >>> >>>> +} >>>> +EXPORT_SYMBOL_GPL(virtqueue_wait_for_used); >>>> + >>>> +void virtqueue_wake_up(struct virtqueue *_vq) >>>> +{ >>>> + struct vring_virtqueue *vq = to_vvq(_vq); >>>> + >>>> + wake_up_interruptible(&vq->wq); >>>> +} >>>> +EXPORT_SYMBOL_GPL(virtqueue_wake_up); >>>> + >>>> MODULE_LICENSE("GPL"); >>>> diff --git a/include/linux/virtio.h b/include/linux/virtio.h >>>> index dcab9c7e8784..2eb62c774895 100644 >>>> --- a/include/linux/virtio.h >>>> +++ b/include/linux/virtio.h >>>> @@ -72,6 +72,9 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); >>>> void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, >>>> void **ctx); >>>> >>>> +int virtqueue_wait_for_used(struct virtqueue *vq); >>>> +void virtqueue_wake_up(struct virtqueue *vq); >>>> + >>>> void virtqueue_disable_cb(struct virtqueue *vq); >>>> >>>> bool virtqueue_enable_cb(struct virtqueue *vq); >>>> -- >>>> 2.25.1
在 2022/12/27 15:33, Michael S. Tsirkin 写道: > On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: >>> But device is still going and will later use the buffers. >>> >>> Same for timeout really. >> Avoiding infinite wait/poll is one of the goals, another is to sleep. >> If we think the timeout is hard, we can start from the wait. >> >> Thanks > If the goal is to avoid disrupting traffic while CVQ is in use, > that sounds more reasonable. E.g. someone is turning on promisc, > a spike in CPU usage might be unwelcome. Yes, this would be more obvious is UP is used. > > things we should be careful to address then: > 1- debugging. Currently it's easy to see a warning if CPU is stuck > in a loop for a while, and we also get a backtrace. > E.g. with this - how do we know who has the RTNL? > We need to integrate with kernel/watchdog.c for good results > and to make sure policy is consistent. That's fine, will consider this. > 2- overhead. In a very common scenario when device is in hypervisor, > programming timers etc has a very high overhead, at bootup > lots of CVQ commands are run and slowing boot down is not nice. > let's poll for a bit before waiting? Then we go back to the question of choosing a good timeout for poll. And poll seems problematic in the case of UP, scheduler might not have the chance to run. > 3- suprise removal. need to wake up thread in some way. what about > other cases of device breakage - is there a chance this > introduces new bugs around that? at least enumerate them please. The current code did: 1) check for vq->broken 2) wakeup during BAD_RING() So we won't end up with a never woke up process which should be fine. Thanks > >
On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > But device is still going and will later use the buffers. > > > > > > > > Same for timeout really. > > > Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > If we think the timeout is hard, we can start from the wait. > > > > > > Thanks > > If the goal is to avoid disrupting traffic while CVQ is in use, > > that sounds more reasonable. E.g. someone is turning on promisc, > > a spike in CPU usage might be unwelcome. > > > Yes, this would be more obvious is UP is used. > > > > > > things we should be careful to address then: > > 1- debugging. Currently it's easy to see a warning if CPU is stuck > > in a loop for a while, and we also get a backtrace. > > E.g. with this - how do we know who has the RTNL? > > We need to integrate with kernel/watchdog.c for good results > > and to make sure policy is consistent. > > > That's fine, will consider this. > > > > 2- overhead. In a very common scenario when device is in hypervisor, > > programming timers etc has a very high overhead, at bootup > > lots of CVQ commands are run and slowing boot down is not nice. > > let's poll for a bit before waiting? > > > Then we go back to the question of choosing a good timeout for poll. And > poll seems problematic in the case of UP, scheduler might not have the > chance to run. Poll just a bit :) Seriously I don't know, but at least check once after kick. > > > 3- suprise removal. need to wake up thread in some way. what about > > other cases of device breakage - is there a chance this > > introduces new bugs around that? at least enumerate them please. > > > The current code did: > > 1) check for vq->broken > 2) wakeup during BAD_RING() > > So we won't end up with a never woke up process which should be fine. > > Thanks BTW BAD_RING on removal will trigger dev_err. Not sure that is a good idea - can cause crashes if kernel panics on error. > > > > >
在 2022/12/27 17:38, Michael S. Tsirkin 写道: > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: >>>>> But device is still going and will later use the buffers. >>>>> >>>>> Same for timeout really. >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. >>>> If we think the timeout is hard, we can start from the wait. >>>> >>>> Thanks >>> If the goal is to avoid disrupting traffic while CVQ is in use, >>> that sounds more reasonable. E.g. someone is turning on promisc, >>> a spike in CPU usage might be unwelcome. >> >> Yes, this would be more obvious is UP is used. >> >> >>> things we should be careful to address then: >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck >>> in a loop for a while, and we also get a backtrace. >>> E.g. with this - how do we know who has the RTNL? >>> We need to integrate with kernel/watchdog.c for good results >>> and to make sure policy is consistent. >> >> That's fine, will consider this. >> >> >>> 2- overhead. In a very common scenario when device is in hypervisor, >>> programming timers etc has a very high overhead, at bootup >>> lots of CVQ commands are run and slowing boot down is not nice. >>> let's poll for a bit before waiting? >> >> Then we go back to the question of choosing a good timeout for poll. And >> poll seems problematic in the case of UP, scheduler might not have the >> chance to run. > Poll just a bit :) Seriously I don't know, but at least check once > after kick. I think it is what the current code did where the condition will be check before trying to sleep in the wait_event(). > >>> 3- suprise removal. need to wake up thread in some way. what about >>> other cases of device breakage - is there a chance this >>> introduces new bugs around that? at least enumerate them please. >> >> The current code did: >> >> 1) check for vq->broken >> 2) wakeup during BAD_RING() >> >> So we won't end up with a never woke up process which should be fine. >> >> Thanks > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > idea - can cause crashes if kernel panics on error. Yes, it's better to use __virtqueue_break() instead. But consider we will start from a wait first, I will limit the changes in virtio-net without bothering virtio core. Thanks > >>>
On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > >>>>> But device is still going and will later use the buffers. > >>>>> > >>>>> Same for timeout really. > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > >>>> If we think the timeout is hard, we can start from the wait. > >>>> > >>>> Thanks > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > >>> that sounds more reasonable. E.g. someone is turning on promisc, > >>> a spike in CPU usage might be unwelcome. > >> > >> Yes, this would be more obvious is UP is used. > >> > >> > >>> things we should be careful to address then: > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > >>> in a loop for a while, and we also get a backtrace. > >>> E.g. with this - how do we know who has the RTNL? > >>> We need to integrate with kernel/watchdog.c for good results > >>> and to make sure policy is consistent. > >> > >> That's fine, will consider this. So after some investigation, it seems the watchdog.c doesn't help. The only export helper is touch_softlockup_watchdog() which tries to avoid triggering the lockups warning for the known slow path. And before the patch, we end up with a real infinite loop which could be caught by RCU stall detector which is not the case of the sleep. What we can do is probably do a periodic netdev_err(). Thanks > >> > >> > >>> 2- overhead. In a very common scenario when device is in hypervisor, > >>> programming timers etc has a very high overhead, at bootup > >>> lots of CVQ commands are run and slowing boot down is not nice. > >>> let's poll for a bit before waiting? > >> > >> Then we go back to the question of choosing a good timeout for poll. And > >> poll seems problematic in the case of UP, scheduler might not have the > >> chance to run. > > Poll just a bit :) Seriously I don't know, but at least check once > > after kick. > > > I think it is what the current code did where the condition will be > check before trying to sleep in the wait_event(). > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > >>> other cases of device breakage - is there a chance this > >>> introduces new bugs around that? at least enumerate them please. > >> > >> The current code did: > >> > >> 1) check for vq->broken > >> 2) wakeup during BAD_RING() > >> > >> So we won't end up with a never woke up process which should be fine. > >> > >> Thanks > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > idea - can cause crashes if kernel panics on error. > > > Yes, it's better to use __virtqueue_break() instead. > > But consider we will start from a wait first, I will limit the changes > in virtio-net without bothering virtio core. > > Thanks > > > > > >>>
On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > >>>>> But device is still going and will later use the buffers. > > >>>>> > > >>>>> Same for timeout really. > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > >>>> If we think the timeout is hard, we can start from the wait. > > >>>> > > >>>> Thanks > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > >>> a spike in CPU usage might be unwelcome. > > >> > > >> Yes, this would be more obvious is UP is used. > > >> > > >> > > >>> things we should be careful to address then: > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > >>> in a loop for a while, and we also get a backtrace. > > >>> E.g. with this - how do we know who has the RTNL? > > >>> We need to integrate with kernel/watchdog.c for good results > > >>> and to make sure policy is consistent. > > >> > > >> That's fine, will consider this. > > So after some investigation, it seems the watchdog.c doesn't help. The > only export helper is touch_softlockup_watchdog() which tries to avoid > triggering the lockups warning for the known slow path. I never said you can just use existing exporting APIs. You'll have to write new ones :) > And before the patch, we end up with a real infinite loop which could > be caught by RCU stall detector which is not the case of the sleep. > What we can do is probably do a periodic netdev_err(). > > Thanks Only with a bad device. > > >> > > >> > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > >>> programming timers etc has a very high overhead, at bootup > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > >>> let's poll for a bit before waiting? > > >> > > >> Then we go back to the question of choosing a good timeout for poll. And > > >> poll seems problematic in the case of UP, scheduler might not have the > > >> chance to run. > > > Poll just a bit :) Seriously I don't know, but at least check once > > > after kick. > > > > > > I think it is what the current code did where the condition will be > > check before trying to sleep in the wait_event(). > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > >>> other cases of device breakage - is there a chance this > > >>> introduces new bugs around that? at least enumerate them please. > > >> > > >> The current code did: > > >> > > >> 1) check for vq->broken > > >> 2) wakeup during BAD_RING() > > >> > > >> So we won't end up with a never woke up process which should be fine. > > >> > > >> Thanks > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > idea - can cause crashes if kernel panics on error. > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > But consider we will start from a wait first, I will limit the changes > > in virtio-net without bothering virtio core. > > > > Thanks > > > > > > > > > >>>
On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > >>>>> But device is still going and will later use the buffers. > > > >>>>> > > > >>>>> Same for timeout really. > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > >>>> If we think the timeout is hard, we can start from the wait. > > > >>>> > > > >>>> Thanks > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > >>> a spike in CPU usage might be unwelcome. > > > >> > > > >> Yes, this would be more obvious is UP is used. > > > >> > > > >> > > > >>> things we should be careful to address then: > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > >>> in a loop for a while, and we also get a backtrace. > > > >>> E.g. with this - how do we know who has the RTNL? > > > >>> We need to integrate with kernel/watchdog.c for good results > > > >>> and to make sure policy is consistent. > > > >> > > > >> That's fine, will consider this. > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > only export helper is touch_softlockup_watchdog() which tries to avoid > > triggering the lockups warning for the known slow path. > > I never said you can just use existing exporting APIs. You'll have to > write new ones :) Ok, I thought you wanted to trigger similar warnings as a watchdog. Btw, I wonder what kind of logic you want here. If we switch to using sleep, there won't be soft lockup anymore. A simple wait + timeout + warning seems sufficient? Thanks > > > And before the patch, we end up with a real infinite loop which could > > be caught by RCU stall detector which is not the case of the sleep. > > What we can do is probably do a periodic netdev_err(). > > > > Thanks > > Only with a bad device. > > > > >> > > > >> > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > >>> programming timers etc has a very high overhead, at bootup > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > >>> let's poll for a bit before waiting? > > > >> > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > >> chance to run. > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > after kick. > > > > > > > > > I think it is what the current code did where the condition will be > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > >>> other cases of device breakage - is there a chance this > > > >>> introduces new bugs around that? at least enumerate them please. > > > >> > > > >> The current code did: > > > >> > > > >> 1) check for vq->broken > > > >> 2) wakeup during BAD_RING() > > > >> > > > >> So we won't end up with a never woke up process which should be fine. > > > >> > > > >> Thanks > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > But consider we will start from a wait first, I will limit the changes > > > in virtio-net without bothering virtio core. > > > > > > Thanks > > > > > > > > > > > > > >>> >
On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > >>>>> But device is still going and will later use the buffers. > > > > >>>>> > > > > >>>>> Same for timeout really. > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > >>>> > > > > >>>> Thanks > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > >>> a spike in CPU usage might be unwelcome. > > > > >> > > > > >> Yes, this would be more obvious is UP is used. > > > > >> > > > > >> > > > > >>> things we should be careful to address then: > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > >>> in a loop for a while, and we also get a backtrace. > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > >>> and to make sure policy is consistent. > > > > >> > > > > >> That's fine, will consider this. > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > triggering the lockups warning for the known slow path. > > > > I never said you can just use existing exporting APIs. You'll have to > > write new ones :) > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > Btw, I wonder what kind of logic you want here. If we switch to using > sleep, there won't be soft lockup anymore. A simple wait + timeout + > warning seems sufficient? > > Thanks I'd like to avoid need to teach users new APIs. So watchdog setup to apply to this driver. The warning can be different. > > > > > And before the patch, we end up with a real infinite loop which could > > > be caught by RCU stall detector which is not the case of the sleep. > > > What we can do is probably do a periodic netdev_err(). > > > > > > Thanks > > > > Only with a bad device. > > > > > > >> > > > > >> > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > >>> programming timers etc has a very high overhead, at bootup > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > >>> let's poll for a bit before waiting? > > > > >> > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > >> chance to run. > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > after kick. > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > >>> other cases of device breakage - is there a chance this > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > >> > > > > >> The current code did: > > > > >> > > > > >> 1) check for vq->broken > > > > >> 2) wakeup during BAD_RING() > > > > >> > > > > >> So we won't end up with a never woke up process which should be fine. > > > > >> > > > > >> Thanks > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > in virtio-net without bothering virtio core. > > > > > > > > Thanks > > > > > > > > > > > > > > > > > >>> > >
On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > >>>>> But device is still going and will later use the buffers. > > > > > >>>>> > > > > > >>>>> Same for timeout really. > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > >>>> > > > > > >>>> Thanks > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > >> > > > > > >> Yes, this would be more obvious is UP is used. > > > > > >> > > > > > >> > > > > > >>> things we should be careful to address then: > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > >>> and to make sure policy is consistent. > > > > > >> > > > > > >> That's fine, will consider this. > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > triggering the lockups warning for the known slow path. > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > write new ones :) > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > warning seems sufficient? > > > > Thanks > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > to this driver. The warning can be different. Right, so it looks to me the only possible setup is the watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 second (as softlockup did). And I think it would still make sense to fail, we can start with a very long timeout like 1 minutes and break the device. Does this make sense? Thanks > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > Thanks > > > > > > Only with a bad device. > > > > > > > > >> > > > > > >> > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > >>> let's poll for a bit before waiting? > > > > > >> > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > >> chance to run. > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > after kick. > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > >>> other cases of device breakage - is there a chance this > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > >> > > > > > >> The current code did: > > > > > >> > > > > > >> 1) check for vq->broken > > > > > >> 2) wakeup during BAD_RING() > > > > > >> > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > >> > > > > > >> Thanks > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > >>> > > > >
On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > >>>>> > > > > > > >>>>> Same for timeout really. > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > >>>> > > > > > > >>>> Thanks > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > >> > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > >> > > > > > > >> > > > > > > >>> things we should be careful to address then: > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > >>> and to make sure policy is consistent. > > > > > > >> > > > > > > >> That's fine, will consider this. > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > triggering the lockups warning for the known slow path. > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > write new ones :) > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > warning seems sufficient? > > > > > > Thanks > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > to this driver. The warning can be different. > > Right, so it looks to me the only possible setup is the > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > second (as softlockup did). > > And I think it would still make sense to fail, we can start with a > very long timeout like 1 minutes and break the device. Does this make > sense? > > Thanks I'd say we need to make this manageable then. Can't we do it normally e.g. react to an interrupt to return to userspace? > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > Thanks > > > > > > > > Only with a bad device. > > > > > > > > > > >> > > > > > > >> > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > >>> let's poll for a bit before waiting? > > > > > > >> > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > >> chance to run. > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > after kick. > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > >> > > > > > > >> The current code did: > > > > > > >> > > > > > > >> 1) check for vq->broken > > > > > > >> 2) wakeup during BAD_RING() > > > > > > >> > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > >> > > > > > > >> Thanks > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > >
On Fri, Jan 27, 2023 at 6:35 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > > >>>>> > > > > > > > >>>>> Same for timeout really. > > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > > >>>> > > > > > > > >>>> Thanks > > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > > >> > > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > > >> > > > > > > > >> > > > > > > > >>> things we should be careful to address then: > > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > > >>> and to make sure policy is consistent. > > > > > > > >> > > > > > > > >> That's fine, will consider this. > > > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > > triggering the lockups warning for the known slow path. > > > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > > write new ones :) > > > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > > warning seems sufficient? > > > > > > > > Thanks > > > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > > to this driver. The warning can be different. > > > > Right, so it looks to me the only possible setup is the > > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > > second (as softlockup did). > > > > And I think it would still make sense to fail, we can start with a > > very long timeout like 1 minutes and break the device. Does this make > > sense? > > > > Thanks > > I'd say we need to make this manageable then. Did you mean something like sysfs or module parameters? > Can't we do it normally > e.g. react to an interrupt to return to userspace? I didn't get the meaning of this. Sorry. Thanks > > > > > > > > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > > > Thanks > > > > > > > > > > Only with a bad device. > > > > > > > > > > > > >> > > > > > > > >> > > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > > >>> let's poll for a bit before waiting? > > > > > > > >> > > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > > >> chance to run. > > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > > after kick. > > > > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > > >> > > > > > > > >> The current code did: > > > > > > > >> > > > > > > > >> 1) check for vq->broken > > > > > > > >> 2) wakeup during BAD_RING() > > > > > > > >> > > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > > >> > > > > > > > >> Thanks > > > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > > > > >
On Sun, Jan 29, 2023 at 01:48:49PM +0800, Jason Wang wrote: > On Fri, Jan 27, 2023 at 6:35 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > > > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > > > >>>>> > > > > > > > > >>>>> Same for timeout really. > > > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > > > >>>> > > > > > > > > >>>> Thanks > > > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > > > >> > > > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > > > >> > > > > > > > > >> > > > > > > > > >>> things we should be careful to address then: > > > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > > > >>> and to make sure policy is consistent. > > > > > > > > >> > > > > > > > > >> That's fine, will consider this. > > > > > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > > > triggering the lockups warning for the known slow path. > > > > > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > > > write new ones :) > > > > > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > > > warning seems sufficient? > > > > > > > > > > Thanks > > > > > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > > > to this driver. The warning can be different. > > > > > > Right, so it looks to me the only possible setup is the > > > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > > > second (as softlockup did). > > > > > > And I think it would still make sense to fail, we can start with a > > > very long timeout like 1 minutes and break the device. Does this make > > > sense? > > > > > > Thanks > > > > I'd say we need to make this manageable then. > > Did you mean something like sysfs or module parameters? No I'd say pass it with an ioctl. > > Can't we do it normally > > e.g. react to an interrupt to return to userspace? > > I didn't get the meaning of this. Sorry. > > Thanks Standard way to handle things that can timeout and where userspace did not supply the time is to block until an interrupt then return EINTR. Userspace controls the timeout by using e.g. alarm(2). > > > > > > > > > > > > > > > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > Only with a bad device. > > > > > > > > > > > > > > >> > > > > > > > > >> > > > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > > > >>> let's poll for a bit before waiting? > > > > > > > > >> > > > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > > > >> chance to run. > > > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > > > after kick. > > > > > > > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > > > >> > > > > > > > > >> The current code did: > > > > > > > > >> > > > > > > > > >> 1) check for vq->broken > > > > > > > > >> 2) wakeup during BAD_RING() > > > > > > > > >> > > > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > > > >> > > > > > > > > >> Thanks > > > > > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > > > > > > > >
On Sun, Jan 29, 2023 at 3:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Sun, Jan 29, 2023 at 01:48:49PM +0800, Jason Wang wrote: > > On Fri, Jan 27, 2023 at 6:35 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > > > > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > > > > >>>>> > > > > > > > > > >>>>> Same for timeout really. > > > > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > > > > >>>> > > > > > > > > > >>>> Thanks > > > > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > > > > >> > > > > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > > > > >> > > > > > > > > > >> > > > > > > > > > >>> things we should be careful to address then: > > > > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > > > > >>> and to make sure policy is consistent. > > > > > > > > > >> > > > > > > > > > >> That's fine, will consider this. > > > > > > > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > > > > triggering the lockups warning for the known slow path. > > > > > > > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > > > > write new ones :) > > > > > > > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > > > > warning seems sufficient? > > > > > > > > > > > > Thanks > > > > > > > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > > > > to this driver. The warning can be different. > > > > > > > > Right, so it looks to me the only possible setup is the > > > > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > > > > second (as softlockup did). > > > > > > > > And I think it would still make sense to fail, we can start with a > > > > very long timeout like 1 minutes and break the device. Does this make > > > > sense? > > > > > > > > Thanks > > > > > > I'd say we need to make this manageable then. > > > > Did you mean something like sysfs or module parameters? > > No I'd say pass it with an ioctl. > > > > Can't we do it normally > > > e.g. react to an interrupt to return to userspace? > > > > I didn't get the meaning of this. Sorry. > > > > Thanks > > Standard way to handle things that can timeout and where userspace > did not supply the time is to block until an interrupt > then return EINTR. Well this seems to be a huge change, ioctl(2) doesn't say it can return EINTR now. Actually, a driver timeout is used by other drivers when using controlq/adminq (e.g i40e). Starting from a sane value (e.g 1 minutes to avoid false negatives) seems to be a good first step. > Userspace controls the timeout by > using e.g. alarm(2). Not used in iproute2 after a git grep. Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > Only with a bad device. > > > > > > > > > > > > > > > > >> > > > > > > > > > >> > > > > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > > > > >>> let's poll for a bit before waiting? > > > > > > > > > >> > > > > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > > > > >> chance to run. > > > > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > > > > after kick. > > > > > > > > > > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > > > > >> > > > > > > > > > >> The current code did: > > > > > > > > > >> > > > > > > > > > >> 1) check for vq->broken > > > > > > > > > >> 2) wakeup during BAD_RING() > > > > > > > > > >> > > > > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > > > > >> > > > > > > > > > >> Thanks > > > > > > > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > > > > > > > > > > > >
On Sun, Jan 29, 2023 at 3:37 PM Hillf Danton <hdanton@sina.com> wrote: > > On Mon, 26 Dec 2022 15:49:07 +0800 Jason Wang <jasowang@redhat.com> > > @@ -2654,6 +2663,8 @@ static void vring_free(struct virtqueue *_vq) > > { > > struct vring_virtqueue *vq = to_vvq(_vq); > > > > + wake_up_interruptible(&vq->wq); > > + > > if (vq->we_own_ring) { > > if (vq->packed_ring) { > > vring_free_queue(vq->vq.vdev, > > @@ -2863,4 +2874,22 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) > > } > > EXPORT_SYMBOL_GPL(virtqueue_get_vring); > > > > +int virtqueue_wait_for_used(struct virtqueue *_vq) > > +{ > > + struct vring_virtqueue *vq = to_vvq(_vq); > > + > > + /* TODO: Tweak the timeout. */ > > + return wait_event_interruptible_timeout(vq->wq, > > + virtqueue_is_broken(_vq) || more_used(vq), HZ); > > +} > > +EXPORT_SYMBOL_GPL(virtqueue_wait_for_used); > > waker waiter > --- --- > vring_del_virtqueue > vring_free(_vq); > wakeup > kfree(vq); > get on CPU a tick later > uaf ? > Exactly, this wakeup of vring_free is not needed. It's up to the driver to do the proper wake up to avoid race when subsystem un registration. Thanks
On Mon, Jan 30, 2023 at 10:53:54AM +0800, Jason Wang wrote: > On Sun, Jan 29, 2023 at 3:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Sun, Jan 29, 2023 at 01:48:49PM +0800, Jason Wang wrote: > > > On Fri, Jan 27, 2023 at 6:35 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > > > > > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > > > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > > > > > >>>>> > > > > > > > > > > >>>>> Same for timeout really. > > > > > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > > > > > >>>> > > > > > > > > > > >>>> Thanks > > > > > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > > > > > >> > > > > > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > > > > > >> > > > > > > > > > > >> > > > > > > > > > > >>> things we should be careful to address then: > > > > > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > > > > > >>> and to make sure policy is consistent. > > > > > > > > > > >> > > > > > > > > > > >> That's fine, will consider this. > > > > > > > > > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > > > > > triggering the lockups warning for the known slow path. > > > > > > > > > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > > > > > write new ones :) > > > > > > > > > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > > > > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > > > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > > > > > warning seems sufficient? > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > > > > > to this driver. The warning can be different. > > > > > > > > > > Right, so it looks to me the only possible setup is the > > > > > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > > > > > second (as softlockup did). > > > > > > > > > > And I think it would still make sense to fail, we can start with a > > > > > very long timeout like 1 minutes and break the device. Does this make > > > > > sense? > > > > > > > > > > Thanks > > > > > > > > I'd say we need to make this manageable then. > > > > > > Did you mean something like sysfs or module parameters? > > > > No I'd say pass it with an ioctl. > > > > > > Can't we do it normally > > > > e.g. react to an interrupt to return to userspace? > > > > > > I didn't get the meaning of this. Sorry. > > > > > > Thanks > > > > Standard way to handle things that can timeout and where userspace > > did not supply the time is to block until an interrupt > > then return EINTR. > > Well this seems to be a huge change, ioctl(2) doesn't say it can > return EINTR now. the one on fedora 37 does not but it says: No single standard. Arguments, returns, and semantics of ioctl() vary according to the device driver in question (the call is used as a catch-all for operations that don't cleanly fit the UNIX stream I/O model). so it depends on the device e.g. for a streams device it does: https://pubs.opengroup.org/onlinepubs/9699919799/functions/ioctl.html has EINTR. > Actually, a driver timeout is used by other drivers when using > controlq/adminq (e.g i40e). Starting from a sane value (e.g 1 minutes > to avoid false negatives) seems to be a good first step. Well because it's specific hardware so timeout matches what it can promise. virtio spec does not give guarantees. One issue is with software implementations. At the moment I can set a breakpoint in qemu or vhost user backend and nothing bad happens in just continues. > > Userspace controls the timeout by > > using e.g. alarm(2). > > Not used in iproute2 after a git grep. > > Thanks No need for iproute2 to do it user can just do it from shell. Or user can just press CTRL-C. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > Only with a bad device. > > > > > > > > > > > > > > > > > > >> > > > > > > > > > > >> > > > > > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > > > > > >>> let's poll for a bit before waiting? > > > > > > > > > > >> > > > > > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > > > > > >> chance to run. > > > > > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > > > > > after kick. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > > > > > >> > > > > > > > > > > >> The current code did: > > > > > > > > > > >> > > > > > > > > > > >> 1) check for vq->broken > > > > > > > > > > >> 2) wakeup during BAD_RING() > > > > > > > > > > >> > > > > > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > > > > > >> > > > > > > > > > > >> Thanks > > > > > > > > > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > > > > > > > > > > > > > > > >
On Mon, Jan 30, 2023 at 1:43 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Jan 30, 2023 at 10:53:54AM +0800, Jason Wang wrote: > > On Sun, Jan 29, 2023 at 3:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Sun, Jan 29, 2023 at 01:48:49PM +0800, Jason Wang wrote: > > > > On Fri, Jan 27, 2023 at 6:35 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > > > > > > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > > > > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > > > > > > >>>>> > > > > > > > > > > > >>>>> Same for timeout really. > > > > > > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > > > > > > >>>> > > > > > > > > > > > >>>> Thanks > > > > > > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > > > > > > >> > > > > > > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > > > > > > >> > > > > > > > > > > > >> > > > > > > > > > > > >>> things we should be careful to address then: > > > > > > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > > > > > > >>> and to make sure policy is consistent. > > > > > > > > > > > >> > > > > > > > > > > > >> That's fine, will consider this. > > > > > > > > > > > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > > > > > > triggering the lockups warning for the known slow path. > > > > > > > > > > > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > > > > > > write new ones :) > > > > > > > > > > > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > > > > > > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > > > > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > > > > > > warning seems sufficient? > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > > > > > > to this driver. The warning can be different. > > > > > > > > > > > > Right, so it looks to me the only possible setup is the > > > > > > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > > > > > > second (as softlockup did). > > > > > > > > > > > > And I think it would still make sense to fail, we can start with a > > > > > > very long timeout like 1 minutes and break the device. Does this make > > > > > > sense? > > > > > > > > > > > > Thanks > > > > > > > > > > I'd say we need to make this manageable then. > > > > > > > > Did you mean something like sysfs or module parameters? > > > > > > No I'd say pass it with an ioctl. > > > > > > > > Can't we do it normally > > > > > e.g. react to an interrupt to return to userspace? > > > > > > > > I didn't get the meaning of this. Sorry. > > > > > > > > Thanks > > > > > > Standard way to handle things that can timeout and where userspace > > > did not supply the time is to block until an interrupt > > > then return EINTR. > > > > Well this seems to be a huge change, ioctl(2) doesn't say it can > > return EINTR now. > > the one on fedora 37 does not but it says: > No single standard. Arguments, returns, and semantics of ioctl() vary according to the device driver in question (the call is > used as a catch-all for operations that don't cleanly fit the UNIX stream I/O model). > > so it depends on the device e.g. for a streams device it does: > https://pubs.opengroup.org/onlinepubs/9699919799/functions/ioctl.html > has EINTR. Ok, I saw signal(7) also mention about EINTR for ioctl(2): """ If a blocked call to one of the following interfaces is interrupted by a signal handler, then the call is automatically restarted after the signal handler re‐ turns if the SA_RESTART flag was used; otherwise the call fails with the error EINTR: * read(2), readv(2), write(2), writev(2), and ioctl(2) calls on "slow" devices. A "slow" device is one where the I/O call may block for an indefinite time, for example, a terminal, pipe, or socket. If an I/O call on a slow device has already transferred some data by the time it is interrupted by a signal handler, then the call will return a success status (normally, the number of bytes transferred). Note that a (local) disk is not a slow device according to this defi‐ nition; I/O operations on disk devices are not interrupted by signals. """ > > > > > Actually, a driver timeout is used by other drivers when using > > controlq/adminq (e.g i40e). Starting from a sane value (e.g 1 minutes > > to avoid false negatives) seems to be a good first step. > > Well because it's specific hardware so timeout matches what it can > promise. virtio spec does not give guarantees. One issue is with > software implementations. At the moment I can set a breakpoint in qemu > or vhost user backend and nothing bad happens in just continues. Yes but it should be no difference from using a kgdb to debug i40e drivers. > > > > > Userspace controls the timeout by > > > using e.g. alarm(2). > > > > Not used in iproute2 after a git grep. > > > > Thanks > > No need for iproute2 to do it user can just do it from shell. Or user can just press CTRL-C. Yes, but iproute2 needs to deal with EINTR, that is the challenge part, if we simply return an error, the next cvq command might get confused. Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > Only with a bad device. > > > > > > > > > > > > > > > > > > > > >> > > > > > > > > > > > >> > > > > > > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > > > > > > >>> let's poll for a bit before waiting? > > > > > > > > > > > >> > > > > > > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > > > > > > >> chance to run. > > > > > > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > > > > > > after kick. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > > > > > > >> > > > > > > > > > > > >> The current code did: > > > > > > > > > > > >> > > > > > > > > > > > >> 1) check for vq->broken > > > > > > > > > > > >> 2) wakeup during BAD_RING() > > > > > > > > > > > >> > > > > > > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > > > > > > >> > > > > > > > > > > > >> Thanks > > > > > > > > > > > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > > > > > > > > > > > > > > > > > > > > >
On Mon, Jan 30, 2023 at 03:44:24PM +0800, Jason Wang wrote: > On Mon, Jan 30, 2023 at 1:43 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Mon, Jan 30, 2023 at 10:53:54AM +0800, Jason Wang wrote: > > > On Sun, Jan 29, 2023 at 3:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Sun, Jan 29, 2023 at 01:48:49PM +0800, Jason Wang wrote: > > > > > On Fri, Jan 27, 2023 at 6:35 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > > > > > > > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > > > > > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > > > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > > > > > > > >>>>> > > > > > > > > > > > > >>>>> Same for timeout really. > > > > > > > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > > > > > > > >>>> > > > > > > > > > > > > >>>> Thanks > > > > > > > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > > > > > > > >> > > > > > > > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > > > > > > > >> > > > > > > > > > > > > >> > > > > > > > > > > > > >>> things we should be careful to address then: > > > > > > > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > > > > > > > >>> and to make sure policy is consistent. > > > > > > > > > > > > >> > > > > > > > > > > > > >> That's fine, will consider this. > > > > > > > > > > > > > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > > > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > > > > > > > triggering the lockups warning for the known slow path. > > > > > > > > > > > > > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > > > > > > > write new ones :) > > > > > > > > > > > > > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > > > > > > > > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > > > > > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > > > > > > > warning seems sufficient? > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > > > > > > > to this driver. The warning can be different. > > > > > > > > > > > > > > Right, so it looks to me the only possible setup is the > > > > > > > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > > > > > > > second (as softlockup did). > > > > > > > > > > > > > > And I think it would still make sense to fail, we can start with a > > > > > > > very long timeout like 1 minutes and break the device. Does this make > > > > > > > sense? > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > I'd say we need to make this manageable then. > > > > > > > > > > Did you mean something like sysfs or module parameters? > > > > > > > > No I'd say pass it with an ioctl. > > > > > > > > > > Can't we do it normally > > > > > > e.g. react to an interrupt to return to userspace? > > > > > > > > > > I didn't get the meaning of this. Sorry. > > > > > > > > > > Thanks > > > > > > > > Standard way to handle things that can timeout and where userspace > > > > did not supply the time is to block until an interrupt > > > > then return EINTR. > > > > > > Well this seems to be a huge change, ioctl(2) doesn't say it can > > > return EINTR now. > > > > the one on fedora 37 does not but it says: > > No single standard. Arguments, returns, and semantics of ioctl() vary according to the device driver in question (the call is > > used as a catch-all for operations that don't cleanly fit the UNIX stream I/O model). > > > > so it depends on the device e.g. for a streams device it does: > > https://pubs.opengroup.org/onlinepubs/9699919799/functions/ioctl.html > > has EINTR. > > Ok, I saw signal(7) also mention about EINTR for ioctl(2): > > """ > If a blocked call to one of the following interfaces is > interrupted by a signal handler, then the call is automatically > restarted after the signal handler re‐ > turns if the SA_RESTART flag was used; otherwise the call fails > with the error EINTR: > > * read(2), readv(2), write(2), writev(2), and ioctl(2) calls on > "slow" devices. A "slow" device is one where the I/O call may block > for an indefinite time, for > example, a terminal, pipe, or socket. If an I/O call on a > slow device has already transferred some data by the time it is > interrupted by a signal handler, > then the call will return a success status (normally, the > number of bytes transferred). Note that a (local) disk is not a slow > device according to this defi‐ > nition; I/O operations on disk devices are not interrupted by signals. > """ And note that if you interrupt then you don't know whether ioctl changed device state or not generally. > > > > > > > > > Actually, a driver timeout is used by other drivers when using > > > controlq/adminq (e.g i40e). Starting from a sane value (e.g 1 minutes > > > to avoid false negatives) seems to be a good first step. > > > > Well because it's specific hardware so timeout matches what it can > > promise. virtio spec does not give guarantees. One issue is with > > software implementations. At the moment I can set a breakpoint in qemu > > or vhost user backend and nothing bad happens in just continues. > > Yes but it should be no difference from using a kgdb to debug i40e drivers. Except one of the reasons people prefer programming in userspace is because debugging is so much less painful. Someone using kgdb knows what driver is doing and can work around that. > > > > > > > > Userspace controls the timeout by > > > > using e.g. alarm(2). > > > > > > Not used in iproute2 after a git grep. > > > > > > Thanks > > > > No need for iproute2 to do it user can just do it from shell. Or user can just press CTRL-C. > > Yes, but iproute2 needs to deal with EINTR, that is the challenge > part, if we simply return an error, the next cvq command might get > confused. > > Thanks You mean this: start command interrupt start next command ? next command is confused? I think if you try a new command until previous one finished it's ok to just return EBUSY. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > > > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > > > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > Only with a bad device. > > > > > > > > > > > > > > > > > > > > > > >> > > > > > > > > > > > > >> > > > > > > > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > > > > > > > >>> let's poll for a bit before waiting? > > > > > > > > > > > > >> > > > > > > > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > > > > > > > >> chance to run. > > > > > > > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > > > > > > > after kick. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > > > > > > > >> > > > > > > > > > > > > >> The current code did: > > > > > > > > > > > > >> > > > > > > > > > > > > >> 1) check for vq->broken > > > > > > > > > > > > >> 2) wakeup during BAD_RING() > > > > > > > > > > > > >> > > > > > > > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > > > > > > > >> > > > > > > > > > > > > >> Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >
On Mon, Jan 30, 2023 at 7:18 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Jan 30, 2023 at 03:44:24PM +0800, Jason Wang wrote: > > On Mon, Jan 30, 2023 at 1:43 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Mon, Jan 30, 2023 at 10:53:54AM +0800, Jason Wang wrote: > > > > On Sun, Jan 29, 2023 at 3:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Sun, Jan 29, 2023 at 01:48:49PM +0800, Jason Wang wrote: > > > > > > On Fri, Jan 27, 2023 at 6:35 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > > > > > > > > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > > > > > > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > > > > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > > > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > > > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > > > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > > > > > > > > >>>>> > > > > > > > > > > > > > >>>>> Same for timeout really. > > > > > > > > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > > > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > > > > > > > > >>>> > > > > > > > > > > > > > >>>> Thanks > > > > > > > > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > > > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > > > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> > > > > > > > > > > > > > >>> things we should be careful to address then: > > > > > > > > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > > > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > > > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > > > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > > > > > > > > >>> and to make sure policy is consistent. > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> That's fine, will consider this. > > > > > > > > > > > > > > > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > > > > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > > > > > > > > triggering the lockups warning for the known slow path. > > > > > > > > > > > > > > > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > > > > > > > > write new ones :) > > > > > > > > > > > > > > > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > > > > > > > > > > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > > > > > > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > > > > > > > > warning seems sufficient? > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > > > > > > > > to this driver. The warning can be different. > > > > > > > > > > > > > > > > Right, so it looks to me the only possible setup is the > > > > > > > > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > > > > > > > > second (as softlockup did). > > > > > > > > > > > > > > > > And I think it would still make sense to fail, we can start with a > > > > > > > > very long timeout like 1 minutes and break the device. Does this make > > > > > > > > sense? > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > I'd say we need to make this manageable then. > > > > > > > > > > > > Did you mean something like sysfs or module parameters? > > > > > > > > > > No I'd say pass it with an ioctl. > > > > > > > > > > > > Can't we do it normally > > > > > > > e.g. react to an interrupt to return to userspace? > > > > > > > > > > > > I didn't get the meaning of this. Sorry. > > > > > > > > > > > > Thanks > > > > > > > > > > Standard way to handle things that can timeout and where userspace > > > > > did not supply the time is to block until an interrupt > > > > > then return EINTR. > > > > > > > > Well this seems to be a huge change, ioctl(2) doesn't say it can > > > > return EINTR now. > > > > > > the one on fedora 37 does not but it says: > > > No single standard. Arguments, returns, and semantics of ioctl() vary according to the device driver in question (the call is > > > used as a catch-all for operations that don't cleanly fit the UNIX stream I/O model). > > > > > > so it depends on the device e.g. for a streams device it does: > > > https://pubs.opengroup.org/onlinepubs/9699919799/functions/ioctl.html > > > has EINTR. > > > > Ok, I saw signal(7) also mention about EINTR for ioctl(2): > > > > """ > > If a blocked call to one of the following interfaces is > > interrupted by a signal handler, then the call is automatically > > restarted after the signal handler re‐ > > turns if the SA_RESTART flag was used; otherwise the call fails > > with the error EINTR: > > > > * read(2), readv(2), write(2), writev(2), and ioctl(2) calls on > > "slow" devices. A "slow" device is one where the I/O call may block > > for an indefinite time, for > > example, a terminal, pipe, or socket. If an I/O call on a > > slow device has already transferred some data by the time it is > > interrupted by a signal handler, > > then the call will return a success status (normally, the > > number of bytes transferred). Note that a (local) disk is not a slow > > device according to this defi‐ > > nition; I/O operations on disk devices are not interrupted by signals. > > """ > > > And note that if you interrupt then you don't know whether ioctl > changed device state or not generally. Yes. > > > > > > > > > > > > > Actually, a driver timeout is used by other drivers when using > > > > controlq/adminq (e.g i40e). Starting from a sane value (e.g 1 minutes > > > > to avoid false negatives) seems to be a good first step. > > > > > > Well because it's specific hardware so timeout matches what it can > > > promise. virtio spec does not give guarantees. One issue is with > > > software implementations. At the moment I can set a breakpoint in qemu > > > or vhost user backend and nothing bad happens in just continues. > > > > Yes but it should be no difference from using a kgdb to debug i40e drivers. > > Except one of the reasons people prefer programming in userspace is > because debugging is so much less painful. Someone using kgdb > knows what driver is doing and can work around that. Ok. > > > > > > > > > > > > Userspace controls the timeout by > > > > > using e.g. alarm(2). > > > > > > > > Not used in iproute2 after a git grep. > > > > > > > > Thanks > > > > > > No need for iproute2 to do it user can just do it from shell. Or user can just press CTRL-C. > > > > Yes, but iproute2 needs to deal with EINTR, that is the challenge > > part, if we simply return an error, the next cvq command might get > > confused. > > > > Thanks > > You mean this: > start command > interrupt > start next command > > ? > > next command is confused? > I think if you try a new command until previous > one finished it's ok to just return EBUSY. That would be fine. And we go back to somehow the idea here: https://lore.kernel.org/all/CACGkMEvQwhOhgGW6F22+3vmR4AW90qYXF+ZO6BQZguUF2xt2SA@mail.gmail.com/T/#m2da63932eae775d7d05d93d44c2f1d115ffbcefe Will try to do that in the next version. Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > > > > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > > > > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > Only with a bad device. > > > > > > > > > > > > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> > > > > > > > > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > > > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > > > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > > > > > > > > >>> let's poll for a bit before waiting? > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > > > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > > > > > > > > >> chance to run. > > > > > > > > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > > > > > > > > after kick. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > > > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > > > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > > > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> The current code did: > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> 1) check for vq->broken > > > > > > > > > > > > > >> 2) wakeup during BAD_RING() > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > > > > > > > > >> > > > > > > > > > > > > > >> Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > > > > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > > > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >
On Tue, Jan 31, 2023 at 11:24:52AM +0800, Jason Wang wrote: > On Mon, Jan 30, 2023 at 7:18 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Mon, Jan 30, 2023 at 03:44:24PM +0800, Jason Wang wrote: > > > On Mon, Jan 30, 2023 at 1:43 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Mon, Jan 30, 2023 at 10:53:54AM +0800, Jason Wang wrote: > > > > > On Sun, Jan 29, 2023 at 3:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > On Sun, Jan 29, 2023 at 01:48:49PM +0800, Jason Wang wrote: > > > > > > > On Fri, Jan 27, 2023 at 6:35 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > On Fri, Dec 30, 2022 at 11:43:08AM +0800, Jason Wang wrote: > > > > > > > > > On Thu, Dec 29, 2022 at 4:10 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > On Thu, Dec 29, 2022 at 04:04:13PM +0800, Jason Wang wrote: > > > > > > > > > > > On Thu, Dec 29, 2022 at 3:07 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > On Wed, Dec 28, 2022 at 07:53:08PM +0800, Jason Wang wrote: > > > > > > > > > > > > > On Wed, Dec 28, 2022 at 2:34 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > 在 2022/12/27 17:38, Michael S. Tsirkin 写道: > > > > > > > > > > > > > > > On Tue, Dec 27, 2022 at 05:12:58PM +0800, Jason Wang wrote: > > > > > > > > > > > > > > >> 在 2022/12/27 15:33, Michael S. Tsirkin 写道: > > > > > > > > > > > > > > >>> On Tue, Dec 27, 2022 at 12:30:35PM +0800, Jason Wang wrote: > > > > > > > > > > > > > > >>>>> But device is still going and will later use the buffers. > > > > > > > > > > > > > > >>>>> > > > > > > > > > > > > > > >>>>> Same for timeout really. > > > > > > > > > > > > > > >>>> Avoiding infinite wait/poll is one of the goals, another is to sleep. > > > > > > > > > > > > > > >>>> If we think the timeout is hard, we can start from the wait. > > > > > > > > > > > > > > >>>> > > > > > > > > > > > > > > >>>> Thanks > > > > > > > > > > > > > > >>> If the goal is to avoid disrupting traffic while CVQ is in use, > > > > > > > > > > > > > > >>> that sounds more reasonable. E.g. someone is turning on promisc, > > > > > > > > > > > > > > >>> a spike in CPU usage might be unwelcome. > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> Yes, this would be more obvious is UP is used. > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >>> things we should be careful to address then: > > > > > > > > > > > > > > >>> 1- debugging. Currently it's easy to see a warning if CPU is stuck > > > > > > > > > > > > > > >>> in a loop for a while, and we also get a backtrace. > > > > > > > > > > > > > > >>> E.g. with this - how do we know who has the RTNL? > > > > > > > > > > > > > > >>> We need to integrate with kernel/watchdog.c for good results > > > > > > > > > > > > > > >>> and to make sure policy is consistent. > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> That's fine, will consider this. > > > > > > > > > > > > > > > > > > > > > > > > > > So after some investigation, it seems the watchdog.c doesn't help. The > > > > > > > > > > > > > only export helper is touch_softlockup_watchdog() which tries to avoid > > > > > > > > > > > > > triggering the lockups warning for the known slow path. > > > > > > > > > > > > > > > > > > > > > > > > I never said you can just use existing exporting APIs. You'll have to > > > > > > > > > > > > write new ones :) > > > > > > > > > > > > > > > > > > > > > > Ok, I thought you wanted to trigger similar warnings as a watchdog. > > > > > > > > > > > > > > > > > > > > > > Btw, I wonder what kind of logic you want here. If we switch to using > > > > > > > > > > > sleep, there won't be soft lockup anymore. A simple wait + timeout + > > > > > > > > > > > warning seems sufficient? > > > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > I'd like to avoid need to teach users new APIs. So watchdog setup to apply > > > > > > > > > > to this driver. The warning can be different. > > > > > > > > > > > > > > > > > > Right, so it looks to me the only possible setup is the > > > > > > > > > watchdog_thres. I plan to trigger the warning every watchdog_thres * 2 > > > > > > > > > second (as softlockup did). > > > > > > > > > > > > > > > > > > And I think it would still make sense to fail, we can start with a > > > > > > > > > very long timeout like 1 minutes and break the device. Does this make > > > > > > > > > sense? > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > I'd say we need to make this manageable then. > > > > > > > > > > > > > > Did you mean something like sysfs or module parameters? > > > > > > > > > > > > No I'd say pass it with an ioctl. > > > > > > > > > > > > > > Can't we do it normally > > > > > > > > e.g. react to an interrupt to return to userspace? > > > > > > > > > > > > > > I didn't get the meaning of this. Sorry. > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > Standard way to handle things that can timeout and where userspace > > > > > > did not supply the time is to block until an interrupt > > > > > > then return EINTR. > > > > > > > > > > Well this seems to be a huge change, ioctl(2) doesn't say it can > > > > > return EINTR now. > > > > > > > > the one on fedora 37 does not but it says: > > > > No single standard. Arguments, returns, and semantics of ioctl() vary according to the device driver in question (the call is > > > > used as a catch-all for operations that don't cleanly fit the UNIX stream I/O model). > > > > > > > > so it depends on the device e.g. for a streams device it does: > > > > https://pubs.opengroup.org/onlinepubs/9699919799/functions/ioctl.html > > > > has EINTR. > > > > > > Ok, I saw signal(7) also mention about EINTR for ioctl(2): > > > > > > """ > > > If a blocked call to one of the following interfaces is > > > interrupted by a signal handler, then the call is automatically > > > restarted after the signal handler re‐ > > > turns if the SA_RESTART flag was used; otherwise the call fails > > > with the error EINTR: > > > > > > * read(2), readv(2), write(2), writev(2), and ioctl(2) calls on > > > "slow" devices. A "slow" device is one where the I/O call may block > > > for an indefinite time, for > > > example, a terminal, pipe, or socket. If an I/O call on a > > > slow device has already transferred some data by the time it is > > > interrupted by a signal handler, > > > then the call will return a success status (normally, the > > > number of bytes transferred). Note that a (local) disk is not a slow > > > device according to this defi‐ > > > nition; I/O operations on disk devices are not interrupted by signals. > > > """ > > > > > > And note that if you interrupt then you don't know whether ioctl > > changed device state or not generally. > > Yes. > > > > > > > > > > > > > > > > > > Actually, a driver timeout is used by other drivers when using > > > > > controlq/adminq (e.g i40e). Starting from a sane value (e.g 1 minutes > > > > > to avoid false negatives) seems to be a good first step. > > > > > > > > Well because it's specific hardware so timeout matches what it can > > > > promise. virtio spec does not give guarantees. One issue is with > > > > software implementations. At the moment I can set a breakpoint in qemu > > > > or vhost user backend and nothing bad happens in just continues. > > > > > > Yes but it should be no difference from using a kgdb to debug i40e drivers. > > > > Except one of the reasons people prefer programming in userspace is > > because debugging is so much less painful. Someone using kgdb > > knows what driver is doing and can work around that. > > Ok. > > > > > > > > > > > > > > > > > Userspace controls the timeout by > > > > > > using e.g. alarm(2). > > > > > > > > > > Not used in iproute2 after a git grep. > > > > > > > > > > Thanks > > > > > > > > No need for iproute2 to do it user can just do it from shell. Or user can just press CTRL-C. > > > > > > Yes, but iproute2 needs to deal with EINTR, that is the challenge > > > part, if we simply return an error, the next cvq command might get > > > confused. > > > > > > Thanks > > > > You mean this: > > start command > > interrupt > > start next command > > > > ? > > > > next command is confused? > > I think if you try a new command until previous > > one finished it's ok to just return EBUSY. > > That would be fine. > > And we go back to somehow the idea here: > > https://lore.kernel.org/all/CACGkMEvQwhOhgGW6F22+3vmR4AW90qYXF+ZO6BQZguUF2xt2SA@mail.gmail.com/T/#m2da63932eae775d7d05d93d44c2f1d115ffbcefe > > Will try to do that in the next version. > > Thanks Where you wrote: We can put the process into interruptible sleep, then it should be fine? (FYI, some transport specific methods may sleep e.g ccw). indeed. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > And before the patch, we end up with a real infinite loop which could > > > > > > > > > > > > > be caught by RCU stall detector which is not the case of the sleep. > > > > > > > > > > > > > What we can do is probably do a periodic netdev_err(). > > > > > > > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > Only with a bad device. > > > > > > > > > > > > > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >>> 2- overhead. In a very common scenario when device is in hypervisor, > > > > > > > > > > > > > > >>> programming timers etc has a very high overhead, at bootup > > > > > > > > > > > > > > >>> lots of CVQ commands are run and slowing boot down is not nice. > > > > > > > > > > > > > > >>> let's poll for a bit before waiting? > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> Then we go back to the question of choosing a good timeout for poll. And > > > > > > > > > > > > > > >> poll seems problematic in the case of UP, scheduler might not have the > > > > > > > > > > > > > > >> chance to run. > > > > > > > > > > > > > > > Poll just a bit :) Seriously I don't know, but at least check once > > > > > > > > > > > > > > > after kick. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > I think it is what the current code did where the condition will be > > > > > > > > > > > > > > check before trying to sleep in the wait_event(). > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> 3- suprise removal. need to wake up thread in some way. what about > > > > > > > > > > > > > > >>> other cases of device breakage - is there a chance this > > > > > > > > > > > > > > >>> introduces new bugs around that? at least enumerate them please. > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> The current code did: > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> 1) check for vq->broken > > > > > > > > > > > > > > >> 2) wakeup during BAD_RING() > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> So we won't end up with a never woke up process which should be fine. > > > > > > > > > > > > > > >> > > > > > > > > > > > > > > >> Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > BTW BAD_RING on removal will trigger dev_err. Not sure that is a good > > > > > > > > > > > > > > > idea - can cause crashes if kernel panics on error. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Yes, it's better to use __virtqueue_break() instead. > > > > > > > > > > > > > > > > > > > > > > > > > > > > But consider we will start from a wait first, I will limit the changes > > > > > > > > > > > > > > in virtio-net without bothering virtio core. > > > > > > > > > > > > > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >>> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > >
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 5cfb2fa8abee..9c83eb945493 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -13,6 +13,7 @@ #include <linux/dma-mapping.h> #include <linux/kmsan.h> #include <linux/spinlock.h> +#include <linux/wait.h> #include <xen/xen.h> #ifdef DEBUG @@ -60,6 +61,7 @@ "%s:"fmt, (_vq)->vq.name, ##args); \ /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ \ WRITE_ONCE((_vq)->broken, true); \ + wake_up_interruptible(&(_vq)->wq); \ } while (0) #define START_USE(vq) #define END_USE(vq) @@ -203,6 +205,9 @@ struct vring_virtqueue { /* DMA, allocation, and size information */ bool we_own_ring; + /* Wait for buffer to be used */ + wait_queue_head_t wq; + #ifdef DEBUG /* They're supposed to lock for us. */ unsigned int in_use; @@ -2024,6 +2029,8 @@ static struct virtqueue *vring_create_virtqueue_packed( if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; + init_waitqueue_head(&vq->wq); + err = vring_alloc_state_extra_packed(&vring_packed); if (err) goto err_state_extra; @@ -2517,6 +2524,8 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; + init_waitqueue_head(&vq->wq); + err = vring_alloc_state_extra_split(vring_split); if (err) { kfree(vq); @@ -2654,6 +2663,8 @@ static void vring_free(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + wake_up_interruptible(&vq->wq); + if (vq->we_own_ring) { if (vq->packed_ring) { vring_free_queue(vq->vq.vdev, @@ -2863,4 +2874,22 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_get_vring); +int virtqueue_wait_for_used(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + /* TODO: Tweak the timeout. */ + return wait_event_interruptible_timeout(vq->wq, + virtqueue_is_broken(_vq) || more_used(vq), HZ); +} +EXPORT_SYMBOL_GPL(virtqueue_wait_for_used); + +void virtqueue_wake_up(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + wake_up_interruptible(&vq->wq); +} +EXPORT_SYMBOL_GPL(virtqueue_wake_up); + MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index dcab9c7e8784..2eb62c774895 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -72,6 +72,9 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, void **ctx); +int virtqueue_wait_for_used(struct virtqueue *vq); +void virtqueue_wake_up(struct virtqueue *vq); + void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq);
This patch introduces a per virtqueue waitqueue to allow driver to sleep and wait for more used. Two new helpers are introduced to allow driver to sleep and wake up. Signed-off-by: Jason Wang <jasowang@redhat.com> --- Changes since V1: - check virtqueue_is_broken() as well - use more_used() instead of virtqueue_get_buf() to allow caller to get buffers afterwards --- drivers/virtio/virtio_ring.c | 29 +++++++++++++++++++++++++++++ include/linux/virtio.h | 3 +++ 2 files changed, 32 insertions(+)