
[RFC,13/27] vhost: Send buffers to device

Message ID 20201120185105.279030-14-eperezma@redhat.com (mailing list archive)
State New, archived
Series vDPA software assisted live migration

Commit Message

Eugenio Perez Martin Nov. 20, 2020, 6:50 p.m. UTC
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 hw/virtio/vhost-sw-lm-ring.h |   3 +
 hw/virtio/vhost-sw-lm-ring.c | 134 +++++++++++++++++++++++++++++++++--
 hw/virtio/vhost.c            |  59 ++++++++++++++-
 3 files changed, 189 insertions(+), 7 deletions(-)

Comments

Stefan Hajnoczi Dec. 8, 2020, 8:16 a.m. UTC | #1
On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> -static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq)
> +static bool vhost_vring_should_kick_rcu(VhostShadowVirtqueue *vq)

"vhost_vring_" is a confusing name. This is not related to
vhost_virtqueue or the vhost_vring_* structs.

vhost_shadow_vq_should_kick_rcu()?

>  {
> -    return virtio_queue_get_used_notify_split(vq->vq);
> +    VirtIODevice *vdev = vq->vdev;
> +    vq->num_added = 0;

I'm surprised that a bool function modifies state. Is this assignment
intentional?

> +/* virtqueue_add:
> + * @vq: The #VirtQueue
> + * @elem: The #VirtQueueElement

The copy-pasted doc comment doesn't match this function.

> +int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem)
> +{
> +    int host_head = vhost_vring_add_split(vq, elem);
> +    if (vq->ring_id_maps[host_head]) {
> +        g_free(vq->ring_id_maps[host_head]);
> +    }

VirtQueueElement is freed lazily? Is there a reason for this approach? I
would have expected it to be freed when the used element is processed by
the kick fd handler.

> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 9352c56bfa..304e0baa61 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -956,8 +956,34 @@ static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq)
>      uint16_t idx = virtio_get_queue_index(vq);
>  
>      VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx];
> +    VirtQueueElement *elem;
>  
> -    vhost_vring_kick(svq);
> +    /*
> +     * Make available all buffers as possible.
> +     */
> +    do {
> +        if (virtio_queue_get_notification(vq)) {
> +            virtio_queue_set_notification(vq, false);
> +        }
> +
> +        while (true) {
> +            int r;
> +            if (virtio_queue_full(vq)) {
> +                break;
> +            }

Why is this check necessary? The guest cannot provide more descriptors
than there is ring space. If that happens somehow then it's a driver
error that is already reported in virtqueue_pop() below.

I wonder if you added this because the vring implementation above
doesn't support indirect descriptors? It's easy to exhaust the vhost
hdev vring while there is still room in the VirtIODevice's VirtQueue
vring.
Eugenio Perez Martin Dec. 9, 2020, 6:41 p.m. UTC | #2
On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
>
> On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > -static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq)
> > +static bool vhost_vring_should_kick_rcu(VhostShadowVirtqueue *vq)
>
> "vhost_vring_" is a confusing name. This is not related to
> vhost_virtqueue or the vhost_vring_* structs.
>
> vhost_shadow_vq_should_kick_rcu()?
>
> >  {
> > -    return virtio_queue_get_used_notify_split(vq->vq);
> > +    VirtIODevice *vdev = vq->vdev;
> > +    vq->num_added = 0;
>
> I'm surprised that a bool function modifies state. Is this assignment
> intentional?
>

It mirrors the kernel's virtqueue_kick_prepare_split function. The
num_added member is internal (mutable) state that counts the buffers
added since the last notification, so the driver can force a
notification if the uint16_t counter wraps in vhost_vring_add_split
with no notification in between. I don't know whether any real virtio
device could actually be affected by this, since real vqs are smaller
than (uint16_t)-1, so the device should already be aware that more
buffers have been added anyway.
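
For illustration, a minimal sketch of that wrap check, in the same
shape as this patch uses it (untested):

    /* Kick if 2^16 buffers were added since the last notification, so
     * the 16-bit counter cannot wrap unnoticed. */
    if (unlikely(vq->num_added++ == (uint16_t)-1)) {
        vhost_vring_kick(vq);
    }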

> > +/* virtqueue_add:
> > + * @vq: The #VirtQueue
> > + * @elem: The #VirtQueueElement
>
> The copy-pasted doc comment doesn't match this function.
>

Right, I will fix it.

> > +int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem)
> > +{
> > +    int host_head = vhost_vring_add_split(vq, elem);
> > +    if (vq->ring_id_maps[host_head]) {
> > +        g_free(vq->ring_id_maps[host_head]);
> > +    }
>
> VirtQueueElement is freed lazily? Is there a reason for this approach? I
> would have expected it to be freed when the used element is process by
> the kick fd handler.
>

Maybe it makes more sense to free it immediately in this commit and
introduce ring_id_maps in a later commit, yes.
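
As a rough, untested sketch of what the immediate-free variant of this
commit could look like (ring_id_maps would then come back in a later
commit):

    int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem)
    {
        int host_head = vhost_vring_add_split(vq, elem);

        /* Hypothetical immediate free: no ring_id_maps in this commit;
         * a later commit would store the element so it can be returned
         * to the guest when the device marks it as used. */
        g_free(elem);
        return host_head;
    }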

> > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > index 9352c56bfa..304e0baa61 100644
> > --- a/hw/virtio/vhost.c
> > +++ b/hw/virtio/vhost.c
> > @@ -956,8 +956,34 @@ static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq)
> >      uint16_t idx = virtio_get_queue_index(vq);
> >
> >      VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx];
> > +    VirtQueueElement *elem;
> >
> > -    vhost_vring_kick(svq);
> > +    /*
> > +     * Make available all buffers as possible.
> > +     */
> > +    do {
> > +        if (virtio_queue_get_notification(vq)) {
> > +            virtio_queue_set_notification(vq, false);
> > +        }
> > +
> > +        while (true) {
> > +            int r;
> > +            if (virtio_queue_full(vq)) {
> > +                break;
> > +            }
>
> Why is this check necessary? The guest cannot provide more descriptors
> than there is ring space. If that happens somehow then it's a driver
> error that is already reported in virtqueue_pop() below.
>

It's only checked because virtqueue_pop prints an error in that case,
and there is no way to tell a genuine driver error apart from this
expected condition. Maybe the right thing to do is simply not to print
that error, and let the caller do the error reporting in that case?
Should we return an error code?

> I wonder if you added this because the vring implementation above
> doesn't support indirect descriptors? It's easy to exhaust the vhost
> hdev vring while there is still room in the VirtIODevice's VirtQueue
> vring.
Stefan Hajnoczi Dec. 10, 2020, 11:55 a.m. UTC | #3
On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > +        while (true) {
> > > +            int r;
> > > +            if (virtio_queue_full(vq)) {
> > > +                break;
> > > +            }
> >
> > Why is this check necessary? The guest cannot provide more descriptors
> > than there is ring space. If that happens somehow then it's a driver
> > error that is already reported in virtqueue_pop() below.
> >
> 
> It's just checked because virtqueue_pop prints an error on that case,
> and there is no way to tell the difference between a regular error and
> another caused by other causes. Maybe the right thing to do is just to
> not to print that error? Caller should do the error printing in that
> case. Should we return an error code?

The reason an error is printed today is because it's a guest error that
never happens with correct guest drivers. Something is broken and the
user should know about it.

Why is "virtio_queue_full" (I already forgot what that actually means,
it's not clear whether this is referring to avail elements or used
elements) a condition that should be silently ignored in shadow vqs?

Stefan
Eugenio Perez Martin Jan. 22, 2021, 6:18 p.m. UTC | #4
On Thu, Dec 10, 2020 at 12:55 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> > On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > > +        while (true) {
> > > > +            int r;
> > > > +            if (virtio_queue_full(vq)) {
> > > > +                break;
> > > > +            }
> > >
> > > Why is this check necessary? The guest cannot provide more descriptors
> > > than there is ring space. If that happens somehow then it's a driver
> > > error that is already reported in virtqueue_pop() below.
> > >
> >
> > It's just checked because virtqueue_pop prints an error on that case,
> > and there is no way to tell the difference between a regular error and
> > another caused by other causes. Maybe the right thing to do is just to
> > not to print that error? Caller should do the error printing in that
> > case. Should we return an error code?
>
> The reason an error is printed today is because it's a guest error that
> never happens with correct guest drivers. Something is broken and the
> user should know about it.
>
> Why is "virtio_queue_full" (I already forgot what that actually means,
> it's not clear whether this is referring to avail elements or used
> elements) a condition that should be silently ignored in shadow vqs?
>

TL;DR: It can be changed to a check of the number of available
descriptors in the shadow vq, instead of treating it as a regular
return. However, I think that making it a special return value of
virtqueue_pop could help devices that run to completion, avoiding
having to duplicate the counting logic in them.

The function virtio_queue_empty checks whether the device has already
taken every descriptor the driver made available, so it has no more
work to do until the driver makes another descriptor available. I can
see how it can be a bad name choice, but virtio_queue_full means the
opposite: the device has popped every available descriptor and has not
returned any, so the driver cannot progress until the device marks
some descriptors as used.

As I understand it, vq->in_use > vq->num would mean we have a bug in
the device vq code, not in the driver. virtio_queue_full could even be
changed to "assert(vq->inuse <= vq->vring.num); return vq->inuse ==
vq->vring.num", as long as vq->inuse is kept up to date.
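
Spelled out, that suggestion would be roughly (sketch only, against
QEMU's VirtQueue fields, assuming vq->inuse is kept up to date):

    /* "Full" here: the device has popped every available descriptor
     * and has not returned any of them as used yet. */
    bool virtio_queue_full(VirtQueue *vq)
    {
        assert(vq->inuse <= vq->vring.num);
        return vq->inuse == vq->vring.num;
    }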

If we hit vq->in_use == vq->num in virtqueue_pop it means the device
tried to pop() one more buffer after having already popped every
available one. This would be invalid if the device were counting the
number of in-use descriptors correctly, but then we would be
duplicating that logic in both the device and the vq.

In the shadow vq this situation happens with a correct guest network
driver, since the rx queue is filled with buffers for the device to
write into. The network device in qemu fetches descriptors on demand,
but the shadow vq fetches everything available in one batch. If the
driver just happens to fill the queue of available descriptors, the
error will be logged, so we need to check in handle_sw_lm_vq before
calling pop(). Of course the shadow vq could duplicate
guest_vq->in_use instead of checking virtio_queue_full, but then it
needs to check two things for every virtqueue_pop() [1].
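
To illustrate that alternative (the num_pending field is hypothetical,
it is not in this patch), the forwarding loop would have to track the
count itself and still handle the NULL return:

    /* Sketch: duplicate the in-flight count in the shadow virtqueue. */
    while (svq->num_pending < virtio_queue_get_num(vdev, idx)) { /* 1 */
        elem = virtqueue_pop(vq, sizeof(*elem));
        if (!elem) {                                              /* 2 */
            break;
        }
        svq->num_pending++;
        vhost_vring_add(svq, elem);
    }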

Having said that, would you prefer a check of the available slots in
the shadow vq?

Thanks!

[1] if we don't change virtqueue_pop code.
Stefan Hajnoczi March 22, 2021, 10:51 a.m. UTC | #5
On Thu, Mar 11, 2021 at 07:53:53PM +0100, Eugenio Perez Martin wrote:
> On Fri, Jan 22, 2021 at 7:18 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Thu, Dec 10, 2020 at 12:55 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > >
> > > On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> > > > On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > > > > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > > > > +        while (true) {
> > > > > > +            int r;
> > > > > > +            if (virtio_queue_full(vq)) {
> > > > > > +                break;
> > > > > > +            }
> > > > >
> > > > > Why is this check necessary? The guest cannot provide more descriptors
> > > > > than there is ring space. If that happens somehow then it's a driver
> > > > > error that is already reported in virtqueue_pop() below.
> > > > >
> > > >
> > > > It's just checked because virtqueue_pop prints an error on that case,
> > > > and there is no way to tell the difference between a regular error and
> > > > another caused by other causes. Maybe the right thing to do is just to
> > > > not to print that error? Caller should do the error printing in that
> > > > case. Should we return an error code?
> > >
> > > The reason an error is printed today is because it's a guest error that
> > > never happens with correct guest drivers. Something is broken and the
> > > user should know about it.
> > >
> > > Why is "virtio_queue_full" (I already forgot what that actually means,
> > > it's not clear whether this is referring to avail elements or used
> > > elements) a condition that should be silently ignored in shadow vqs?
> > >
> >
> > TL;DR: It can be changed to a check of the number of available
> > descriptors in shadow vq, instead of returning as a regular operation.
> > However, I think that making it a special return of virtqueue_pop
> > could help in devices that run to completion, avoiding having to
> > duplicate the count logic in them.
> >
> > The function virtio_queue_empty checks if the vq has all descriptors
> > available, so the device has no more work to do until the driver makes
> > another descriptor available. I can see how it can be a bad name
> > choice, but virtio_queue_full means the opposite: device has pop()
> > every descriptor available, and it has not returned any, so the driver
> > cannot progress until the device marks some descriptors as used.
> >
> > As I understand, if vq->in_use >vq->num would mean we have a bug in
> > the device vq code, not in the driver. virtio_queue_full could even be
> > changed to "assert(vq->inuse <= vq->vring.num); return vq->inuse ==
> > vq->vring.num", as long as vq->in_use is operated right.
> >
> > If we hit vq->in_use == vq->num in virtqueue_pop it means the device
> > tried to pop() one more buffer after having all of them available and
> > pop'ed. This would be invalid if the device is counting right the
> > number of in_use descriptors, but then we are duplicating that logic
> > in the device and the vq.

Devices call virtqueue_pop() until it returns NULL. They don't need to
count virtqueue buffers explicitly. It returns NULL when vq->num
virtqueue buffers have already been popped (either because
virtio_queue_empty() is true or because an invalid driver state is
detected by checking vq->num in virtqueue_pop()).
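
I.e. the usual device-side pattern is just (sketch; len and vdev are
whatever the device computes for the request):

    for (;;) {
        VirtQueueElement *elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elem) {
            break; /* queue empty, or invalid driver state already reported */
        }
        /* ... process the request ... */
        virtqueue_push(vq, elem, len);
        g_free(elem);
    }
    virtio_notify(vdev, vq);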

> > In shadow vq this situation happens with the correct guest network
> > driver, since the rx queue is filled for the device to write. Network
> > device in qemu fetch descriptors on demand, but shadow vq fetch all
> > available in batching. If the driver just happens to fill the queue of
> > available descriptors, the log will raise, so we need to check in
> > handle_sw_lm_vq before calling pop(). Of course the shadow vq can
> > duplicate guest_vq->in_use instead of checking virtio_queue_full, but
> > then it needs to check two things for every virtqueue_pop() [1].

I don't understand this scenario. It sounds like you are saying the
guest and shadow rx vq are not in sync so there is a case where
vq->in_use > vq->num is triggered? I'm not sure how that can happen
since both vqs have equal vq->num. Maybe you can explain the scenario in
more detail?

Stefan
Eugenio Perez Martin March 22, 2021, 3:55 p.m. UTC | #6
On Mon, Mar 22, 2021 at 11:51 AM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Thu, Mar 11, 2021 at 07:53:53PM +0100, Eugenio Perez Martin wrote:
> > On Fri, Jan 22, 2021 at 7:18 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Thu, Dec 10, 2020 at 12:55 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > >
> > > > On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> > > > > On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > > > > > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > > > > > +        while (true) {
> > > > > > > +            int r;
> > > > > > > +            if (virtio_queue_full(vq)) {
> > > > > > > +                break;
> > > > > > > +            }
> > > > > >
> > > > > > Why is this check necessary? The guest cannot provide more descriptors
> > > > > > than there is ring space. If that happens somehow then it's a driver
> > > > > > error that is already reported in virtqueue_pop() below.
> > > > > >
> > > > >
> > > > > It's just checked because virtqueue_pop prints an error on that case,
> > > > > and there is no way to tell the difference between a regular error and
> > > > > another caused by other causes. Maybe the right thing to do is just to
> > > > > not to print that error? Caller should do the error printing in that
> > > > > case. Should we return an error code?
> > > >
> > > > The reason an error is printed today is because it's a guest error that
> > > > never happens with correct guest drivers. Something is broken and the
> > > > user should know about it.
> > > >
> > > > Why is "virtio_queue_full" (I already forgot what that actually means,
> > > > it's not clear whether this is referring to avail elements or used
> > > > elements) a condition that should be silently ignored in shadow vqs?
> > > >
> > >
> > > TL;DR: It can be changed to a check of the number of available
> > > descriptors in shadow vq, instead of returning as a regular operation.
> > > However, I think that making it a special return of virtqueue_pop
> > > could help in devices that run to completion, avoiding having to
> > > duplicate the count logic in them.
> > >
> > > The function virtio_queue_empty checks if the vq has all descriptors
> > > available, so the device has no more work to do until the driver makes
> > > another descriptor available. I can see how it can be a bad name
> > > choice, but virtio_queue_full means the opposite: device has pop()
> > > every descriptor available, and it has not returned any, so the driver
> > > cannot progress until the device marks some descriptors as used.
> > >
> > > As I understand, if vq->in_use >vq->num would mean we have a bug in
> > > the device vq code, not in the driver. virtio_queue_full could even be
> > > changed to "assert(vq->inuse <= vq->vring.num); return vq->inuse ==
> > > vq->vring.num", as long as vq->in_use is operated right.
> > >
> > > If we hit vq->in_use == vq->num in virtqueue_pop it means the device
> > > tried to pop() one more buffer after having all of them available and
> > > pop'ed. This would be invalid if the device is counting right the
> > > number of in_use descriptors, but then we are duplicating that logic
> > > in the device and the vq.
>
> Devices call virtqueue_pop() until it returns NULL. They don't need to
> count virtqueue buffers explicitly. It returns NULL when vq->num
> virtqueue buffers have already been popped (either because
> virtio_queue_empty() is true or because an invalid driver state is
> detected by checking vq->num in virtqueue_pop()).

If I understood you right, virtio_queue_full addresses the reverse
problem: it signals when the virtqueue has run out of buffers to make
available to the device because the latter has not consumed any, not
when the driver stops offering buffers to the device because it has no
more data to offer.

I find it easier to explain with the virtio-net rx queue (and I think
it's the easiest way to trigger this issue). You are describing its
regular behavior: the guest fills it (let's say 100%), and the device
picks buffers one by one:

virtio_net_receive_rcu:
while (offset < size) {
    elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
    if (!elem) {
        virtio_error("unexpected empty queue");
    }
    /* [1] */
    /* fill elem with rx packet */
    virtqueue_fill(virtqueue, elem);
    ...
    virtqueue_flush(q->rx_vq, i);
}

Every device, as far as I know, does this buffer by buffer; there is
just processing code at [1], and it never tries to pop more than one
buffer/chain of buffers at a time. In the case of an empty queue (no
more available buffers) we hit an error, because there are no more
buffers to write into. In other devices (or the tx queue), an empty
queue just means there is no more work to do, not an error.

In the case of the shadow virtqueue, we cannot limit the number of
exposed rx buffers to one buffer/chain of buffers at [1], since that
would hurt batching. We have the opposite problem: all devices (except
the rx queue) want to drain the queue as much as possible, i.e. to
consume every available buffer. The net rx queue is fine as long as it
has a buffer/buffer chain big enough to write into, and it fetches
them on demand, so "queue full" (as in all buffers are available) is
not a problem for the device.

However, the part of the shadow virtqueue that forwards the available
buffers seeks the opposite: it wants as many buffers as possible to be
available to the device. That means there is no [1]-style code that
fills/reads and flushes/detaches the buffer immediately: the shadow
virtqueue wants to make as many buffers as possible available, but the
device may not use them until it has more data available. In the
extreme case (virtio-net rx queue full), the shadow virtqueue may make
every buffer available, so in a while(true) loop it will keep making
them available until it finds that all buffers are already available
(vq->in_use == vq->num).

The solution is to check the number of buffers already available
before calling virtqueue_pop(). We could duplicate in_use in the
shadow virtqueue, of course, but everything we need is already done in
the VirtQueue code, so I think reusing it is the better solution.
Another option would be to treat vq->in_use == vq->num as a special
return code with no printed error in virtqueue_pop, but exposing
whether the queue is full (as vq->in_use == vq->num) sounds less
invasive to me.
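
In code, the check-before-pop approach is what this patch does in
handle_sw_lm_vq (sketch of the relevant loop):

    while (true) {
        if (virtio_queue_full(vq)) {
            /* Every guest descriptor is already exposed to the device;
             * nothing to forward until some are marked as used. */
            break;
        }
        elem = virtqueue_pop(vq, sizeof(*elem));
        if (!elem) {
            break;
        }
        r = vhost_vring_add(svq, elem);
        assert(r >= 0);
        vhost_vring_kick(svq);
    }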

>
> > > In shadow vq this situation happens with the correct guest network
> > > driver, since the rx queue is filled for the device to write. Network
> > > device in qemu fetch descriptors on demand, but shadow vq fetch all
> > > available in batching. If the driver just happens to fill the queue of
> > > available descriptors, the log will raise, so we need to check in
> > > handle_sw_lm_vq before calling pop(). Of course the shadow vq can
> > > duplicate guest_vq->in_use instead of checking virtio_queue_full, but
> > > then it needs to check two things for every virtqueue_pop() [1].
>
> I don't understand this scenario. It sounds like you are saying the
> guest and shadow rx vq are not in sync so there is a case where
> vq->in_use > vq->num is triggered?

Sorry if I explained it badly; what I meant is that there is a case
where SVQ (as device code) will call virtqueue_pop() when vq->in_use
== vq->num. virtio_queue_full keeps the check as >=, but I think it
would even be safe to code virtio_queue_full as:

assert(vq->in_use <= vq->num);
return vq->inuse == vq->num;

Please let me know if this is not clear enough.

Thanks!

> I'm not sure how that can happen
> since both vqs have equal vq->num. Maybe you can explain the scenario in
> more detail?
>
> Stefan
Stefan Hajnoczi March 22, 2021, 5:40 p.m. UTC | #7
On Mon, Mar 22, 2021 at 04:55:13PM +0100, Eugenio Perez Martin wrote:
> On Mon, Mar 22, 2021 at 11:51 AM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >
> > On Thu, Mar 11, 2021 at 07:53:53PM +0100, Eugenio Perez Martin wrote:
> > > On Fri, Jan 22, 2021 at 7:18 PM Eugenio Perez Martin
> > > <eperezma@redhat.com> wrote:
> > > >
> > > > On Thu, Dec 10, 2020 at 12:55 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > > >
> > > > > On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> > > > > > On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > > > > > > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > > > > > > +        while (true) {
> > > > > > > > +            int r;
> > > > > > > > +            if (virtio_queue_full(vq)) {
> > > > > > > > +                break;
> > > > > > > > +            }
> > > > > > >
> > > > > > > Why is this check necessary? The guest cannot provide more descriptors
> > > > > > > than there is ring space. If that happens somehow then it's a driver
> > > > > > > error that is already reported in virtqueue_pop() below.
> > > > > > >
> > > > > >
> > > > > > It's just checked because virtqueue_pop prints an error on that case,
> > > > > > and there is no way to tell the difference between a regular error and
> > > > > > another caused by other causes. Maybe the right thing to do is just to
> > > > > > not to print that error? Caller should do the error printing in that
> > > > > > case. Should we return an error code?
> > > > >
> > > > > The reason an error is printed today is because it's a guest error that
> > > > > never happens with correct guest drivers. Something is broken and the
> > > > > user should know about it.
> > > > >
> > > > > Why is "virtio_queue_full" (I already forgot what that actually means,
> > > > > it's not clear whether this is referring to avail elements or used
> > > > > elements) a condition that should be silently ignored in shadow vqs?
> > > > >
> > > >
> > > > TL;DR: It can be changed to a check of the number of available
> > > > descriptors in shadow vq, instead of returning as a regular operation.
> > > > However, I think that making it a special return of virtqueue_pop
> > > > could help in devices that run to completion, avoiding having to
> > > > duplicate the count logic in them.
> > > >
> > > > The function virtio_queue_empty checks if the vq has all descriptors
> > > > available, so the device has no more work to do until the driver makes
> > > > another descriptor available. I can see how it can be a bad name
> > > > choice, but virtio_queue_full means the opposite: device has pop()
> > > > every descriptor available, and it has not returned any, so the driver
> > > > cannot progress until the device marks some descriptors as used.
> > > >
> > > > As I understand, if vq->in_use >vq->num would mean we have a bug in
> > > > the device vq code, not in the driver. virtio_queue_full could even be
> > > > changed to "assert(vq->inuse <= vq->vring.num); return vq->inuse ==
> > > > vq->vring.num", as long as vq->in_use is operated right.
> > > >
> > > > If we hit vq->in_use == vq->num in virtqueue_pop it means the device
> > > > tried to pop() one more buffer after having all of them available and
> > > > pop'ed. This would be invalid if the device is counting right the
> > > > number of in_use descriptors, but then we are duplicating that logic
> > > > in the device and the vq.
> >
> > Devices call virtqueue_pop() until it returns NULL. They don't need to
> > count virtqueue buffers explicitly. It returns NULL when vq->num
> > virtqueue buffers have already been popped (either because
> > virtio_queue_empty() is true or because an invalid driver state is
> > detected by checking vq->num in virtqueue_pop()).
> 
> If I understood you right, the virtio_queue_full addresses the reverse
> problem: it controls when the virtqueue is out of buffers to make
> available for the device because the latter has not consumed any, not
> when the driver does not offer more buffers to the device because it
> has no more data to offer.
> 
> I find it easier to explain with the virtio-net rx queue (and I think
> it's the easier way to trigger this issue). You are describing it's
> regular behavior: The guest fills it (let's say 100%), and the device
> picks buffers one by one:
> 
> virtio_net_receive_rcu:
> while (offset < size) {
>     elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));

The lines before this loop return early when the virtqueue does not have
sufficient buffer space:

  if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
      return 0;
  }

When entering this loop we know that we can pop the buffers needed to
fill one rx packet.

>     if (!elem) {
>         virtio_error("unexpected empty queue");
>     }
>     /* [1] */
>     /* fill elem with rx packet */
>     virtqueue_fill(virtqueue, elem);
>     ...
>     virtqueue_flush(q->rx_vq, i);
> }
> 
> Every device as far as I know does this buffer by buffer, there is
> just processing code in [1], and it never tries to pop more than one
> buffers/chain of buffers at the same time. In the case of a queue
> empty (no more available buffers), we hit an error, because there are
> no more buffers to write.

It's an error because we already checked that the virtqueue has buffer
space. This should never happen.

> In other devices (or tx queue), empty
> buffers means there is no more work to do, not an error.
> 
> In the case of shadow virtqueue, we cannot limit the number of exposed
> rx buffers to 1 buffer/chain of buffers in [1], since it will affect
> batching. We have the opposite problem: All devices (but rx queue)
> want to queue "as empty as possible", or "to mark all available
> buffers empty". Net rx queue is ok as long as it has a buffer/buffer
> chain big enough to write to, but it will fetch them on demand, so
> "queue full" (as in all buffers are available) is not a problem for
> the device.
> 
> However, the part of the shadow virtqueue that forwards the available
> buffer seeks the opposite: It wants as many buffers as possible to be
> available. That means that there is no [1] code that fills/read &
> flush/detach the buffer immediately: Shadow virtqueue wants to make
> available as many buffers as possible, but the device may not use them
> until it has more data available. To the extreme (virtio-net rx queue
> full), shadow virtqueue may make available all buffers, so in a
> while(true) loop, it will try to make them available until it hits
> that all the buffers are already available (vq->in_use == vq->num).
> 
> The solution is to check the number of buffers already available
> before calling virtio_queue_pop(). We could duplicate in_use in shadow
> virtqueue, of course, but everything we need is already done in
> VirtQueue code, so I think to reuse it is a better solution. Another
> solution could be to treat vq->in_use == vq->num as an special return
> code with no printed error in virtqueue_pop, but to expose if the
> queue is full (as vq->in_use == vq->num) sounds less invasive to me.
>
> >
> > > > In shadow vq this situation happens with the correct guest network
> > > > driver, since the rx queue is filled for the device to write. Network
> > > > device in qemu fetch descriptors on demand, but shadow vq fetch all
> > > > available in batching. If the driver just happens to fill the queue of
> > > > available descriptors, the log will raise, so we need to check in
> > > > handle_sw_lm_vq before calling pop(). Of course the shadow vq can
> > > > duplicate guest_vq->in_use instead of checking virtio_queue_full, but
> > > > then it needs to check two things for every virtqueue_pop() [1].
> >
> > I don't understand this scenario. It sounds like you are saying the
> > guest and shadow rx vq are not in sync so there is a case where
> > vq->in_use > vq->num is triggered?
> 
> Sorry if I explain it bad, what I meant is that there is a case where
> SVQ (as device code) will call virtqueue_pop() when vq->in_use ==
> vq->num. virtio_queue_full maintains the check as >=, I think it
> should be safe to even to code virtio_queue_full to:
> 
> assert(vq->in_use > vq->num);
> return vq->inuse == vq->num;
> 
> Please let me know if this is not clear enough.

I don't get it. When virtqueue_split_pop() has popped all requests
virtio_queue_empty_rcu() should return true and we shouldn't reach the
if (vq->inuse >= vq->vring.num) check. The guest driver cannot submit
more available buffers at this point.

I only checked split rings, not packed rings.

Can you point to the SVQ code which has this problem? It may be easier
to re-read the code than try to describe it in an email.

Stefan
Eugenio Perez Martin March 24, 2021, 7:04 p.m. UTC | #8
On Mon, Mar 22, 2021 at 6:40 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Mon, Mar 22, 2021 at 04:55:13PM +0100, Eugenio Perez Martin wrote:
> > On Mon, Mar 22, 2021 at 11:51 AM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > >
> > > On Thu, Mar 11, 2021 at 07:53:53PM +0100, Eugenio Perez Martin wrote:
> > > > On Fri, Jan 22, 2021 at 7:18 PM Eugenio Perez Martin
> > > > <eperezma@redhat.com> wrote:
> > > > >
> > > > > On Thu, Dec 10, 2020 at 12:55 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> > > > > > > On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > > > > > > > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > > > > > > > +        while (true) {
> > > > > > > > > +            int r;
> > > > > > > > > +            if (virtio_queue_full(vq)) {
> > > > > > > > > +                break;
> > > > > > > > > +            }
> > > > > > > >
> > > > > > > > Why is this check necessary? The guest cannot provide more descriptors
> > > > > > > > than there is ring space. If that happens somehow then it's a driver
> > > > > > > > error that is already reported in virtqueue_pop() below.
> > > > > > > >
> > > > > > >
> > > > > > > It's just checked because virtqueue_pop prints an error on that case,
> > > > > > > and there is no way to tell the difference between a regular error and
> > > > > > > another caused by other causes. Maybe the right thing to do is just to
> > > > > > > not to print that error? Caller should do the error printing in that
> > > > > > > case. Should we return an error code?
> > > > > >
> > > > > > The reason an error is printed today is because it's a guest error that
> > > > > > never happens with correct guest drivers. Something is broken and the
> > > > > > user should know about it.
> > > > > >
> > > > > > Why is "virtio_queue_full" (I already forgot what that actually means,
> > > > > > it's not clear whether this is referring to avail elements or used
> > > > > > elements) a condition that should be silently ignored in shadow vqs?
> > > > > >
> > > > >
> > > > > TL;DR: It can be changed to a check of the number of available
> > > > > descriptors in shadow vq, instead of returning as a regular operation.
> > > > > However, I think that making it a special return of virtqueue_pop
> > > > > could help in devices that run to completion, avoiding having to
> > > > > duplicate the count logic in them.
> > > > >
> > > > > The function virtio_queue_empty checks if the vq has all descriptors
> > > > > available, so the device has no more work to do until the driver makes
> > > > > another descriptor available. I can see how it can be a bad name
> > > > > choice, but virtio_queue_full means the opposite: device has pop()
> > > > > every descriptor available, and it has not returned any, so the driver
> > > > > cannot progress until the device marks some descriptors as used.
> > > > >
> > > > > As I understand, if vq->in_use >vq->num would mean we have a bug in
> > > > > the device vq code, not in the driver. virtio_queue_full could even be
> > > > > changed to "assert(vq->inuse <= vq->vring.num); return vq->inuse ==
> > > > > vq->vring.num", as long as vq->in_use is operated right.
> > > > >
> > > > > If we hit vq->in_use == vq->num in virtqueue_pop it means the device
> > > > > tried to pop() one more buffer after having all of them available and
> > > > > pop'ed. This would be invalid if the device is counting right the
> > > > > number of in_use descriptors, but then we are duplicating that logic
> > > > > in the device and the vq.
> > >
> > > Devices call virtqueue_pop() until it returns NULL. They don't need to
> > > count virtqueue buffers explicitly. It returns NULL when vq->num
> > > virtqueue buffers have already been popped (either because
> > > virtio_queue_empty() is true or because an invalid driver state is
> > > detected by checking vq->num in virtqueue_pop()).
> >
> > If I understood you right, the virtio_queue_full addresses the reverse
> > problem: it controls when the virtqueue is out of buffers to make
> > available for the device because the latter has not consumed any, not
> > when the driver does not offer more buffers to the device because it
> > has no more data to offer.
> >
> > I find it easier to explain with the virtio-net rx queue (and I think
> > it's the easier way to trigger this issue). You are describing it's
> > regular behavior: The guest fills it (let's say 100%), and the device
> > picks buffers one by one:
> >
> > virtio_net_receive_rcu:
> > while (offset < size) {
> >     elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
>
> The lines before this loop return early when the virtqueue does not have
> sufficient buffer space:
>
>   if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
>       return 0;
>   }
>
> When entering this loop we know that we can pop the buffers needed to
> fill one rx packet.
>
> >     if (!elem) {
> >         virtio_error("unexpected empty queue");
> >     }
> >     /* [1] */
> >     /* fill elem with rx packet */
> >     virtqueue_fill(virtqueue, elem);
> >     ...
> >     virtqueue_flush(q->rx_vq, i);
> > }
> >
> > Every device as far as I know does this buffer by buffer, there is
> > just processing code in [1], and it never tries to pop more than one
> > buffers/chain of buffers at the same time. In the case of a queue
> > empty (no more available buffers), we hit an error, because there are
> > no more buffers to write.
>
> It's an error because we already checked that the virtqueue has buffer
> space. This should never happen.
>
> > In other devices (or tx queue), empty
> > buffers means there is no more work to do, not an error.
> >
> > In the case of shadow virtqueue, we cannot limit the number of exposed
> > rx buffers to 1 buffer/chain of buffers in [1], since it will affect
> > batching. We have the opposite problem: All devices (but rx queue)
> > want to queue "as empty as possible", or "to mark all available
> > buffers empty". Net rx queue is ok as long as it has a buffer/buffer
> > chain big enough to write to, but it will fetch them on demand, so
> > "queue full" (as in all buffers are available) is not a problem for
> > the device.
> >
> > However, the part of the shadow virtqueue that forwards the available
> > buffer seeks the opposite: It wants as many buffers as possible to be
> > available. That means that there is no [1] code that fills/read &
> > flush/detach the buffer immediately: Shadow virtqueue wants to make
> > available as many buffers as possible, but the device may not use them
> > until it has more data available. To the extreme (virtio-net rx queue
> > full), shadow virtqueue may make available all buffers, so in a
> > while(true) loop, it will try to make them available until it hits
> > that all the buffers are already available (vq->in_use == vq->num).
> >
> > The solution is to check the number of buffers already available
> > before calling virtio_queue_pop(). We could duplicate in_use in shadow
> > virtqueue, of course, but everything we need is already done in
> > VirtQueue code, so I think to reuse it is a better solution. Another
> > solution could be to treat vq->in_use == vq->num as an special return
> > code with no printed error in virtqueue_pop, but to expose if the
> > queue is full (as vq->in_use == vq->num) sounds less invasive to me.
> >
> > >
> > > > > In shadow vq this situation happens with the correct guest network
> > > > > driver, since the rx queue is filled for the device to write. Network
> > > > > device in qemu fetch descriptors on demand, but shadow vq fetch all
> > > > > available in batching. If the driver just happens to fill the queue of
> > > > > available descriptors, the log will raise, so we need to check in
> > > > > handle_sw_lm_vq before calling pop(). Of course the shadow vq can
> > > > > duplicate guest_vq->in_use instead of checking virtio_queue_full, but
> > > > > then it needs to check two things for every virtqueue_pop() [1].
> > >
> > > I don't understand this scenario. It sounds like you are saying the
> > > guest and shadow rx vq are not in sync so there is a case where
> > > vq->in_use > vq->num is triggered?
> >
> > Sorry if I explain it bad, what I meant is that there is a case where
> > SVQ (as device code) will call virtqueue_pop() when vq->in_use ==
> > vq->num. virtio_queue_full maintains the check as >=, I think it
> > should be safe to even to code virtio_queue_full to:
> >
> > assert(vq->in_use > vq->num);
> > return vq->inuse == vq->num;
> >
> > Please let me know if this is not clear enough.
>
> I don't get it. When virtqueue_split_pop() has popped all requests
> virtio_queue_empty_rcu() should return true and we shouldn't reach if
> (vq->inuse >= vq->vring.num). The guest driver cannot submit more
> available buffers at this point.
>

Hi Stefan.

After the meeting, and after reviewing the code carefully, I think you
are right. I'm not sure what I did to reproduce the issue, but I'm not
able to do it anymore, even under the conditions where I thought it
was trivially reproducible. Now I think it was caused in the previous
series by accessing the guest's vring directly.

So I will delete this commit from the series. I still need to test SVQ
with the new additions, so if the bug persists it will reproduce for
sure.

Thank you very much!

> I only checked split rings, not packed rings.
>
> Can you point to the SVQ code which has this problem? It may be easier
> to re-read the code than try to describe it in an email.
>
> Stefan
Stefan Hajnoczi March 24, 2021, 7:56 p.m. UTC | #9
On Wed, Mar 24, 2021 at 08:04:07PM +0100, Eugenio Perez Martin wrote:
> On Mon, Mar 22, 2021 at 6:40 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >
> > On Mon, Mar 22, 2021 at 04:55:13PM +0100, Eugenio Perez Martin wrote:
> > > On Mon, Mar 22, 2021 at 11:51 AM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > >
> > > > On Thu, Mar 11, 2021 at 07:53:53PM +0100, Eugenio Perez Martin wrote:
> > > > > On Fri, Jan 22, 2021 at 7:18 PM Eugenio Perez Martin
> > > > > <eperezma@redhat.com> wrote:
> > > > > >
> > > > > > On Thu, Dec 10, 2020 at 12:55 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > > > > >
> > > > > > > On Wed, Dec 09, 2020 at 07:41:23PM +0100, Eugenio Perez Martin wrote:
> > > > > > > > On Tue, Dec 8, 2020 at 9:16 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > > > > > > > > On Fri, Nov 20, 2020 at 07:50:51PM +0100, Eugenio Pérez wrote:
> > > > > > > > > > +        while (true) {
> > > > > > > > > > +            int r;
> > > > > > > > > > +            if (virtio_queue_full(vq)) {
> > > > > > > > > > +                break;
> > > > > > > > > > +            }
> > > > > > > > >
> > > > > > > > > Why is this check necessary? The guest cannot provide more descriptors
> > > > > > > > > than there is ring space. If that happens somehow then it's a driver
> > > > > > > > > error that is already reported in virtqueue_pop() below.
> > > > > > > > >
> > > > > > > >
> > > > > > > > It's just checked because virtqueue_pop prints an error on that case,
> > > > > > > > and there is no way to tell the difference between a regular error and
> > > > > > > > another caused by other causes. Maybe the right thing to do is just to
> > > > > > > > not to print that error? Caller should do the error printing in that
> > > > > > > > case. Should we return an error code?
> > > > > > >
> > > > > > > The reason an error is printed today is because it's a guest error that
> > > > > > > never happens with correct guest drivers. Something is broken and the
> > > > > > > user should know about it.
> > > > > > >
> > > > > > > Why is "virtio_queue_full" (I already forgot what that actually means,
> > > > > > > it's not clear whether this is referring to avail elements or used
> > > > > > > elements) a condition that should be silently ignored in shadow vqs?
> > > > > > >
> > > > > >
> > > > > > TL;DR: It can be changed to a check of the number of available
> > > > > > descriptors in shadow vq, instead of returning as a regular operation.
> > > > > > However, I think that making it a special return of virtqueue_pop
> > > > > > could help in devices that run to completion, avoiding having to
> > > > > > duplicate the count logic in them.
> > > > > >
> > > > > > The function virtio_queue_empty checks if the vq has all descriptors
> > > > > > available, so the device has no more work to do until the driver makes
> > > > > > another descriptor available. I can see how it can be a bad name
> > > > > > choice, but virtio_queue_full means the opposite: device has pop()
> > > > > > every descriptor available, and it has not returned any, so the driver
> > > > > > cannot progress until the device marks some descriptors as used.
> > > > > >
> > > > > > As I understand, if vq->in_use >vq->num would mean we have a bug in
> > > > > > the device vq code, not in the driver. virtio_queue_full could even be
> > > > > > changed to "assert(vq->inuse <= vq->vring.num); return vq->inuse ==
> > > > > > vq->vring.num", as long as vq->in_use is operated right.
> > > > > >
> > > > > > If we hit vq->in_use == vq->num in virtqueue_pop it means the device
> > > > > > tried to pop() one more buffer after having all of them available and
> > > > > > pop'ed. This would be invalid if the device is counting right the
> > > > > > number of in_use descriptors, but then we are duplicating that logic
> > > > > > in the device and the vq.
> > > >
> > > > Devices call virtqueue_pop() until it returns NULL. They don't need to
> > > > count virtqueue buffers explicitly. It returns NULL when vq->num
> > > > virtqueue buffers have already been popped (either because
> > > > virtio_queue_empty() is true or because an invalid driver state is
> > > > detected by checking vq->num in virtqueue_pop()).
> > >
> > > If I understood you right, the virtio_queue_full addresses the reverse
> > > problem: it controls when the virtqueue is out of buffers to make
> > > available for the device because the latter has not consumed any, not
> > > when the driver does not offer more buffers to the device because it
> > > has no more data to offer.
> > >
> > > I find it easier to explain with the virtio-net rx queue (and I think
> > > it's the easier way to trigger this issue). You are describing it's
> > > regular behavior: The guest fills it (let's say 100%), and the device
> > > picks buffers one by one:
> > >
> > > virtio_net_receive_rcu:
> > > while (offset < size) {
> > >     elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
> >
> > The lines before this loop return early when the virtqueue does not have
> > sufficient buffer space:
> >
> >   if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
> >       return 0;
> >   }
> >
> > When entering this loop we know that we can pop the buffers needed to
> > fill one rx packet.
> >
> > >     if (!elem) {
> > >         virtio_error("unexpected empty queue");
> > >     }
> > >     /* [1] */
> > >     /* fill elem with rx packet */
> > >     virtqueue_fill(virtqueue, elem);
> > >     ...
> > >     virtqueue_flush(q->rx_vq, i);
> > > }
> > >
> > > Every device as far as I know does this buffer by buffer, there is
> > > just processing code in [1], and it never tries to pop more than one
> > > buffers/chain of buffers at the same time. In the case of a queue
> > > empty (no more available buffers), we hit an error, because there are
> > > no more buffers to write.
> >
> > It's an error because we already checked that the virtqueue has buffer
> > space. This should never happen.
> >
> > > In other devices (or tx queue), empty
> > > buffers means there is no more work to do, not an error.
> > >
> > > In the case of shadow virtqueue, we cannot limit the number of exposed
> > > rx buffers to 1 buffer/chain of buffers in [1], since it will affect
> > > batching. We have the opposite problem: All devices (but rx queue)
> > > want to queue "as empty as possible", or "to mark all available
> > > buffers empty". Net rx queue is ok as long as it has a buffer/buffer
> > > chain big enough to write to, but it will fetch them on demand, so
> > > "queue full" (as in all buffers are available) is not a problem for
> > > the device.
> > >
> > > However, the part of the shadow virtqueue that forwards the available
> > > buffer seeks the opposite: It wants as many buffers as possible to be
> > > available. That means that there is no [1] code that fills/read &
> > > flush/detach the buffer immediately: Shadow virtqueue wants to make
> > > available as many buffers as possible, but the device may not use them
> > > until it has more data available. To the extreme (virtio-net rx queue
> > > full), shadow virtqueue may make available all buffers, so in a
> > > while(true) loop, it will try to make them available until it hits
> > > that all the buffers are already available (vq->in_use == vq->num).
> > >
> > > The solution is to check the number of buffers already available
> > > before calling virtio_queue_pop(). We could duplicate in_use in shadow
> > > virtqueue, of course, but everything we need is already done in
> > > VirtQueue code, so I think to reuse it is a better solution. Another
> > > solution could be to treat vq->in_use == vq->num as an special return
> > > code with no printed error in virtqueue_pop, but to expose if the
> > > queue is full (as vq->in_use == vq->num) sounds less invasive to me.
> > >
> > > >
> > > > > > In shadow vq this situation happens with the correct guest network
> > > > > > driver, since the rx queue is filled for the device to write. Network
> > > > > > device in qemu fetch descriptors on demand, but shadow vq fetch all
> > > > > > available in batching. If the driver just happens to fill the queue of
> > > > > > available descriptors, the log will raise, so we need to check in
> > > > > > handle_sw_lm_vq before calling pop(). Of course the shadow vq can
> > > > > > duplicate guest_vq->in_use instead of checking virtio_queue_full, but
> > > > > > then it needs to check two things for every virtqueue_pop() [1].
> > > >
> > > > I don't understand this scenario. It sounds like you are saying the
> > > > guest and shadow rx vq are not in sync so there is a case where
> > > > vq->in_use > vq->num is triggered?
> > >
> > > Sorry if I explain it bad, what I meant is that there is a case where
> > > SVQ (as device code) will call virtqueue_pop() when vq->in_use ==
> > > vq->num. virtio_queue_full maintains the check as >=, I think it
> > > should be safe to even to code virtio_queue_full to:
> > >
> > > assert(vq->in_use > vq->num);
> > > return vq->inuse == vq->num;
> > >
> > > Please let me know if this is not clear enough.
> >
> > I don't get it. When virtqueue_split_pop() has popped all requests
> > virtio_queue_empty_rcu() should return true and we shouldn't reach if
> > (vq->inuse >= vq->vring.num). The guest driver cannot submit more
> > available buffers at this point.
> >
> 
> Hi Stefan.
> 
> After the meeting, and reviewing the code carefully, I think you are
> right. I'm not sure what I did to reproduce the issue, but I'm not
> able to do it anymore, even in the conditions I thought where it was
> trivially reproducible. Now I think it was caused in the previous
> series because of accessing directly to guest's vring.
> 
> So I will delete this commit from the series. I still need to test SVQ
> with the new additions, so if the bug persists it will reproduce for
> sure.

Okay, thanks!

Stefan

Patch

diff --git a/hw/virtio/vhost-sw-lm-ring.h b/hw/virtio/vhost-sw-lm-ring.h
index 86dc081b93..29d21feaf4 100644
--- a/hw/virtio/vhost-sw-lm-ring.h
+++ b/hw/virtio/vhost-sw-lm-ring.h
@@ -18,6 +18,9 @@ 
 typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
 
 bool vhost_vring_kick(VhostShadowVirtqueue *vq);
+int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem);
+void vhost_vring_write_addr(const VhostShadowVirtqueue *vq,
+	                    struct vhost_vring_addr *addr);
 
 VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx);
 
diff --git a/hw/virtio/vhost-sw-lm-ring.c b/hw/virtio/vhost-sw-lm-ring.c
index cd7b5ba772..aed005c2d9 100644
--- a/hw/virtio/vhost-sw-lm-ring.c
+++ b/hw/virtio/vhost-sw-lm-ring.c
@@ -9,6 +9,7 @@ 
 
 #include "hw/virtio/vhost-sw-lm-ring.h"
 #include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio-access.h"
 
 #include "standard-headers/linux/vhost_types.h"
 #include "standard-headers/linux/virtio_ring.h"
@@ -19,21 +20,140 @@  typedef struct VhostShadowVirtqueue {
     struct vring vring;
     EventNotifier hdev_notifier;
     VirtQueue *vq;
+    VirtIODevice *vdev;
+
+    /* Map for returning guest's descriptors */
+    VirtQueueElement **ring_id_maps;
+
+    /* Next head to expose to device */
+    uint16_t avail_idx_shadow;
+
+    /* Number of descriptors added since last notification */
+    uint16_t num_added;
+
+    /* Next free descriptor */
+    uint16_t free_head;
 
     vring_desc_t descs[];
 } VhostShadowVirtqueue;
 
-static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq)
+static bool vhost_vring_should_kick_rcu(VhostShadowVirtqueue *vq)
 {
-    return virtio_queue_get_used_notify_split(vq->vq);
+    VirtIODevice *vdev = vq->vdev;
+    vq->num_added = 0;
+
+    smp_rmb();
+    return !(vq->vring.used->flags
+             & virtio_tswap16(vdev, VRING_USED_F_NO_NOTIFY));
 }
 
+static bool vhost_vring_should_kick(VhostShadowVirtqueue *vq)
+{
+    RCU_READ_LOCK_GUARD();
+    return vhost_vring_should_kick_rcu(vq);
+}
+
+
 bool vhost_vring_kick(VhostShadowVirtqueue *vq)
 {
     return vhost_vring_should_kick(vq) ? event_notifier_set(&vq->hdev_notifier)
                                        : true;
 }
 
+static void vhost_vring_write_descs(VhostShadowVirtqueue *vq,
+                                    const struct iovec *iovec,
+                                    size_t num, bool more_descs, bool write)
+{
+    uint16_t i = vq->free_head, last = vq->free_head;
+    unsigned n;
+    const VirtIODevice *vdev = vq->vdev;
+    uint16_t flags = write ? virtio_tswap16(vdev, VRING_DESC_F_WRITE) : 0;
+    vring_desc_t *descs = vq->vring.desc;
+
+    if (num == 0) {
+        return;
+    }
+
+    for (n = 0; n < num; n++) {
+        if (more_descs || (n + 1 < num)) {
+            descs[i].flags = flags | virtio_tswap16(vdev, VRING_DESC_F_NEXT);
+        } else {
+            descs[i].flags = flags;
+        }
+        descs[i].addr = virtio_tswap64(vdev, (hwaddr)iovec[n].iov_base);
+        descs[i].len = virtio_tswap32(vdev, iovec[n].iov_len);
+
+        last = i;
+        i = virtio_tswap16(vdev, descs[i].next);
+    }
+
+    vq->free_head = virtio_tswap16(vdev, descs[last].next);
+}
+
+/* virtqueue_add:
+ * @vq: The #VirtQueue
+ * @elem: The #VirtQueueElement
+ *
+ * Add an avail element to a virtqueue.
+ */
+static int vhost_vring_add_split(VhostShadowVirtqueue *vq,
+                                 const VirtQueueElement *elem)
+{
+    int head;
+    unsigned avail_idx;
+    const VirtIODevice *vdev;
+    vring_avail_t *avail;
+
+    RCU_READ_LOCK_GUARD();
+    vdev = vq->vdev;
+    avail = vq->vring.avail;
+
+    head = vq->free_head;
+
+    /* We need some descriptors here */
+    assert(elem->out_num || elem->in_num);
+
+    vhost_vring_write_descs(vq, elem->out_sg, elem->out_num,
+                   elem->in_num > 0, false);
+    vhost_vring_write_descs(vq, elem->in_sg, elem->in_num, false, true);
+
+    /* Put entry in available array (but don't update avail->idx until they
+     * do sync). */
+    avail_idx = vq->avail_idx_shadow & (vq->vring.num - 1);
+    avail->ring[avail_idx] = virtio_tswap16(vdev, head);
+    vq->avail_idx_shadow++;
+
+    /* Expose descriptors to device */
+    smp_wmb();
+    avail->idx = virtio_tswap16(vdev, vq->avail_idx_shadow);
+
+    /* theoretically possible. Kick just in case */
+    if (unlikely(vq->num_added++ == (uint16_t)-1)) {
+        vhost_vring_kick(vq);
+    }
+
+    return head;
+}
+
+int vhost_vring_add(VhostShadowVirtqueue *vq, VirtQueueElement *elem)
+{
+    int host_head = vhost_vring_add_split(vq, elem);
+    if (vq->ring_id_maps[host_head]) {
+        g_free(vq->ring_id_maps[host_head]);
+    }
+
+    vq->ring_id_maps[host_head] = elem;
+    return 0;
+}
+
+void vhost_vring_write_addr(const VhostShadowVirtqueue *vq,
+                            struct vhost_vring_addr *addr)
+{
+    addr->desc_user_addr = (uint64_t)vq->vring.desc;
+    addr->avail_user_addr = (uint64_t)vq->vring.avail;
+    addr->used_user_addr = (uint64_t)vq->vring.used;
+}
+
 VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx)
 {
     struct vhost_vring_file file = {
@@ -43,9 +163,11 @@  VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx)
     unsigned num = virtio_queue_get_num(dev->vdev, idx);
     size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE);
     VhostShadowVirtqueue *svq;
-    int r;
+    int r, i;
 
     svq = g_malloc0(sizeof(*svq) + ring_size);
+    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
+    svq->vdev = dev->vdev;
     svq->vq = vq;
 
     r = event_notifier_init(&svq->hdev_notifier, 0);
@@ -55,8 +177,9 @@  VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx)
     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
     assert(r == 0);
 
-    vhost_virtqueue_mask(dev, dev->vdev, idx, true);
-    vhost_virtqueue_pending(dev, idx);
+    vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE);
+    for (i = 0; i < num - 1; i++)
+        svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1);
 
     return svq;
 }
@@ -64,5 +187,6 @@  VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx)
 void vhost_sw_lm_shadow_vq_free(VhostShadowVirtqueue *vq)
 {
     event_notifier_cleanup(&vq->hdev_notifier);
+    g_free(vq->ring_id_maps);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 9352c56bfa..304e0baa61 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -956,8 +956,34 @@  static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq)
     uint16_t idx = virtio_get_queue_index(vq);
 
     VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx];
+    VirtQueueElement *elem;
 
-    vhost_vring_kick(svq);
+    /*
+     * Make available all buffers as possible.
+     */
+    do {
+        if (virtio_queue_get_notification(vq)) {
+            virtio_queue_set_notification(vq, false);
+        }
+
+        while (true) {
+            int r;
+            if (virtio_queue_full(vq)) {
+                break;
+            }
+
+            elem = virtqueue_pop(vq, sizeof(*elem));
+            if (!elem) {
+                break;
+            }
+
+            r = vhost_vring_add(svq, elem);
+            assert(r >= 0);
+            vhost_vring_kick(svq);
+        }
+
+        virtio_queue_set_notification(vq, true);
+    } while(!virtio_queue_empty(vq));
 }
 
 static void vhost_handle_call(EventNotifier *n)
@@ -975,6 +1001,11 @@  static void vhost_handle_call(EventNotifier *n)
     }
 }
 
+static void vhost_virtqueue_stop(struct vhost_dev *dev,
+                                 struct VirtIODevice *vdev,
+                                 struct vhost_virtqueue *vq,
+                                 unsigned idx);
+
 static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
 {
     int idx;
@@ -991,17 +1022,41 @@  static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
 
 static int vhost_sw_live_migration_start(struct vhost_dev *dev)
 {
-    int idx;
+    int idx, r;
+
+    assert(dev->vhost_ops->vhost_set_vring_enable);
+    dev->vhost_ops->vhost_set_vring_enable(dev, false);
 
     for (idx = 0; idx < dev->nvqs; ++idx) {
         struct vhost_virtqueue *vq = &dev->vqs[idx];
+        struct vhost_vring_addr addr = {
+            .index = idx,
+        };
+        struct vhost_vring_state s = {
+            .index = idx,
+        };
+
+        vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], idx);
 
         dev->sw_lm_shadow_vq[idx] = vhost_sw_lm_shadow_vq(dev, idx);
         event_notifier_set_handler(&vq->masked_notifier, vhost_handle_call);
+
+        vhost_vring_write_addr(dev->sw_lm_shadow_vq[idx], &addr);
+        r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
+        assert(r == 0);
+
+        r = dev->vhost_ops->vhost_set_vring_base(dev, &s);
+        assert(r == 0);
     }
 
+    dev->vhost_ops->vhost_set_vring_enable(dev, true);
     vhost_dev_disable_notifiers(dev, dev->vdev);
 
+    for (idx = 0; idx < dev->nvqs; ++idx) {
+        vhost_virtqueue_mask(dev, dev->vdev, idx, true);
+        vhost_virtqueue_pending(dev, idx);
+    }
+
     return 0;
 }