
[v2,01/13] vdpa net: move iova tree creation from init to start

Message ID 20230208094253.702672-2-eperezma@redhat.com (mailing list archive)
State New, archived
Series Dynamically switch to vhost shadow virtqueues at vdpa net migration

Commit Message

Eugenio Perez Martin Feb. 8, 2023, 9:42 a.m. UTC
Only create iova_tree if and when it is needed.

The cleanup remains the responsibility of the last VQ, but this change
allows the two cleanup functions to be merged.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 net/vhost-vdpa.c | 99 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 71 insertions(+), 28 deletions(-)
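
For orientation, the ownership model this patch moves to can be sketched in
standalone C. Everything below uses stand-in names (tree_t and client_t are
not QEMU types) and drops the shadow_vqs_enabled condition for brevity; the
real logic is in vhost_vdpa_net_data_start() and vhost_vdpa_net_client_stop()
in the diff quoted below.

    #include <stdlib.h>

    /* Stand-ins for VhostIOVATree / VhostVDPAState (illustration only). */
    typedef struct tree { int dummy; } tree_t;
    typedef struct client {
        int index;          /* like v->index                 */
        int vq_index, nvqs; /* like dev->vq_index, dev->nvqs */
        int vq_index_end;   /* like dev->vq_index_end        */
        tree_t *iova_tree;
    } client_t;

    /* start: the first queue pair allocates the tree, later clients borrow
     * it (cf. vhost_vdpa_net_data_start_first, vhost_vdpa_net_data_start). */
    void client_start(client_t *c, client_t *first)
    {
        if (c->index == 0) {
            c->iova_tree = calloc(1, sizeof(tree_t));
        } else {
            c->iova_tree = first->iova_tree; /* borrowed, not owned */
        }
    }

    /* stop: only the client holding the device's last VQ frees the tree
     * (cf. vhost_vdpa_net_client_stop). */
    void client_stop(client_t *c)
    {
        if (c->vq_index + c->nvqs == c->vq_index_end) {
            free(c->iova_tree);
        }
        c->iova_tree = NULL;
    }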

Comments

Si-Wei Liu Feb. 13, 2023, 6:50 a.m. UTC | #1
On 2/8/2023 1:42 AM, Eugenio Pérez wrote:
> Only create iova_tree if and when it is needed.
>
> The cleanup remains the responsibility of the last VQ, but this change
> allows the two cleanup functions to be merged.
>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> Acked-by: Jason Wang <jasowang@redhat.com>
> ---
>   net/vhost-vdpa.c | 99 ++++++++++++++++++++++++++++++++++--------------
>   1 file changed, 71 insertions(+), 28 deletions(-)
>
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index de5ed8ff22..a9e6c8f28e 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -178,13 +178,9 @@ err_init:
>   static void vhost_vdpa_cleanup(NetClientState *nc)
>   {
>       VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> -    struct vhost_dev *dev = &s->vhost_net->dev;
>   
>       qemu_vfree(s->cvq_cmd_out_buffer);
>       qemu_vfree(s->status);
> -    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
> -        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> -    }
>       if (s->vhost_net) {
>           vhost_net_cleanup(s->vhost_net);
>           g_free(s->vhost_net);
> @@ -234,10 +230,64 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
>       return size;
>   }
>   
> +/** From any vdpa net client, get the netclient of first queue pair */
> +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
> +{
> +    NICState *nic = qemu_get_nic(s->nc.peer);
> +    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
> +
> +    return DO_UPCAST(VhostVDPAState, nc, nc0);
> +}
> +
> +static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
> +{
> +    struct vhost_vdpa *v = &s->vhost_vdpa;
> +
> +    if (v->shadow_vqs_enabled) {
> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
> +                                           v->iova_range.last);
> +    }
> +}
> +
> +static int vhost_vdpa_net_data_start(NetClientState *nc)
> +{
> +    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> +    struct vhost_vdpa *v = &s->vhost_vdpa;
> +
> +    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
> +
> +    if (v->index == 0) {
> +        vhost_vdpa_net_data_start_first(s);
> +        return 0;
> +    }
> +
> +    if (v->shadow_vqs_enabled) {
> +        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
> +    }
> +
> +    return 0;
> +}
> +
> +static void vhost_vdpa_net_client_stop(NetClientState *nc)
> +{
> +    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> +    struct vhost_dev *dev;
> +
> +    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
> +
> +    dev = s->vhost_vdpa.dev;
> +    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
> +        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> +    }
> +}
> +
>   static NetClientInfo net_vhost_vdpa_info = {
>           .type = NET_CLIENT_DRIVER_VHOST_VDPA,
>           .size = sizeof(VhostVDPAState),
>           .receive = vhost_vdpa_receive,
> +        .start = vhost_vdpa_net_data_start,
> +        .stop = vhost_vdpa_net_client_stop,
>           .cleanup = vhost_vdpa_cleanup,
>           .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
>           .has_ufo = vhost_vdpa_has_ufo,
> @@ -351,7 +401,7 @@ dma_map_err:
>   
>   static int vhost_vdpa_net_cvq_start(NetClientState *nc)
>   {
> -    VhostVDPAState *s;
> +    VhostVDPAState *s, *s0;
>       struct vhost_vdpa *v;
>       uint64_t backend_features;
>       int64_t cvq_group;
> @@ -425,6 +475,15 @@ out:
>           return 0;
>       }
>   
> +    s0 = vhost_vdpa_net_first_nc_vdpa(s);
> +    if (s0->vhost_vdpa.iova_tree) {
> +        /* SVQ is already configured for all virtqueues */
> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
> +    } else {
> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
> +                                           v->iova_range.last);
I wonder how this case could happen: vhost_vdpa_net_data_start_first()
should've allocated an iova tree on the first data vq. Are zero data vqs
ever possible on net vhost-vdpa?

Thanks,
-Siwei
> +    }
> +
>       r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
>                                  vhost_vdpa_net_cvq_cmd_page_len(), false);
>       if (unlikely(r < 0)) {
> @@ -449,15 +508,9 @@ static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
>       if (s->vhost_vdpa.shadow_vqs_enabled) {
>           vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
>           vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
> -        if (!s->always_svq) {
> -            /*
> -             * If only the CVQ is shadowed we can delete this safely.
> -             * If all the VQs are shadows this will be needed by the time the
> -             * device is started again to register SVQ vrings and similar.
> -             */
> -            g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> -        }
>       }
> +
> +    vhost_vdpa_net_client_stop(nc);
>   }
>   
>   static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
> @@ -667,8 +720,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
>                                          int nvqs,
>                                          bool is_datapath,
>                                          bool svq,
> -                                       struct vhost_vdpa_iova_range iova_range,
> -                                       VhostIOVATree *iova_tree)
> +                                       struct vhost_vdpa_iova_range iova_range)
>   {
>       NetClientState *nc = NULL;
>       VhostVDPAState *s;
> @@ -690,7 +742,6 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
>       s->vhost_vdpa.shadow_vqs_enabled = svq;
>       s->vhost_vdpa.iova_range = iova_range;
>       s->vhost_vdpa.shadow_data = svq;
> -    s->vhost_vdpa.iova_tree = iova_tree;
>       if (!is_datapath) {
>           s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
>                                               vhost_vdpa_net_cvq_cmd_page_len());
> @@ -760,7 +811,6 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
>       uint64_t features;
>       int vdpa_device_fd;
>       g_autofree NetClientState **ncs = NULL;
> -    g_autoptr(VhostIOVATree) iova_tree = NULL;
>       struct vhost_vdpa_iova_range iova_range;
>       NetClientState *nc;
>       int queue_pairs, r, i = 0, has_cvq = 0;
> @@ -812,12 +862,8 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
>           goto err;
>       }
>   
> -    if (opts->x_svq) {
> -        if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
> -            goto err_svq;
> -        }
> -
> -        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
> +    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
> +        goto err;
>       }
>   
>       ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
> @@ -825,7 +871,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
>       for (i = 0; i < queue_pairs; i++) {
>           ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
>                                        vdpa_device_fd, i, 2, true, opts->x_svq,
> -                                     iova_range, iova_tree);
> +                                     iova_range);
>           if (!ncs[i])
>               goto err;
>       }
> @@ -833,13 +879,11 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
>       if (has_cvq) {
>           nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
>                                    vdpa_device_fd, i, 1, false,
> -                                 opts->x_svq, iova_range, iova_tree);
> +                                 opts->x_svq, iova_range);
>           if (!nc)
>               goto err;
>       }
>   
> -    /* iova_tree ownership belongs to last NetClientState */
> -    g_steal_pointer(&iova_tree);
>       return 0;
>   
>   err:
> @@ -849,7 +893,6 @@ err:
>           }
>       }
>   
> -err_svq:
>       qemu_close(vdpa_device_fd);
>   
>       return -1;
Eugenio Perez Martin Feb. 13, 2023, 11:14 a.m. UTC | #2
On Mon, Feb 13, 2023 at 7:51 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 2/8/2023 1:42 AM, Eugenio Pérez wrote:
> > [...]
> > @@ -425,6 +475,15 @@ out:
> >           return 0;
> >       }
> >
> > +    s0 = vhost_vdpa_net_first_nc_vdpa(s);
> > +    if (s0->vhost_vdpa.iova_tree) {
> > +        /* SVQ is already configured for all virtqueues */
> > +        v->iova_tree = s0->vhost_vdpa.iova_tree;
> > +    } else {
> > +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
> > +                                           v->iova_range.last);
> I wonder how this case could happen: vhost_vdpa_net_data_start_first()
> should've allocated an iova tree on the first data vq. Are zero data vqs
> ever possible on net vhost-vdpa?
>

It's the case of the current qemu master when only CVQ is being
shadowed. It's not that "there are no data vq": If that case were
possible, CVQ vhost-vdpa state would be s0.

The case is that since only CVQ vhost-vdpa is the one being migrated,
only CVQ has an iova tree.

With this series applied and with no migration running, the case is
the same as before: only the CVQ gets shadowed. When migration starts, all
vqs get shadowed and share the iova tree.
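
Roughly, as a toy model (made-up names, not QEMU code; it compiles and
runs standalone), who ends up providing the iova tree in each
configuration:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of the cases above: which client provides the iova tree. */
    const char *tree_owner(bool always_svq, bool migrating)
    {
        if (always_svq || migrating) {
            /* All vqs shadowed: data vq pair 0 creates it, CVQ reuses it. */
            return "first data queue pair";
        }
        /* Only CVQ shadowed (current master default): private tree. */
        return "CVQ itself";
    }

    int main(void)
    {
        printf("x-svq=off, idle:      %s\n", tree_owner(false, false));
        printf("x-svq=off, migrating: %s\n", tree_owner(false, true));
        printf("x-svq=on:             %s\n", tree_owner(true, false));
        return 0;
    }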

Thanks!

Si-Wei Liu Feb. 14, 2023, 1:45 a.m. UTC | #3
On 2/13/2023 3:14 AM, Eugenio Perez Martin wrote:
> On Mon, Feb 13, 2023 at 7:51 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 2/8/2023 1:42 AM, Eugenio Pérez wrote:
>>> [...]
>>> @@ -425,6 +475,15 @@ out:
>>>            return 0;
>>>        }
>>>
>>> +    s0 = vhost_vdpa_net_first_nc_vdpa(s);
>>> +    if (s0->vhost_vdpa.iova_tree) {
>>> +        /* SVQ is already configured for all virtqueues */
>>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
>>> +    } else {
>>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
>>> +                                           v->iova_range.last);
>> I wonder how this case could happen: vhost_vdpa_net_data_start_first()
>> should've allocated an iova tree on the first data vq. Are zero data vqs
>> ever possible on net vhost-vdpa?
>>
> It's the case of the current qemu master when only CVQ is being
> shadowed. It's not that "there are no data vq": If that case were
> possible, CVQ vhost-vdpa state would be s0.
>
> The case is that since only CVQ vhost-vdpa is the one being migrated,
> only CVQ has an iova tree.
OK, so this corresponds to the case where live migration is not started 
and CVQ starts in its own address space of VHOST_VDPA_NET_CVQ_ASID. 
Thanks for explaining it!

>
> With this series applied and with no migration running, the case is
> the same as before: only the CVQ gets shadowed. When migration starts, all
> vqs get shadowed and share the iova tree.
I wonder what the reason is to share the iova tree when migration
starts; I think CVQ may still stay on its own VHOST_VDPA_NET_CVQ_ASID?

Actually there's a discrepancy in vhost_vdpa_net_log_global_enable(): I
don't see explicit code to switch the CVQ from VHOST_VDPA_NET_CVQ_ASID to
VHOST_VDPA_GUEST_PA_ASID. This is the address space collision I
mentioned earlier:

9585@1676093788.259201:vhost_vdpa_dma_map vdpa:0x7ff13088a190 fd: 16 
msg_type: 2 asid: 0 iova: 0x1000 size: 0x2000 uaddr: 0x55a5a7ff3000 
perm: 0x1 type: 2
9585@1676093788.279923:vhost_vdpa_dma_map vdpa:0x7ff13088a190 fd: 16 
msg_type: 2 asid: 0 iova: 0x3000 size: 0x1000 uaddr: 0x55a5a7ff6000 
perm: 0x3 type: 2
9585@1676093788.290529:vhost_vdpa_set_vring_addr dev: 0x55a5a77cec20 
index: 0 flags: 0x0 desc_user_addr: 0x1000 used_user_addr: 0x3000 
avail_user_addr: 0x2000 log_guest_addr: 0x0
:
:
9585@1676093788.543567:vhost_vdpa_dma_map vdpa:0x7ff1302b6190 fd: 16 
msg_type: 2 asid: 0 iova: 0x16000 size: 0x2000 uaddr: 0x55a5a7959000 
perm: 0x1 type: 2
9585@1676093788.576923:vhost_vdpa_dma_map vdpa:0x7ff1302b6190 fd: 16 
msg_type: 2 asid: 0 iova: 0x18000 size: 0x1000 uaddr: 0x55a5a795c000 
perm: 0x3 type: 2
9585@1676093788.593881:vhost_vdpa_set_vring_addr dev: 0x55a5a7580930 
index: 7 flags: 0x0 desc_user_addr: 0x16000 used_user_addr: 0x18000 
avail_user_addr: 0x17000 log_guest_addr: 0x0
9585@1676093788.593904:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16 
msg_type: 2 asid: 1 iova: 0x19000 size: 0x1000 uaddr: 0x55a5a77f8000 
perm: 0x1 type: 2
9585@1676093788.606448:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16 
msg_type: 2 asid: 1 iova: 0x1a000 size: 0x1000 uaddr: 0x55a5a77fa000 
perm: 0x3 type: 2
9585@1676093788.616253:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16 
msg_type: 2 asid: 1 iova: 0x1b000 size: 0x1000 uaddr: 0x55a5a795f000 
perm: 0x1 type: 2
9585@1676093788.625956:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16 
msg_type: 2 asid: 1 iova: 0x1c000 size: 0x1000 uaddr: 0x55a5a7f4e000 
perm: 0x3 type: 2
9585@1676093788.635655:vhost_vdpa_set_vring_addr dev: 0x55a5a7580ec0 
index: 8 flags: 0x0 desc_user_addr: 0x1b000 used_user_addr: 0x1c000 
avail_user_addr: 0x1b400 log_guest_addr: 0x0
9585@1676093788.635667:vhost_vdpa_listener_region_add vdpa: 
0x7ff13026d190 iova 0x0 llend 0xa0000 vaddr: 0x7fef1fe00000 read-only: 0
9585@1676093788.635670:vhost_vdpa_listener_begin_batch 
vdpa:0x7ff13026d190 fd: 16 msg_type: 2 type: 5
9585@1676093788.635677:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16 
msg_type: 2 asid: 0 iova: 0x0 size: 0xa0000 uaddr: 0x7fef1fe00000 perm: 
0x3 type: 2
2023-02-11T05:36:28.635686Z qemu-system-x86_64: failed to write, fd=16, 
errno=14 (Bad address)
2023-02-11T05:36:28.635721Z qemu-system-x86_64: vhost vdpa map fail!
2023-02-11T05:36:28.635744Z qemu-system-x86_64: vhost-vdpa: DMA mapping 
failed, unable to continue


Regards,
-Siwei
Eugenio Perez Martin Feb. 14, 2023, 7:07 p.m. UTC | #4
On Tue, Feb 14, 2023 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 2/13/2023 3:14 AM, Eugenio Perez Martin wrote:
> > On Mon, Feb 13, 2023 at 7:51 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 2/8/2023 1:42 AM, Eugenio Pérez wrote:
> >>> [...]
> >>> @@ -425,6 +475,15 @@ out:
> >>>            return 0;
> >>>        }
> >>>
> >>> +    s0 = vhost_vdpa_net_first_nc_vdpa(s);
> >>> +    if (s0->vhost_vdpa.iova_tree) {
> >>> +        /* SVQ is already configured for all virtqueues */
> >>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
> >>> +    } else {
> >>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
> >>> +                                           v->iova_range.last);
> >> I wonder how this case could happen: vhost_vdpa_net_data_start_first()
> >> should've allocated an iova tree on the first data vq. Are zero data vqs
> >> ever possible on net vhost-vdpa?
> >>
> > It's the case of the current qemu master when only CVQ is being
> > shadowed. It's not that "there are no data vq": If that case were
> > possible, CVQ vhost-vdpa state would be s0.
> >
> > The case is that since only CVQ vhost-vdpa is the one being migrated,
> > only CVQ has an iova tree.
> OK, so this corresponds to the case where live migration is not started
> and CVQ starts in its own address space of VHOST_VDPA_NET_CVQ_ASID.
> Thanks for explaining it!
>
> >
> > With this series applied and with no migration running, the case is
> > the same as before: only the CVQ gets shadowed. When migration starts, all
> > vqs get shadowed and share the iova tree.
> I wonder what the reason is to share the iova tree when migration
> starts; I think CVQ may still stay on its own VHOST_VDPA_NET_CVQ_ASID?
>
> Actually there's a discrepancy in vhost_vdpa_net_log_global_enable(): I
> don't see explicit code to switch the CVQ from VHOST_VDPA_NET_CVQ_ASID to
> VHOST_VDPA_GUEST_PA_ASID. This is the address space collision I
> mentioned earlier:
>

There is no such change. This code only migrates devices with no CVQ,
as they have their own difficulties.

In the previous RFC there was no such change either. Since it's hard
to modify a passthrough device's IOVA tree, CVQ AS updates keep using
VHOST_VDPA_NET_CVQ_ASID.

They both share the same IOVA tree though, just for simplicity. If
address space exhaustion is a problem we can make them independent,
but this complicates the code a little bit.
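
As a standalone sketch of what the sharing means (a toy bump allocator
stands in for the real VhostIOVATree; the asid values match
VHOST_VDPA_GUEST_PA_ASID = 0 and VHOST_VDPA_NET_CVQ_ASID = 1): both
address spaces draw iova from the same range, so their mappings can
never overlap:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { GUEST_PA_ASID = 0, CVQ_ASID = 1 };

    uint64_t next_iova = 0x1000; /* toy bump allocator = the shared tree */

    uint64_t map(unsigned asid, uint64_t size)
    {
        uint64_t iova = next_iova;
        next_iova += size;
        printf("dma_map asid: %u iova: 0x%" PRIx64 " size: 0x%" PRIx64 "\n",
               asid, iova, size);
        return iova;
    }

    int main(void)
    {
        map(GUEST_PA_ASID, 0x2000); /* data vq shadow vring */
        map(CVQ_ASID, 0x1000);      /* CVQ command buffer   */
        /* Different ASIDs, one allocator: the iova ranges cannot collide. */
        return 0;
    }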

> [...]
> 2023-02-11T05:36:28.635686Z qemu-system-x86_64: failed to write, fd=16,
> errno=14 (Bad address)
> 2023-02-11T05:36:28.635721Z qemu-system-x86_64: vhost vdpa map fail!
> 2023-02-11T05:36:28.635744Z qemu-system-x86_64: vhost-vdpa: DMA mapping
> failed, unable to continue
>

I'm not sure how you get to this. Maybe you were able to start the
migration because the CVQ migration blocker was not effectively added?

Thanks!


Si-Wei Liu Feb. 16, 2023, 2:14 a.m. UTC | #5
On 2/14/2023 11:07 AM, Eugenio Perez Martin wrote:
> On Tue, Feb 14, 2023 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 2/13/2023 3:14 AM, Eugenio Perez Martin wrote:
>>> On Mon, Feb 13, 2023 at 7:51 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>
>>>> On 2/8/2023 1:42 AM, Eugenio Pérez wrote:
>>>>> [...]
>>>>> @@ -425,6 +475,15 @@ out:
>>>>>             return 0;
>>>>>         }
>>>>>
>>>>> +    s0 = vhost_vdpa_net_first_nc_vdpa(s);
>>>>> +    if (s0->vhost_vdpa.iova_tree) {
>>>>> +        /* SVQ is already configured for all virtqueues */
>>>>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
>>>>> +    } else {
>>>>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
>>>>> +                                           v->iova_range.last);
>>>> I wonder how this case could happen: vhost_vdpa_net_data_start_first()
>>>> should've allocated an iova tree on the first data vq. Are zero data vqs
>>>> ever possible on net vhost-vdpa?
>>>>
>>> It's the case of the current qemu master when only CVQ is being
>>> shadowed. It's not that "there are no data vq": If that case were
>>> possible, CVQ vhost-vdpa state would be s0.
>>>
>>> The case is that since only CVQ vhost-vdpa is the one being migrated,
>>> only CVQ has an iova tree.
>> OK, so this corresponds to the case where live migration is not started
>> and CVQ starts in its own address space of VHOST_VDPA_NET_CVQ_ASID.
>> Thanks for explaining it!
>>
>>> With this series applied and with no migration running, the case is
>>> the same as before: only the CVQ gets shadowed. When migration starts, all
>>> vqs get shadowed and share the iova tree.
>> I wonder what the reason is to share the iova tree when migration
>> starts; I think CVQ may still stay on its own VHOST_VDPA_NET_CVQ_ASID?
>>
>> Actually there's a discrepancy in vhost_vdpa_net_log_global_enable(): I
>> don't see explicit code to switch the CVQ from VHOST_VDPA_NET_CVQ_ASID to
>> VHOST_VDPA_GUEST_PA_ASID. This is the address space collision I
>> mentioned earlier:
>>
> There is no such change. This code only migrates devices with no CVQ,
> as they have their own difficulties.
>
> In the previous RFC there was no such change either. Since it's hard
> to modify a passthrough device's IOVA tree, CVQ AS updates keep using
> VHOST_VDPA_NET_CVQ_ASID.
That's my understanding too: the current code doesn't support changing
the AS once it is set, although the uAPI doesn't prohibit it.
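
For reference, the binding itself is a single ioctl; below is a sketch
mirroring vhost_vdpa_set_address_space_id() in net/vhost-vdpa.c. As far
as I can tell nothing in the uAPI forbids issuing it again later; QEMU
just never does.

    #include <sys/ioctl.h>
    #include <linux/vhost.h>

    /* Bind virtqueue group vq_group to address space asid_num
     * (sketch after vhost_vdpa_set_address_space_id()). */
    int set_group_asid(int device_fd, unsigned vq_group, unsigned asid_num)
    {
        struct vhost_vring_state asid = {
            .index = vq_group,
            .num = asid_num,
        };

        return ioctl(device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
    }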

> They both share the same IOVA tree though, just for simplicity.
It would be good to document this assumption somewhere in the code; it's
not easy to infer that userspace doesn't have the same view as the kernel
of which iova tree is being used.

>   If
> address space exhaustion is a problem we can make them independent,
> but this complicates the code a little bit.
>
>> [...]
>> 2023-02-11T05:36:28.635686Z qemu-system-x86_64: failed to write, fd=16,
>> errno=14 (Bad address)
>> 2023-02-11T05:36:28.635721Z qemu-system-x86_64: vhost vdpa map fail!
>> 2023-02-11T05:36:28.635744Z qemu-system-x86_64: vhost-vdpa: DMA mapping
>> failed, unable to continue
>>
> I'm not sure how you get to this. Maybe you were able to start the
> migration because the CVQ migration blocker was not effectively added?
It's something else: the line below, at the start of
vhost_vdpa_net_cvq_start(), would override shadow_data on the CVQ.

     v->shadow_data = s->always_svq;

Which leads to my previous question: why does shadow_data need to apply to
the CVQ, and why is the userspace iova shared between the data queues and
the CVQ?
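
To make the override concrete, a toy reduction of the sequence (plain C,
not QEMU code):

    #include <assert.h>
    #include <stdbool.h>

    struct state { bool always_svq; bool shadow_data; };

    /* Migration starts: everything should be shadowed from now on. */
    void log_global_enable(struct state *s) { s->shadow_data = true; }

    /* CVQ start: shadow_data is recomputed from always_svq alone,
     * discarding the migration-time value. */
    void cvq_start(struct state *s) { s->shadow_data = s->always_svq; }

    int main(void)
    {
        struct state s = { .always_svq = false, .shadow_data = false };
        log_global_enable(&s);
        cvq_start(&s);
        assert(!s.shadow_data); /* lost: CVQ acts as passthrough again */
        return 0;
    }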

-Siwei


Eugenio Perez Martin Feb. 16, 2023, 7:35 a.m. UTC | #6
On Thu, Feb 16, 2023 at 3:15 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 2/14/2023 11:07 AM, Eugenio Perez Martin wrote:
> > On Tue, Feb 14, 2023 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 2/13/2023 3:14 AM, Eugenio Perez Martin wrote:
> >>> On Mon, Feb 13, 2023 at 7:51 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>
> >>>> On 2/8/2023 1:42 AM, Eugenio Pérez wrote:
> >>>>> Only create iova_tree if and when it is needed.
> >>>>>
> >>>>> The cleanup keeps being responsible of last VQ but this change allows it
> >>>>> to merge both cleanup functions.
> >>>>>
> >>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> >>>>> Acked-by: Jason Wang <jasowang@redhat.com>
> >>>>> ---
> >>>>>     net/vhost-vdpa.c | 99 ++++++++++++++++++++++++++++++++++--------------
> >>>>>     1 file changed, 71 insertions(+), 28 deletions(-)
> >>>>>
> >>>>> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> >>>>> index de5ed8ff22..a9e6c8f28e 100644
> >>>>> --- a/net/vhost-vdpa.c
> >>>>> +++ b/net/vhost-vdpa.c
> >>>>> @@ -178,13 +178,9 @@ err_init:
> >>>>>     static void vhost_vdpa_cleanup(NetClientState *nc)
> >>>>>     {
> >>>>>         VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> >>>>> -    struct vhost_dev *dev = &s->vhost_net->dev;
> >>>>>
> >>>>>         qemu_vfree(s->cvq_cmd_out_buffer);
> >>>>>         qemu_vfree(s->status);
> >>>>> -    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
> >>>>> -        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> >>>>> -    }
> >>>>>         if (s->vhost_net) {
> >>>>>             vhost_net_cleanup(s->vhost_net);
> >>>>>             g_free(s->vhost_net);
> >>>>> @@ -234,10 +230,64 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
> >>>>>         return size;
> >>>>>     }
> >>>>>
> >>>>> +/** From any vdpa net client, get the netclient of first queue pair */
> >>>>> +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
> >>>>> +{
> >>>>> +    NICState *nic = qemu_get_nic(s->nc.peer);
> >>>>> +    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
> >>>>> +
> >>>>> +    return DO_UPCAST(VhostVDPAState, nc, nc0);
> >>>>> +}
> >>>>> +
> >>>>> +static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
> >>>>> +{
> >>>>> +    struct vhost_vdpa *v = &s->vhost_vdpa;
> >>>>> +
> >>>>> +    if (v->shadow_vqs_enabled) {
> >>>>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
> >>>>> +                                           v->iova_range.last);
> >>>>> +    }
> >>>>> +}
> >>>>> +
> >>>>> +static int vhost_vdpa_net_data_start(NetClientState *nc)
> >>>>> +{
> >>>>> +    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> >>>>> +    struct vhost_vdpa *v = &s->vhost_vdpa;
> >>>>> +
> >>>>> +    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
> >>>>> +
> >>>>> +    if (v->index == 0) {
> >>>>> +        vhost_vdpa_net_data_start_first(s);
> >>>>> +        return 0;
> >>>>> +    }
> >>>>> +
> >>>>> +    if (v->shadow_vqs_enabled) {
> >>>>> +        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
> >>>>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
> >>>>> +    }
> >>>>> +
> >>>>> +    return 0;
> >>>>> +}
> >>>>> +
> >>>>> +static void vhost_vdpa_net_client_stop(NetClientState *nc)
> >>>>> +{
> >>>>> +    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> >>>>> +    struct vhost_dev *dev;
> >>>>> +
> >>>>> +    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
> >>>>> +
> >>>>> +    dev = s->vhost_vdpa.dev;
> >>>>> +    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
> >>>>> +        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> >>>>> +    }
> >>>>> +}
> >>>>> +
> >>>>>     static NetClientInfo net_vhost_vdpa_info = {
> >>>>>             .type = NET_CLIENT_DRIVER_VHOST_VDPA,
> >>>>>             .size = sizeof(VhostVDPAState),
> >>>>>             .receive = vhost_vdpa_receive,
> >>>>> +        .start = vhost_vdpa_net_data_start,
> >>>>> +        .stop = vhost_vdpa_net_client_stop,
> >>>>>             .cleanup = vhost_vdpa_cleanup,
> >>>>>             .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
> >>>>>             .has_ufo = vhost_vdpa_has_ufo,
> >>>>> @@ -351,7 +401,7 @@ dma_map_err:
> >>>>>
> >>>>>     static int vhost_vdpa_net_cvq_start(NetClientState *nc)
> >>>>>     {
> >>>>> -    VhostVDPAState *s;
> >>>>> +    VhostVDPAState *s, *s0;
> >>>>>         struct vhost_vdpa *v;
> >>>>>         uint64_t backend_features;
> >>>>>         int64_t cvq_group;
> >>>>> @@ -425,6 +475,15 @@ out:
> >>>>>             return 0;
> >>>>>         }
> >>>>>
> >>>>> +    s0 = vhost_vdpa_net_first_nc_vdpa(s);
> >>>>> +    if (s0->vhost_vdpa.iova_tree) {
> >>>>> +        /* SVQ is already configured for all virtqueues */
> >>>>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
> >>>>> +    } else {
> >>>>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
> >>>>> +                                           v->iova_range.last);
> >>>> I wonder how this case could happen, vhost_vdpa_net_data_start_first()
> >>>> should've allocated an iova tree on the first data vq. Is zero data vq
> >>>> ever possible on net vhost-vdpa?
> >>>>
> >>> It's the case of the current qemu master when only CVQ is being
> >>> shadowed. It's not that "there are no data vq": If that case were
> >>> possible, CVQ vhost-vdpa state would be s0.
> >>>
> >>> The case is that since only CVQ vhost-vdpa is the one being migrated,
> >>> only CVQ has an iova tree.
> >> OK, so this corresponds to the case where live migration is not started
> >> and CVQ starts in its own address space of VHOST_VDPA_NET_CVQ_ASID.
> >> Thanks for explaining it!
> >>
> >>> With this series applied and with no migration running, the case is
> >>> the same as before: only SVQ gets shadowed. When migration starts, all
> >>> vqs are migrated, and share iova tree.
> >> I wonder what is the reason to share the iova tree when migration
> >> starts, I think CVQ may stay on its own VHOST_VDPA_NET_CVQ_ASID still?
> >>
> >> Actually there's discrepancy in vhost_vdpa_net_log_global_enable(), I
> >> don't see explicit code to switch from VHOST_VDPA_NET_CVQ_ASID to
> >> VHOST_VDPA_GUEST_PA_ASID for the CVQ. This is the address space I
> >> collision I mentioned earlier:
> >>
> > There is no such change. This code only migrates devices with no CVQ,
> > as they have their own difficulties.
> >
> > In the previous RFC there was no such change either. Since it's hard
> > to modify passthrough devices IOVA tree, CVQ AS updates keep being
> > VHOST_VDPA_NET_CVQ_ASID.
> That's my understanding too, the current code doesn't support changing
> AS once it is set, although uAPI doesn't prohibit it.
>
> > They both share the same IOVA tree though, just for simplicity.
> It would be good to document this assumption somewhere in the code, it's
> not easy to infer userspace doesn't have the same view as that in the
> kernel in terms of the iova tree being used.
>
> >   If
> > address space exhaustion is a problem we can make them independent,
> > but this complicates the code a little bit.
> >
> >> 9585@1676093788.259201:vhost_vdpa_dma_map vdpa:0x7ff13088a190 fd: 16
> >> msg_type: 2 asid: 0 iova: 0x1000 size: 0x2000 uaddr: 0x55a5a7ff3000
> >> perm: 0x1 type: 2
> >> 9585@1676093788.279923:vhost_vdpa_dma_map vdpa:0x7ff13088a190 fd: 16
> >> msg_type: 2 asid: 0 iova: 0x3000 size: 0x1000 uaddr: 0x55a5a7ff6000
> >> perm: 0x3 type: 2
> >> 9585@1676093788.290529:vhost_vdpa_set_vring_addr dev: 0x55a5a77cec20
> >> index: 0 flags: 0x0 desc_user_addr: 0x1000 used_user_addr: 0x3000
> >> avail_user_addr: 0x2000 log_guest_addr: 0x0
> >> :
> >> :
> >> 9585@1676093788.543567:vhost_vdpa_dma_map vdpa:0x7ff1302b6190 fd: 16
> >> msg_type: 2 asid: 0 iova: 0x16000 size: 0x2000 uaddr: 0x55a5a7959000
> >> perm: 0x1 type: 2
> >> 9585@1676093788.576923:vhost_vdpa_dma_map vdpa:0x7ff1302b6190 fd: 16
> >> msg_type: 2 asid: 0 iova: 0x18000 size: 0x1000 uaddr: 0x55a5a795c000
> >> perm: 0x3 type: 2
> >> 9585@1676093788.593881:vhost_vdpa_set_vring_addr dev: 0x55a5a7580930
> >> index: 7 flags: 0x0 desc_user_addr: 0x16000 used_user_addr: 0x18000
> >> avail_user_addr: 0x17000 log_guest_addr: 0x0
> >> 9585@1676093788.593904:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >> msg_type: 2 asid: 1 iova: 0x19000 size: 0x1000 uaddr: 0x55a5a77f8000
> >> perm: 0x1 type: 2
> >> 9585@1676093788.606448:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >> msg_type: 2 asid: 1 iova: 0x1a000 size: 0x1000 uaddr: 0x55a5a77fa000
> >> perm: 0x3 type: 2
> >> 9585@1676093788.616253:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >> msg_type: 2 asid: 1 iova: 0x1b000 size: 0x1000 uaddr: 0x55a5a795f000
> >> perm: 0x1 type: 2
> >> 9585@1676093788.625956:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >> msg_type: 2 asid: 1 iova: 0x1c000 size: 0x1000 uaddr: 0x55a5a7f4e000
> >> perm: 0x3 type: 2
> >> 9585@1676093788.635655:vhost_vdpa_set_vring_addr dev: 0x55a5a7580ec0
> >> index: 8 flags: 0x0 desc_user_addr: 0x1b000 used_user_addr: 0x1c000
> >> avail_user_addr: 0x1b400 log_guest_addr: 0x0
> >> 9585@1676093788.635667:vhost_vdpa_listener_region_add vdpa:
> >> 0x7ff13026d190 iova 0x0 llend 0xa0000 vaddr: 0x7fef1fe00000 read-only: 0
> >> 9585@1676093788.635670:vhost_vdpa_listener_begin_batch
> >> vdpa:0x7ff13026d190 fd: 16 msg_type: 2 type: 5
> >> 9585@1676093788.635677:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >> msg_type: 2 asid: 0 iova: 0x0 size: 0xa0000 uaddr: 0x7fef1fe00000 perm:
> >> 0x3 type: 2
> >> 2023-02-11T05:36:28.635686Z qemu-system-x86_64: failed to write, fd=16,
> >> errno=14 (Bad address)
> >> 2023-02-11T05:36:28.635721Z qemu-system-x86_64: vhost vdpa map fail!
> >> 2023-02-11T05:36:28.635744Z qemu-system-x86_64: vhost-vdpa: DMA mapping
> >> failed, unable to continue
> >>
> > I'm not sure how you get to this. Maybe you were able to start the
> > migration because the CVQ migration blocker was not effectively added?
> It's something else, below line at the start of
> vhost_vdpa_net_cvq_start() would override the shadow_data on the CVQ.
>
>      v->shadow_data = s->always_svq;
>
> Which leads to my previous question why shadow_data needs to apply to
> the CVQ
>

Ok, I'm proposing some documentation here. I'll send a new patch
adding it to the sources if you think it is complete.

Shadow_data needs to apply to the CVQ because the memory listener is
registered against the CVQ's vhost_vdpa, and the listener needs to know
whether the data vqs are passthrough or shadowed. We could register the
memory listener against a different vhost_vdpa, but then its lifecycle
gets complicated.
---
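
To make that concrete, here is a minimal sketch of how the listener
map path consults shadow_data (my reading of master, not a literal
excerpt; treat the helper names and DMAMap fields as assumptions):

static int listener_map_sketch(struct vhost_vdpa *v, hwaddr gpa,
                               void *vaddr, hwaddr size, bool ro)
{
    hwaddr iova = gpa;                     /* passthrough: IOVA == GPA */

    if (v->shadow_data) {
        /* shadowed data vqs: allocate the IOVA from the shared tree */
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = size - 1,              /* DMAMap size is inclusive */
            .perm = IOMMU_ACCESS_FLAG(true, !ro),
        };

        if (vhost_iova_tree_map_alloc(v->iova_tree, &map) != IOVA_OK) {
            return -ENOMEM;
        }
        iova = map.iova;
    }

    return vhost_vdpa_dma_map(v, VHOST_VDPA_GUEST_PA_ASID, iova, size,
                              vaddr, ro);
}

So whoever owns the listener must see the same shadow_data the data
vqs use, which is why it has to apply to the vhost_vdpa the listener
is registered on.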

For completion, the original discussion was [1].

> and why the userspace iova is shared between data queues and CVQ.

It's not shared unless the device does not support ASID. They only
share the iova tree because the iova tree is not used to track the
memory itself, only the translations, so its lifecycle is simpler. Each
piece of memory's lifecycle is tracked differently (see the sketch
below):
* Guest memory is tracked by the memory listener itself, so we get all
the regions at register / unregister and in its own updates.
* SVQ vrings are tracked in vhost_vdpa->shadow_vqs[i].
* CVQ shadow buffers are tracked in the net VhostVDPAState.
---
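
And to show the translation-only role, the whole lifetime of a tree
entry is just this alloc / find / remove cycle (a sketch; the exact
vhost_iova_tree_* signatures are my assumption of what master has):

    DMAMap map = {
        .translated_addr = (hwaddr)(uintptr_t)buf, /* HVA of the buffer */
        .size = buf_len - 1,
        .perm = IOMMU_RW,
    };

    /* Allocate a free IOVA range covering the buffer */
    if (vhost_iova_tree_map_alloc(tree, &map) != IOVA_OK) {
        return -ENOMEM;                    /* IOVA space exhausted */
    }

    /* Later: translate the HVA back to the IOVA the device must use */
    const DMAMap needle = {
        .translated_addr = (hwaddr)(uintptr_t)buf,
    };
    const DMAMap *translation = vhost_iova_tree_find_iova(tree, &needle);

    /* Tear down the translation only; freeing the buffer itself is up
     * to its owner (listener, SVQ, or net VhostVDPAState) */
    vhost_iova_tree_remove(tree, map);

The tree never owns the memory behind translated_addr, which is what
keeps its lifecycle easy.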

I'll send a new series adding the two pieces of doc if you think they
are complete. Please let me know if you'd add or remove something.

Note that this code is already on qemu master so this doc should not
block this series, correct?

Thanks!

[1] https://mail.gnu.org/archive/html/qemu-devel/2022-11/msg02033.html

> -Siwei
>
>
> >
> > Thanks!
> >
> >
> >> Regards,
> >> -Siwei
> >>> Thanks!
> >>>
> >>>> Thanks,
> >>>> -Siwei
> >>>>> +    }
> >>>>> +
> >>>>>         r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
> >>>>>                                    vhost_vdpa_net_cvq_cmd_page_len(), false);
> >>>>>         if (unlikely(r < 0)) {
> >>>>> @@ -449,15 +508,9 @@ static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
> >>>>>         if (s->vhost_vdpa.shadow_vqs_enabled) {
> >>>>>             vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
> >>>>>             vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
> >>>>> -        if (!s->always_svq) {
> >>>>> -            /*
> >>>>> -             * If only the CVQ is shadowed we can delete this safely.
> >>>>> -             * If all the VQs are shadows this will be needed by the time the
> >>>>> -             * device is started again to register SVQ vrings and similar.
> >>>>> -             */
> >>>>> -            g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> >>>>> -        }
> >>>>>         }
> >>>>> +
> >>>>> +    vhost_vdpa_net_client_stop(nc);
> >>>>>     }
> >>>>>
> >>>>>     static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
> >>>>> @@ -667,8 +720,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
> >>>>>                                            int nvqs,
> >>>>>                                            bool is_datapath,
> >>>>>                                            bool svq,
> >>>>> -                                       struct vhost_vdpa_iova_range iova_range,
> >>>>> -                                       VhostIOVATree *iova_tree)
> >>>>> +                                       struct vhost_vdpa_iova_range iova_range)
> >>>>>     {
> >>>>>         NetClientState *nc = NULL;
> >>>>>         VhostVDPAState *s;
> >>>>> @@ -690,7 +742,6 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
> >>>>>         s->vhost_vdpa.shadow_vqs_enabled = svq;
> >>>>>         s->vhost_vdpa.iova_range = iova_range;
> >>>>>         s->vhost_vdpa.shadow_data = svq;
> >>>>> -    s->vhost_vdpa.iova_tree = iova_tree;
> >>>>>         if (!is_datapath) {
> >>>>>             s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
> >>>>>                                                 vhost_vdpa_net_cvq_cmd_page_len());
> >>>>> @@ -760,7 +811,6 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
> >>>>>         uint64_t features;
> >>>>>         int vdpa_device_fd;
> >>>>>         g_autofree NetClientState **ncs = NULL;
> >>>>> -    g_autoptr(VhostIOVATree) iova_tree = NULL;
> >>>>>         struct vhost_vdpa_iova_range iova_range;
> >>>>>         NetClientState *nc;
> >>>>>         int queue_pairs, r, i = 0, has_cvq = 0;
> >>>>> @@ -812,12 +862,8 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
> >>>>>             goto err;
> >>>>>         }
> >>>>>
> >>>>> -    if (opts->x_svq) {
> >>>>> -        if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
> >>>>> -            goto err_svq;
> >>>>> -        }
> >>>>> -
> >>>>> -        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
> >>>>> +    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
> >>>>> +        goto err;
> >>>>>         }
> >>>>>
> >>>>>         ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
> >>>>> @@ -825,7 +871,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
> >>>>>         for (i = 0; i < queue_pairs; i++) {
> >>>>>             ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
> >>>>>                                          vdpa_device_fd, i, 2, true, opts->x_svq,
> >>>>> -                                     iova_range, iova_tree);
> >>>>> +                                     iova_range);
> >>>>>             if (!ncs[i])
> >>>>>                 goto err;
> >>>>>         }
> >>>>> @@ -833,13 +879,11 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
> >>>>>         if (has_cvq) {
> >>>>>             nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
> >>>>>                                      vdpa_device_fd, i, 1, false,
> >>>>> -                                 opts->x_svq, iova_range, iova_tree);
> >>>>> +                                 opts->x_svq, iova_range);
> >>>>>             if (!nc)
> >>>>>                 goto err;
> >>>>>         }
> >>>>>
> >>>>> -    /* iova_tree ownership belongs to last NetClientState */
> >>>>> -    g_steal_pointer(&iova_tree);
> >>>>>         return 0;
> >>>>>
> >>>>>     err:
> >>>>> @@ -849,7 +893,6 @@ err:
> >>>>>             }
> >>>>>         }
> >>>>>
> >>>>> -err_svq:
> >>>>>         qemu_close(vdpa_device_fd);
> >>>>>
> >>>>>         return -1;
>
Si-Wei Liu Feb. 17, 2023, 7:38 a.m. UTC | #7
On 2/15/2023 11:35 PM, Eugenio Perez Martin wrote:
> On Thu, Feb 16, 2023 at 3:15 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 2/14/2023 11:07 AM, Eugenio Perez Martin wrote:
>>> On Tue, Feb 14, 2023 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>
>>>> On 2/13/2023 3:14 AM, Eugenio Perez Martin wrote:
>>>>> On Mon, Feb 13, 2023 at 7:51 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>> On 2/8/2023 1:42 AM, Eugenio Pérez wrote:
>>>>>>> Only create iova_tree if and when it is needed.
>>>>>>>
>>>>>>> The cleanup keeps being responsible of last VQ but this change allows it
>>>>>>> to merge both cleanup functions.
>>>>>>>
>>>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>>>>>>> Acked-by: Jason Wang <jasowang@redhat.com>
>>>>>>> ---
>>>>>>>      net/vhost-vdpa.c | 99 ++++++++++++++++++++++++++++++++++--------------
>>>>>>>      1 file changed, 71 insertions(+), 28 deletions(-)
>>>>>>>
>>>>>>> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
>>>>>>> index de5ed8ff22..a9e6c8f28e 100644
>>>>>>> --- a/net/vhost-vdpa.c
>>>>>>> +++ b/net/vhost-vdpa.c
>>>>>>> @@ -178,13 +178,9 @@ err_init:
>>>>>>>      static void vhost_vdpa_cleanup(NetClientState *nc)
>>>>>>>      {
>>>>>>>          VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
>>>>>>> -    struct vhost_dev *dev = &s->vhost_net->dev;
>>>>>>>
>>>>>>>          qemu_vfree(s->cvq_cmd_out_buffer);
>>>>>>>          qemu_vfree(s->status);
>>>>>>> -    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
>>>>>>> -        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
>>>>>>> -    }
>>>>>>>          if (s->vhost_net) {
>>>>>>>              vhost_net_cleanup(s->vhost_net);
>>>>>>>              g_free(s->vhost_net);
>>>>>>> @@ -234,10 +230,64 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
>>>>>>>          return size;
>>>>>>>      }
>>>>>>>
>>>>>>> +/** From any vdpa net client, get the netclient of first queue pair */
>>>>>>> +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
>>>>>>> +{
>>>>>>> +    NICState *nic = qemu_get_nic(s->nc.peer);
>>>>>>> +    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
>>>>>>> +
>>>>>>> +    return DO_UPCAST(VhostVDPAState, nc, nc0);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
>>>>>>> +{
>>>>>>> +    struct vhost_vdpa *v = &s->vhost_vdpa;
>>>>>>> +
>>>>>>> +    if (v->shadow_vqs_enabled) {
>>>>>>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
>>>>>>> +                                           v->iova_range.last);
>>>>>>> +    }
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int vhost_vdpa_net_data_start(NetClientState *nc)
>>>>>>> +{
>>>>>>> +    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
>>>>>>> +    struct vhost_vdpa *v = &s->vhost_vdpa;
>>>>>>> +
>>>>>>> +    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
>>>>>>> +
>>>>>>> +    if (v->index == 0) {
>>>>>>> +        vhost_vdpa_net_data_start_first(s);
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    if (v->shadow_vqs_enabled) {
>>>>>>> +        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
>>>>>>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void vhost_vdpa_net_client_stop(NetClientState *nc)
>>>>>>> +{
>>>>>>> +    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
>>>>>>> +    struct vhost_dev *dev;
>>>>>>> +
>>>>>>> +    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
>>>>>>> +
>>>>>>> +    dev = s->vhost_vdpa.dev;
>>>>>>> +    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
>>>>>>> +        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
>>>>>>> +    }
>>>>>>> +}
>>>>>>> +
>>>>>>>      static NetClientInfo net_vhost_vdpa_info = {
>>>>>>>              .type = NET_CLIENT_DRIVER_VHOST_VDPA,
>>>>>>>              .size = sizeof(VhostVDPAState),
>>>>>>>              .receive = vhost_vdpa_receive,
>>>>>>> +        .start = vhost_vdpa_net_data_start,
>>>>>>> +        .stop = vhost_vdpa_net_client_stop,
>>>>>>>              .cleanup = vhost_vdpa_cleanup,
>>>>>>>              .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
>>>>>>>              .has_ufo = vhost_vdpa_has_ufo,
>>>>>>> @@ -351,7 +401,7 @@ dma_map_err:
>>>>>>>
>>>>>>>      static int vhost_vdpa_net_cvq_start(NetClientState *nc)
>>>>>>>      {
>>>>>>> -    VhostVDPAState *s;
>>>>>>> +    VhostVDPAState *s, *s0;
>>>>>>>          struct vhost_vdpa *v;
>>>>>>>          uint64_t backend_features;
>>>>>>>          int64_t cvq_group;
>>>>>>> @@ -425,6 +475,15 @@ out:
>>>>>>>              return 0;
>>>>>>>          }
>>>>>>>
>>>>>>> +    s0 = vhost_vdpa_net_first_nc_vdpa(s);
>>>>>>> +    if (s0->vhost_vdpa.iova_tree) {
>>>>>>> +        /* SVQ is already configured for all virtqueues */
>>>>>>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
>>>>>>> +    } else {
>>>>>>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
>>>>>>> +                                           v->iova_range.last);
>>>>>> I wonder how this case could happen, vhost_vdpa_net_data_start_first()
>>>>>> should've allocated an iova tree on the first data vq. Is zero data vq
>>>>>> ever possible on net vhost-vdpa?
>>>>>>
>>>>> It's the case of the current qemu master when only CVQ is being
>>>>> shadowed. It's not that "there are no data vq": If that case were
>>>>> possible, CVQ vhost-vdpa state would be s0.
>>>>>
>>>>> The case is that since only CVQ vhost-vdpa is the one being migrated,
>>>>> only CVQ has an iova tree.
>>>> OK, so this corresponds to the case where live migration is not started
>>>> and CVQ starts in its own address space of VHOST_VDPA_NET_CVQ_ASID.
>>>> Thanks for explaining it!
>>>>
>>>>> With this series applied and with no migration running, the case is
>>>>> the same as before: only SVQ gets shadowed. When migration starts, all
>>>>> vqs are migrated, and share iova tree.
>>>> I wonder what is the reason to share the iova tree when migration
>>>> starts, I think CVQ may stay on its own VHOST_VDPA_NET_CVQ_ASID still?
>>>>
>>>> Actually there's discrepancy in vhost_vdpa_net_log_global_enable(), I
>>>> don't see explicit code to switch from VHOST_VDPA_NET_CVQ_ASID to
>>>> VHOST_VDPA_GUEST_PA_ASID for the CVQ. This is the address space I
>>>> collision I mentioned earlier:
>>>>
>>> There is no such change. This code only migrates devices with no CVQ,
>>> as they have their own difficulties.
>>>
>>> In the previous RFC there was no such change either. Since it's hard
>>> to modify passthrough devices IOVA tree, CVQ AS updates keep being
>>> VHOST_VDPA_NET_CVQ_ASID.
>> That's my understanding too, the current code doesn't support changing
>> AS once it is set, although uAPI doesn't prohibit it.
>>
>>> They both share the same IOVA tree though, just for simplicity.
>> It would be good to document this assumption somewhere in the code, it's
>> not easy to infer userspace doesn't have the same view as that in the
>> kernel in terms of the iova tree being used.
>>
>>>    If
>>> address space exhaustion is a problem we can make them independent,
>>> but this complicates the code a little bit.
>>>
>>>> 9585@1676093788.259201:vhost_vdpa_dma_map vdpa:0x7ff13088a190 fd: 16
>>>> msg_type: 2 asid: 0 iova: 0x1000 size: 0x2000 uaddr: 0x55a5a7ff3000
>>>> perm: 0x1 type: 2
>>>> 9585@1676093788.279923:vhost_vdpa_dma_map vdpa:0x7ff13088a190 fd: 16
>>>> msg_type: 2 asid: 0 iova: 0x3000 size: 0x1000 uaddr: 0x55a5a7ff6000
>>>> perm: 0x3 type: 2
>>>> 9585@1676093788.290529:vhost_vdpa_set_vring_addr dev: 0x55a5a77cec20
>>>> index: 0 flags: 0x0 desc_user_addr: 0x1000 used_user_addr: 0x3000
>>>> avail_user_addr: 0x2000 log_guest_addr: 0x0
>>>> :
>>>> :
>>>> 9585@1676093788.543567:vhost_vdpa_dma_map vdpa:0x7ff1302b6190 fd: 16
>>>> msg_type: 2 asid: 0 iova: 0x16000 size: 0x2000 uaddr: 0x55a5a7959000
>>>> perm: 0x1 type: 2
>>>> 9585@1676093788.576923:vhost_vdpa_dma_map vdpa:0x7ff1302b6190 fd: 16
>>>> msg_type: 2 asid: 0 iova: 0x18000 size: 0x1000 uaddr: 0x55a5a795c000
>>>> perm: 0x3 type: 2
>>>> 9585@1676093788.593881:vhost_vdpa_set_vring_addr dev: 0x55a5a7580930
>>>> index: 7 flags: 0x0 desc_user_addr: 0x16000 used_user_addr: 0x18000
>>>> avail_user_addr: 0x17000 log_guest_addr: 0x0
>>>> 9585@1676093788.593904:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
>>>> msg_type: 2 asid: 1 iova: 0x19000 size: 0x1000 uaddr: 0x55a5a77f8000
>>>> perm: 0x1 type: 2
>>>> 9585@1676093788.606448:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
>>>> msg_type: 2 asid: 1 iova: 0x1a000 size: 0x1000 uaddr: 0x55a5a77fa000
>>>> perm: 0x3 type: 2
>>>> 9585@1676093788.616253:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
>>>> msg_type: 2 asid: 1 iova: 0x1b000 size: 0x1000 uaddr: 0x55a5a795f000
>>>> perm: 0x1 type: 2
>>>> 9585@1676093788.625956:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
>>>> msg_type: 2 asid: 1 iova: 0x1c000 size: 0x1000 uaddr: 0x55a5a7f4e000
>>>> perm: 0x3 type: 2
>>>> 9585@1676093788.635655:vhost_vdpa_set_vring_addr dev: 0x55a5a7580ec0
>>>> index: 8 flags: 0x0 desc_user_addr: 0x1b000 used_user_addr: 0x1c000
>>>> avail_user_addr: 0x1b400 log_guest_addr: 0x0
>>>> 9585@1676093788.635667:vhost_vdpa_listener_region_add vdpa:
>>>> 0x7ff13026d190 iova 0x0 llend 0xa0000 vaddr: 0x7fef1fe00000 read-only: 0
>>>> 9585@1676093788.635670:vhost_vdpa_listener_begin_batch
>>>> vdpa:0x7ff13026d190 fd: 16 msg_type: 2 type: 5
>>>> 9585@1676093788.635677:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
>>>> msg_type: 2 asid: 0 iova: 0x0 size: 0xa0000 uaddr: 0x7fef1fe00000 perm:
>>>> 0x3 type: 2
>>>> 2023-02-11T05:36:28.635686Z qemu-system-x86_64: failed to write, fd=16,
>>>> errno=14 (Bad address)
>>>> 2023-02-11T05:36:28.635721Z qemu-system-x86_64: vhost vdpa map fail!
>>>> 2023-02-11T05:36:28.635744Z qemu-system-x86_64: vhost-vdpa: DMA mapping
>>>> failed, unable to continue
>>>>
>>> I'm not sure how you get to this. Maybe you were able to start the
>>> migration because the CVQ migration blocker was not effectively added?
>> It's something else, below line at the start of
>> vhost_vdpa_net_cvq_start() would override the shadow_data on the CVQ.
>>
>>       v->shadow_data = s->always_svq;
>>
>> Which leads to my previous question why shadow_data needs to apply to
>> the CVQ
>>
> Ok, I'm proposing some documentation here. I'll send a new patch
> adding it to the sources if you think it is complete.
It's fine, I don't intend to block on this. But what I really meant is
that there is a bug in the line I pointed out earlier. shadow_data is
already set by net_vhost_vdpa_init() at init time (for the x-svq=on
case). For the x-svq=off case, vhost_vdpa_net_log_global_enable() sets
shadow_data to true on the CVQ within the migration notifier; that's
correct and expected. However, the subsequent vhost_net_start() call
right after goes into vhost_vdpa_net_cvq_start(), which inadvertently
sets the CVQ's shadow_data back to false. That defeats the purpose of
using shadow_data to indicate that iova should be translated on the
shadowed CVQ through the *shared* iova tree. You can say migration with
CVQ is blocked anyway, so this code path doesn't get exposed for now,
but it creates conflict and confusion for readers trying to understand
what the code attempts to achieve. Maybe remove this line, or move it
to vhost_vdpa_net_cvq_stop()? (A sketch of the latter is below.)
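
Just to illustrate the "move it" option (an untested sketch on top of
this patch):

static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    if (s->vhost_vdpa.shadow_vqs_enabled) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
    }

    /*
     * Moved here from vhost_vdpa_net_cvq_start(): resetting shadow_data
     * only on stop keeps the migration notifier's shadow_data = true
     * from being clobbered while the device is being (re)started.
     */
    s->vhost_vdpa.shadow_data = s->always_svq;

    vhost_vdpa_net_client_stop(nc);
}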

> Shadow_data needs to apply to CVQ because memory_listener is
> registered against CVQ,
It's bound to the last virtqueue pair, which is not necessarily the CVQ.
>   and memory listener needs to know if data vqs
> are passthrough or shadowed. We could apply a memory register to a
> different vhost_vdpa but then its lifecycle gets complicated.
The lifecycle can remain the same, but the code will be a lot messier
for sure. :)

> ---
>
> For completion, the original discussion was [1].
>
>> and why the userspace iova is shared between data queues and CVQ.
> It's not shared unless the device does not support ASID. They only
> share the iova tree because iova tree itself is not used for tracking
> memory itself but only translations, so its lifecycle is easier. Each
> piece of memory's lifecycle is tracked differently:
> * Guest's memory is tracked by the memory listener itself, so we got
> all the regions at register / unregister and in its own updates.
> * SVQ vrings are tracked in vhost_vdpa->shadow_vqs[i].
> * CVQ shadow buffers are tracked in net VhostVDPAState.
> ---
>
> I'll send a new series adding the two pieces of doc if you think they
> are complete. Please let me know if you'd add or remove something.
No you don't have to. Just leave it as-is.

What I had in mind with making the two iova trees independent was not
just translation, but also keeping them in sync with the kernel's IOVA
address spaces, so that switching mode causes fewer fluctuations by
sending down thinner iova updates across the unmap and map cycle. For
now, sharing the iova tree is fine. I'll see if there's an alternative
to keep guest memory identity-mapped 1:1 in the iova tree across the
mode switch; future work you don't have to worry about now.
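
For the record, the identity mapping I'm thinking of is roughly (pure
sketch; vhost_iova_tree_map_alloc() picks the IOVA itself today, so a
fixed-IOVA insert helper like the one below would be new code):

    DMAMap map = {
        .iova = gpa,                        /* identity: IOVA == GPA */
        .translated_addr = (hwaddr)(uintptr_t)hva,
        .size = size - 1,
        .perm = IOMMU_RW,
    };

    /*
     * Hypothetical helper: insert at a caller-chosen IOVA instead of
     * allocating one, so guest memory keeps IOVA == GPA across the
     * passthrough <-> shadow mode switch and only SVQ/CVQ buffers need
     * freshly allocated ranges.
     */
    int r = vhost_iova_tree_insert_fixed(tree, &map);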

Thanks,
-Siwei

>
> Note that this code is already on qemu master so this doc should not
> block this series, correct?
>
> Thanks!
>
> [1] https://mail.gnu.org/archive/html/qemu-devel/2022-11/msg02033.html
>
>> -Siwei
>>
>>
>>> Thanks!
>>>
>>>
>>>> Regards,
>>>> -Siwei
>>>>> Thanks!
>>>>>
>>>>>> Thanks,
>>>>>> -Siwei
>>>>>>> +    }
>>>>>>> +
>>>>>>>          r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
>>>>>>>                                     vhost_vdpa_net_cvq_cmd_page_len(), false);
>>>>>>>          if (unlikely(r < 0)) {
>>>>>>> @@ -449,15 +508,9 @@ static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
>>>>>>>          if (s->vhost_vdpa.shadow_vqs_enabled) {
>>>>>>>              vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
>>>>>>>              vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
>>>>>>> -        if (!s->always_svq) {
>>>>>>> -            /*
>>>>>>> -             * If only the CVQ is shadowed we can delete this safely.
>>>>>>> -             * If all the VQs are shadows this will be needed by the time the
>>>>>>> -             * device is started again to register SVQ vrings and similar.
>>>>>>> -             */
>>>>>>> -            g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
>>>>>>> -        }
>>>>>>>          }
>>>>>>> +
>>>>>>> +    vhost_vdpa_net_client_stop(nc);
>>>>>>>      }
>>>>>>>
>>>>>>>      static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
>>>>>>> @@ -667,8 +720,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
>>>>>>>                                             int nvqs,
>>>>>>>                                             bool is_datapath,
>>>>>>>                                             bool svq,
>>>>>>> -                                       struct vhost_vdpa_iova_range iova_range,
>>>>>>> -                                       VhostIOVATree *iova_tree)
>>>>>>> +                                       struct vhost_vdpa_iova_range iova_range)
>>>>>>>      {
>>>>>>>          NetClientState *nc = NULL;
>>>>>>>          VhostVDPAState *s;
>>>>>>> @@ -690,7 +742,6 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
>>>>>>>          s->vhost_vdpa.shadow_vqs_enabled = svq;
>>>>>>>          s->vhost_vdpa.iova_range = iova_range;
>>>>>>>          s->vhost_vdpa.shadow_data = svq;
>>>>>>> -    s->vhost_vdpa.iova_tree = iova_tree;
>>>>>>>          if (!is_datapath) {
>>>>>>>              s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
>>>>>>>                                                  vhost_vdpa_net_cvq_cmd_page_len());
>>>>>>> @@ -760,7 +811,6 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
>>>>>>>          uint64_t features;
>>>>>>>          int vdpa_device_fd;
>>>>>>>          g_autofree NetClientState **ncs = NULL;
>>>>>>> -    g_autoptr(VhostIOVATree) iova_tree = NULL;
>>>>>>>          struct vhost_vdpa_iova_range iova_range;
>>>>>>>          NetClientState *nc;
>>>>>>>          int queue_pairs, r, i = 0, has_cvq = 0;
>>>>>>> @@ -812,12 +862,8 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
>>>>>>>              goto err;
>>>>>>>          }
>>>>>>>
>>>>>>> -    if (opts->x_svq) {
>>>>>>> -        if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
>>>>>>> -            goto err_svq;
>>>>>>> -        }
>>>>>>> -
>>>>>>> -        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
>>>>>>> +    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
>>>>>>> +        goto err;
>>>>>>>          }
>>>>>>>
>>>>>>>          ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
>>>>>>> @@ -825,7 +871,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
>>>>>>>          for (i = 0; i < queue_pairs; i++) {
>>>>>>>              ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
>>>>>>>                                           vdpa_device_fd, i, 2, true, opts->x_svq,
>>>>>>> -                                     iova_range, iova_tree);
>>>>>>> +                                     iova_range);
>>>>>>>              if (!ncs[i])
>>>>>>>                  goto err;
>>>>>>>          }
>>>>>>> @@ -833,13 +879,11 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
>>>>>>>          if (has_cvq) {
>>>>>>>              nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
>>>>>>>                                       vdpa_device_fd, i, 1, false,
>>>>>>> -                                 opts->x_svq, iova_range, iova_tree);
>>>>>>> +                                 opts->x_svq, iova_range);
>>>>>>>              if (!nc)
>>>>>>>                  goto err;
>>>>>>>          }
>>>>>>>
>>>>>>> -    /* iova_tree ownership belongs to last NetClientState */
>>>>>>> -    g_steal_pointer(&iova_tree);
>>>>>>>          return 0;
>>>>>>>
>>>>>>>      err:
>>>>>>> @@ -849,7 +893,6 @@ err:
>>>>>>>              }
>>>>>>>          }
>>>>>>>
>>>>>>> -err_svq:
>>>>>>>          qemu_close(vdpa_device_fd);
>>>>>>>
>>>>>>>          return -1;
Eugenio Perez Martin Feb. 17, 2023, 1:55 p.m. UTC | #8
On Fri, Feb 17, 2023 at 8:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 2/15/2023 11:35 PM, Eugenio Perez Martin wrote:
> > On Thu, Feb 16, 2023 at 3:15 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 2/14/2023 11:07 AM, Eugenio Perez Martin wrote:
> >>> On Tue, Feb 14, 2023 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>
> >>>> On 2/13/2023 3:14 AM, Eugenio Perez Martin wrote:
> >>>>> On Mon, Feb 13, 2023 at 7:51 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>> On 2/8/2023 1:42 AM, Eugenio Pérez wrote:
> >>>>>>> Only create iova_tree if and when it is needed.
> >>>>>>>
> >>>>>>> The cleanup keeps being responsible of last VQ but this change allows it
> >>>>>>> to merge both cleanup functions.
> >>>>>>>
> >>>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> >>>>>>> Acked-by: Jason Wang <jasowang@redhat.com>
> >>>>>>> ---
> >>>>>>>      net/vhost-vdpa.c | 99 ++++++++++++++++++++++++++++++++++--------------
> >>>>>>>      1 file changed, 71 insertions(+), 28 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> >>>>>>> index de5ed8ff22..a9e6c8f28e 100644
> >>>>>>> --- a/net/vhost-vdpa.c
> >>>>>>> +++ b/net/vhost-vdpa.c
> >>>>>>> @@ -178,13 +178,9 @@ err_init:
> >>>>>>>      static void vhost_vdpa_cleanup(NetClientState *nc)
> >>>>>>>      {
> >>>>>>>          VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> >>>>>>> -    struct vhost_dev *dev = &s->vhost_net->dev;
> >>>>>>>
> >>>>>>>          qemu_vfree(s->cvq_cmd_out_buffer);
> >>>>>>>          qemu_vfree(s->status);
> >>>>>>> -    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
> >>>>>>> -        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> >>>>>>> -    }
> >>>>>>>          if (s->vhost_net) {
> >>>>>>>              vhost_net_cleanup(s->vhost_net);
> >>>>>>>              g_free(s->vhost_net);
> >>>>>>> @@ -234,10 +230,64 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
> >>>>>>>          return size;
> >>>>>>>      }
> >>>>>>>
> >>>>>>> +/** From any vdpa net client, get the netclient of first queue pair */
> >>>>>>> +static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
> >>>>>>> +{
> >>>>>>> +    NICState *nic = qemu_get_nic(s->nc.peer);
> >>>>>>> +    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
> >>>>>>> +
> >>>>>>> +    return DO_UPCAST(VhostVDPAState, nc, nc0);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
> >>>>>>> +{
> >>>>>>> +    struct vhost_vdpa *v = &s->vhost_vdpa;
> >>>>>>> +
> >>>>>>> +    if (v->shadow_vqs_enabled) {
> >>>>>>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
> >>>>>>> +                                           v->iova_range.last);
> >>>>>>> +    }
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static int vhost_vdpa_net_data_start(NetClientState *nc)
> >>>>>>> +{
> >>>>>>> +    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> >>>>>>> +    struct vhost_vdpa *v = &s->vhost_vdpa;
> >>>>>>> +
> >>>>>>> +    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
> >>>>>>> +
> >>>>>>> +    if (v->index == 0) {
> >>>>>>> +        vhost_vdpa_net_data_start_first(s);
> >>>>>>> +        return 0;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    if (v->shadow_vqs_enabled) {
> >>>>>>> +        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
> >>>>>>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    return 0;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static void vhost_vdpa_net_client_stop(NetClientState *nc)
> >>>>>>> +{
> >>>>>>> +    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> >>>>>>> +    struct vhost_dev *dev;
> >>>>>>> +
> >>>>>>> +    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
> >>>>>>> +
> >>>>>>> +    dev = s->vhost_vdpa.dev;
> >>>>>>> +    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
> >>>>>>> +        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> >>>>>>> +    }
> >>>>>>> +}
> >>>>>>> +
> >>>>>>>      static NetClientInfo net_vhost_vdpa_info = {
> >>>>>>>              .type = NET_CLIENT_DRIVER_VHOST_VDPA,
> >>>>>>>              .size = sizeof(VhostVDPAState),
> >>>>>>>              .receive = vhost_vdpa_receive,
> >>>>>>> +        .start = vhost_vdpa_net_data_start,
> >>>>>>> +        .stop = vhost_vdpa_net_client_stop,
> >>>>>>>              .cleanup = vhost_vdpa_cleanup,
> >>>>>>>              .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
> >>>>>>>              .has_ufo = vhost_vdpa_has_ufo,
> >>>>>>> @@ -351,7 +401,7 @@ dma_map_err:
> >>>>>>>
> >>>>>>>      static int vhost_vdpa_net_cvq_start(NetClientState *nc)
> >>>>>>>      {
> >>>>>>> -    VhostVDPAState *s;
> >>>>>>> +    VhostVDPAState *s, *s0;
> >>>>>>>          struct vhost_vdpa *v;
> >>>>>>>          uint64_t backend_features;
> >>>>>>>          int64_t cvq_group;
> >>>>>>> @@ -425,6 +475,15 @@ out:
> >>>>>>>              return 0;
> >>>>>>>          }
> >>>>>>>
> >>>>>>> +    s0 = vhost_vdpa_net_first_nc_vdpa(s);
> >>>>>>> +    if (s0->vhost_vdpa.iova_tree) {
> >>>>>>> +        /* SVQ is already configured for all virtqueues */
> >>>>>>> +        v->iova_tree = s0->vhost_vdpa.iova_tree;
> >>>>>>> +    } else {
> >>>>>>> +        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
> >>>>>>> +                                           v->iova_range.last);
> >>>>>> I wonder how this case could happen, vhost_vdpa_net_data_start_first()
> >>>>>> should've allocated an iova tree on the first data vq. Is zero data vq
> >>>>>> ever possible on net vhost-vdpa?
> >>>>>>
> >>>>> It's the case of the current qemu master when only CVQ is being
> >>>>> shadowed. It's not that "there are no data vq": If that case were
> >>>>> possible, CVQ vhost-vdpa state would be s0.
> >>>>>
> >>>>> The case is that since only CVQ vhost-vdpa is the one being migrated,
> >>>>> only CVQ has an iova tree.
> >>>> OK, so this corresponds to the case where live migration is not started
> >>>> and CVQ starts in its own address space of VHOST_VDPA_NET_CVQ_ASID.
> >>>> Thanks for explaining it!
> >>>>
> >>>>> With this series applied and with no migration running, the case is
> >>>>> the same as before: only SVQ gets shadowed. When migration starts, all
> >>>>> vqs are migrated, and share iova tree.
> >>>> I wonder what is the reason to share the iova tree when migration
> >>>> starts, I think CVQ may stay on its own VHOST_VDPA_NET_CVQ_ASID still?
> >>>>
> >>>> Actually there's discrepancy in vhost_vdpa_net_log_global_enable(), I
> >>>> don't see explicit code to switch from VHOST_VDPA_NET_CVQ_ASID to
> >>>> VHOST_VDPA_GUEST_PA_ASID for the CVQ. This is the address space I
> >>>> collision I mentioned earlier:
> >>>>
> >>> There is no such change. This code only migrates devices with no CVQ,
> >>> as they have their own difficulties.
> >>>
> >>> In the previous RFC there was no such change either. Since it's hard
> >>> to modify passthrough devices IOVA tree, CVQ AS updates keep being
> >>> VHOST_VDPA_NET_CVQ_ASID.
> >> That's my understanding too, the current code doesn't support changing
> >> AS once it is set, although uAPI doesn't prohibit it.
> >>
> >>> They both share the same IOVA tree though, just for simplicity.
> >> It would be good to document this assumption somewhere in the code, it's
> >> not easy to infer userspace doesn't have the same view as that in the
> >> kernel in terms of the iova tree being used.
> >>
> >>>    If
> >>> address space exhaustion is a problem we can make them independent,
> >>> but this complicates the code a little bit.
> >>>
> >>>> 9585@1676093788.259201:vhost_vdpa_dma_map vdpa:0x7ff13088a190 fd: 16
> >>>> msg_type: 2 asid: 0 iova: 0x1000 size: 0x2000 uaddr: 0x55a5a7ff3000
> >>>> perm: 0x1 type: 2
> >>>> 9585@1676093788.279923:vhost_vdpa_dma_map vdpa:0x7ff13088a190 fd: 16
> >>>> msg_type: 2 asid: 0 iova: 0x3000 size: 0x1000 uaddr: 0x55a5a7ff6000
> >>>> perm: 0x3 type: 2
> >>>> 9585@1676093788.290529:vhost_vdpa_set_vring_addr dev: 0x55a5a77cec20
> >>>> index: 0 flags: 0x0 desc_user_addr: 0x1000 used_user_addr: 0x3000
> >>>> avail_user_addr: 0x2000 log_guest_addr: 0x0
> >>>> :
> >>>> :
> >>>> 9585@1676093788.543567:vhost_vdpa_dma_map vdpa:0x7ff1302b6190 fd: 16
> >>>> msg_type: 2 asid: 0 iova: 0x16000 size: 0x2000 uaddr: 0x55a5a7959000
> >>>> perm: 0x1 type: 2
> >>>> 9585@1676093788.576923:vhost_vdpa_dma_map vdpa:0x7ff1302b6190 fd: 16
> >>>> msg_type: 2 asid: 0 iova: 0x18000 size: 0x1000 uaddr: 0x55a5a795c000
> >>>> perm: 0x3 type: 2
> >>>> 9585@1676093788.593881:vhost_vdpa_set_vring_addr dev: 0x55a5a7580930
> >>>> index: 7 flags: 0x0 desc_user_addr: 0x16000 used_user_addr: 0x18000
> >>>> avail_user_addr: 0x17000 log_guest_addr: 0x0
> >>>> 9585@1676093788.593904:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >>>> msg_type: 2 asid: 1 iova: 0x19000 size: 0x1000 uaddr: 0x55a5a77f8000
> >>>> perm: 0x1 type: 2
> >>>> 9585@1676093788.606448:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >>>> msg_type: 2 asid: 1 iova: 0x1a000 size: 0x1000 uaddr: 0x55a5a77fa000
> >>>> perm: 0x3 type: 2
> >>>> 9585@1676093788.616253:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >>>> msg_type: 2 asid: 1 iova: 0x1b000 size: 0x1000 uaddr: 0x55a5a795f000
> >>>> perm: 0x1 type: 2
> >>>> 9585@1676093788.625956:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >>>> msg_type: 2 asid: 1 iova: 0x1c000 size: 0x1000 uaddr: 0x55a5a7f4e000
> >>>> perm: 0x3 type: 2
> >>>> 9585@1676093788.635655:vhost_vdpa_set_vring_addr dev: 0x55a5a7580ec0
> >>>> index: 8 flags: 0x0 desc_user_addr: 0x1b000 used_user_addr: 0x1c000
> >>>> avail_user_addr: 0x1b400 log_guest_addr: 0x0
> >>>> 9585@1676093788.635667:vhost_vdpa_listener_region_add vdpa:
> >>>> 0x7ff13026d190 iova 0x0 llend 0xa0000 vaddr: 0x7fef1fe00000 read-only: 0
> >>>> 9585@1676093788.635670:vhost_vdpa_listener_begin_batch
> >>>> vdpa:0x7ff13026d190 fd: 16 msg_type: 2 type: 5
> >>>> 9585@1676093788.635677:vhost_vdpa_dma_map vdpa:0x7ff13026d190 fd: 16
> >>>> msg_type: 2 asid: 0 iova: 0x0 size: 0xa0000 uaddr: 0x7fef1fe00000 perm:
> >>>> 0x3 type: 2
> >>>> 2023-02-11T05:36:28.635686Z qemu-system-x86_64: failed to write, fd=16,
> >>>> errno=14 (Bad address)
> >>>> 2023-02-11T05:36:28.635721Z qemu-system-x86_64: vhost vdpa map fail!
> >>>> 2023-02-11T05:36:28.635744Z qemu-system-x86_64: vhost-vdpa: DMA mapping
> >>>> failed, unable to continue
> >>>>
> >>> I'm not sure how you get to this. Maybe you were able to start the
> >>> migration because the CVQ migration blocker was not effectively added?
> >> It's something else, below line at the start of
> >> vhost_vdpa_net_cvq_start() would override the shadow_data on the CVQ.
> >>
> >>       v->shadow_data = s->always_svq;
> >>
> >> Which leads to my previous question why shadow_data needs to apply to
> >> the CVQ
> >>
> > Ok, I'm proposing some documentation here. I'll send a new patch
> > adding it to the sources if you think it is complete.
> It's fine, I don't intend to block on this. But what I really meant is
> that there is a bug in the line I pointed out earlier. shadow_data is
> already set by net_vhost_vdpa_init() at init time (for the x-svq=on
> case). For the x-svq=off case, vhost_vdpa_net_log_global_enable() sets
> shadow_data to true on the CVQ within the migration notifier, that's
> correct and expected; however, the subsequent vhost_net_start() function
> right after would call into vhost_vdpa_net_cvq_start(). The latter
> inadvertently sets the CVQ's shadow_data back to false, which defeats
> the purpose of using shadow_data to indicate translating iova on
> shadowed CVQ using the *shared* iova tree. You can say migration with
> CVQ is blocked anyway so this code path doesn't get exposed for now, but
> that somehow causes conflict and confusions for readers to understand
> what the code attempts to achieve. Maybe remove this line or move this
> line to vhost_vdpa_net_cvq_stop()?
>

Ok, now I get you. Thank you very much for the catches and
explanations! I'll remove that and those CVQ leftovers for the next
version.

> > Shadow_data needs to apply to CVQ because memory_listener is
> > registered against CVQ,
> It's bound to the last virtqueue pair which is not necessarily a CVQ.
> >   and memory listener needs to know if data vqs
> > are passthrough or shadowed. We could apply a memory register to a
> > different vhost_vdpa but then its lifecycle gets complicated.
> The lifecycle can remain same but the code will be a lot messier for
> sure. :)
>
> > ---
> >
> > For completion, the original discussion was [1].
> >
> >> and why the userspace iova is shared between data queues and CVQ.
> > It's not shared unless the device does not support ASID. They only
> > share the iova tree because iova tree itself is not used for tracking
> > memory itself but only translations, so its lifecycle is easier. Each
> > piece of memory's lifecycle is tracked differently:
> > * Guest's memory is tracked by the memory listener itself, so we got
> > all the regions at register / unregister and in its own updates.
> > * SVQ vrings are tracked in vhost_vdpa->shadow_vqs[i].
> > * CVQ shadow buffers are tracked in net VhostVDPAState.
> > ---
> >
> > I'll send a new series adding the two pieces of doc if you think they
> > are complete. Please let me know if you'd add or remove something.
> No you don't have to. Just leave it as-is.
>
> What I thought about making two iova trees independent was not just
> meant for translation but also keep sync with kernel's IOVA address
> space, so that it causes less fluctuations by sending down thinner iova
> update for the unmap and map cycle when switching mode. For now sharing
> the iova tree is fine. I'll see if there's other alternative to keep
> guest memory identity mapped 1:1 on the iova tree across the mode
> switch. Future work you don't have to worry about now.
>

Got it.

Thanks!

> Thanks,
> -Siwei
>
> >
> > Note that this code is already on qemu master so this doc should not
> > block this series, correct?
> >
> > Thanks!
> >
> > [1] https://mail.gnu.org/archive/html/qemu-devel/2022-11/msg02033.html
> >
> >> -Siwei
> >>
> >>
> >>> Thanks!
> >>>
> >>>
> >>>> Regards,
> >>>> -Siwei
> >>>>> Thanks!
> >>>>>
> >>>>>> Thanks,
> >>>>>> -Siwei
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>>          r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
> >>>>>>>                                     vhost_vdpa_net_cvq_cmd_page_len(), false);
> >>>>>>>          if (unlikely(r < 0)) {
> >>>>>>> @@ -449,15 +508,9 @@ static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
> >>>>>>>          if (s->vhost_vdpa.shadow_vqs_enabled) {
> >>>>>>>              vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
> >>>>>>>              vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
> >>>>>>> -        if (!s->always_svq) {
> >>>>>>> -            /*
> >>>>>>> -             * If only the CVQ is shadowed we can delete this safely.
> >>>>>>> -             * If all the VQs are shadows this will be needed by the time the
> >>>>>>> -             * device is started again to register SVQ vrings and similar.
> >>>>>>> -             */
> >>>>>>> -            g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
> >>>>>>> -        }
> >>>>>>>          }
> >>>>>>> +
> >>>>>>> +    vhost_vdpa_net_client_stop(nc);
> >>>>>>>      }
> >>>>>>>
> >>>>>>>      static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
> >>>>>>> @@ -667,8 +720,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
> >>>>>>>                                             int nvqs,
> >>>>>>>                                             bool is_datapath,
> >>>>>>>                                             bool svq,
> >>>>>>> -                                       struct vhost_vdpa_iova_range iova_range,
> >>>>>>> -                                       VhostIOVATree *iova_tree)
> >>>>>>> +                                       struct vhost_vdpa_iova_range iova_range)
> >>>>>>>      {
> >>>>>>>          NetClientState *nc = NULL;
> >>>>>>>          VhostVDPAState *s;
> >>>>>>> @@ -690,7 +742,6 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
> >>>>>>>          s->vhost_vdpa.shadow_vqs_enabled = svq;
> >>>>>>>          s->vhost_vdpa.iova_range = iova_range;
> >>>>>>>          s->vhost_vdpa.shadow_data = svq;
> >>>>>>> -    s->vhost_vdpa.iova_tree = iova_tree;
> >>>>>>>          if (!is_datapath) {
> >>>>>>>              s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
> >>>>>>>                                                  vhost_vdpa_net_cvq_cmd_page_len());
> >>>>>>> @@ -760,7 +811,6 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
> >>>>>>>          uint64_t features;
> >>>>>>>          int vdpa_device_fd;
> >>>>>>>          g_autofree NetClientState **ncs = NULL;
> >>>>>>> -    g_autoptr(VhostIOVATree) iova_tree = NULL;
> >>>>>>>          struct vhost_vdpa_iova_range iova_range;
> >>>>>>>          NetClientState *nc;
> >>>>>>>          int queue_pairs, r, i = 0, has_cvq = 0;
> >>>>>>> @@ -812,12 +862,8 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
> >>>>>>>              goto err;
> >>>>>>>          }
> >>>>>>>
> >>>>>>> -    if (opts->x_svq) {
> >>>>>>> -        if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
> >>>>>>> -            goto err_svq;
> >>>>>>> -        }
> >>>>>>> -
> >>>>>>> -        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
> >>>>>>> +    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
> >>>>>>> +        goto err;
> >>>>>>>          }
> >>>>>>>
> >>>>>>>          ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
> >>>>>>> @@ -825,7 +871,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
> >>>>>>>          for (i = 0; i < queue_pairs; i++) {
> >>>>>>>              ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
> >>>>>>>                                           vdpa_device_fd, i, 2, true, opts->x_svq,
> >>>>>>> -                                     iova_range, iova_tree);
> >>>>>>> +                                     iova_range);
> >>>>>>>              if (!ncs[i])
> >>>>>>>                  goto err;
> >>>>>>>          }
> >>>>>>> @@ -833,13 +879,11 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
> >>>>>>>          if (has_cvq) {
> >>>>>>>              nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
> >>>>>>>                                       vdpa_device_fd, i, 1, false,
> >>>>>>> -                                 opts->x_svq, iova_range, iova_tree);
> >>>>>>> +                                 opts->x_svq, iova_range);
> >>>>>>>              if (!nc)
> >>>>>>>                  goto err;
> >>>>>>>          }
> >>>>>>>
> >>>>>>> -    /* iova_tree ownership belongs to last NetClientState */
> >>>>>>> -    g_steal_pointer(&iova_tree);
> >>>>>>>          return 0;
> >>>>>>>
> >>>>>>>      err:
> >>>>>>> @@ -849,7 +893,6 @@ err:
> >>>>>>>              }
> >>>>>>>          }
> >>>>>>>
> >>>>>>> -err_svq:
> >>>>>>>          qemu_close(vdpa_device_fd);
> >>>>>>>
> >>>>>>>          return -1;
>
diff mbox series

Patch

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index de5ed8ff22..a9e6c8f28e 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -178,13 +178,9 @@  err_init:
 static void vhost_vdpa_cleanup(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
-    struct vhost_dev *dev = &s->vhost_net->dev;
 
     qemu_vfree(s->cvq_cmd_out_buffer);
     qemu_vfree(s->status);
-    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
-        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
-    }
     if (s->vhost_net) {
         vhost_net_cleanup(s->vhost_net);
         g_free(s->vhost_net);
@@ -234,10 +230,64 @@  static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
     return size;
 }
 
+/** From any vdpa net client, get the netclient of first queue pair */
+static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
+{
+    NICState *nic = qemu_get_nic(s->nc.peer);
+    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
+
+    return DO_UPCAST(VhostVDPAState, nc, nc0);
+}
+
+static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
+{
+    struct vhost_vdpa *v = &s->vhost_vdpa;
+
+    if (v->shadow_vqs_enabled) {
+        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
+                                           v->iova_range.last);
+    }
+}
+
+static int vhost_vdpa_net_data_start(NetClientState *nc)
+{
+    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+    struct vhost_vdpa *v = &s->vhost_vdpa;
+
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+
+    if (v->index == 0) {
+        vhost_vdpa_net_data_start_first(s);
+        return 0;
+    }
+
+    if (v->shadow_vqs_enabled) {
+        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
+        v->iova_tree = s0->vhost_vdpa.iova_tree;
+    }
+
+    return 0;
+}
+
+static void vhost_vdpa_net_client_stop(NetClientState *nc)
+{
+    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+    struct vhost_dev *dev;
+
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+
+    dev = s->vhost_vdpa.dev;
+    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
+        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
+    }
+}
+
 static NetClientInfo net_vhost_vdpa_info = {
         .type = NET_CLIENT_DRIVER_VHOST_VDPA,
         .size = sizeof(VhostVDPAState),
         .receive = vhost_vdpa_receive,
+        .start = vhost_vdpa_net_data_start,
+        .stop = vhost_vdpa_net_client_stop,
         .cleanup = vhost_vdpa_cleanup,
         .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
         .has_ufo = vhost_vdpa_has_ufo,
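
The two new hooks give the data virtqueue clients the same start/stop
lifecycle the CVQ client already had. A minimal sketch of how a caller
might drive them; sketch_start_all() is an invented name for this note
only, since in QEMU the hooks are invoked from the vhost_net start path:

    #include "net/net.h"    /* NetClientState, NetClientInfo */

    static int sketch_start_all(NetClientState *ncs[], int n)
    {
        for (int i = 0; i < n; i++) {
            /* For vdpa, index 0 allocates the shared iova_tree and the
             * later clients pick it up in vhost_vdpa_net_data_start(). */
            if (ncs[i]->info->start) {
                int r = ncs[i]->info->start(ncs[i]);
                if (r < 0) {
                    return r;
                }
            }
        }
        return 0;
    }
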
@@ -351,7 +401,7 @@ dma_map_err:
 
 static int vhost_vdpa_net_cvq_start(NetClientState *nc)
 {
-    VhostVDPAState *s;
+    VhostVDPAState *s, *s0;
     struct vhost_vdpa *v;
     uint64_t backend_features;
     int64_t cvq_group;
@@ -425,6 +475,15 @@ out:
         return 0;
     }
 
+    s0 = vhost_vdpa_net_first_nc_vdpa(s);
+    if (s0->vhost_vdpa.iova_tree) {
+        /* SVQ is already configured for all virtqueues */
+        v->iova_tree = s0->vhost_vdpa.iova_tree;
+    } else {
+        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
+                                           v->iova_range.last);
+    }
+
     r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
                                vhost_vdpa_net_cvq_cmd_page_len(), false);
     if (unlikely(r < 0)) {
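
The hunk above is the heart of the change for CVQ: instead of receiving a
tree created unconditionally at init time, the CVQ client now either
borrows the tree the data virtqueues already created or, when only CVQ is
shadowed, creates its own. The same decision, factored into a hypothetical
helper (not part of the patch) to make the two cases explicit:

    static VhostIOVATree *sketch_cvq_iova_tree(VhostVDPAState *s)
    {
        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);

        if (s0->vhost_vdpa.iova_tree) {
            /* x-svq=on: the data vqs started first and own a tree */
            return s0->vhost_vdpa.iova_tree;
        }
        /* only CVQ is shadowed: it needs a tree of its own */
        return vhost_iova_tree_new(s->vhost_vdpa.iova_range.first,
                                   s->vhost_vdpa.iova_range.last);
    }
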
@@ -449,15 +508,9 @@ static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
     if (s->vhost_vdpa.shadow_vqs_enabled) {
         vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
         vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
-        if (!s->always_svq) {
-            /*
-             * If only the CVQ is shadowed we can delete this safely.
-             * If all the VQs are shadows this will be needed by the time the
-             * device is started again to register SVQ vrings and similar.
-             */
-            g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
-        }
     }
+
+    vhost_vdpa_net_client_stop(nc);
 }
 
 static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
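
With the per-client stop in place, cvq_stop() only has to unmap its own
buffers and then defer to the common path. The ownership rule in
vhost_vdpa_net_client_stop() is easiest to see with concrete numbers; a
worked example, assuming two data queue pairs plus CVQ:

    /*
     * vq_index_end == 5 for 2 data queue pairs (2 vqs each) plus CVQ:
     *
     *   client 0 (data): vq_index 0, nvqs 2 -> 0 + 2 != 5, tree kept
     *   client 1 (data): vq_index 2, nvqs 2 -> 2 + 2 != 5, tree kept
     *   client 2 (cvq):  vq_index 4, nvqs 1 -> 4 + 1 == 5, tree freed
     */
    static bool sketch_is_last_client(const struct vhost_dev *dev)
    {
        return dev->vq_index + dev->nvqs == dev->vq_index_end;
    }
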
@@ -667,8 +720,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                        int nvqs,
                                        bool is_datapath,
                                        bool svq,
-                                       struct vhost_vdpa_iova_range iova_range,
-                                       VhostIOVATree *iova_tree)
+                                       struct vhost_vdpa_iova_range iova_range)
 {
     NetClientState *nc = NULL;
     VhostVDPAState *s;
@@ -690,7 +742,6 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
     s->vhost_vdpa.shadow_vqs_enabled = svq;
     s->vhost_vdpa.iova_range = iova_range;
     s->vhost_vdpa.shadow_data = svq;
-    s->vhost_vdpa.iova_tree = iova_tree;
     if (!is_datapath) {
         s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
                                             vhost_vdpa_net_cvq_cmd_page_len());
@@ -760,7 +811,6 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
     uint64_t features;
     int vdpa_device_fd;
     g_autofree NetClientState **ncs = NULL;
-    g_autoptr(VhostIOVATree) iova_tree = NULL;
     struct vhost_vdpa_iova_range iova_range;
     NetClientState *nc;
     int queue_pairs, r, i = 0, has_cvq = 0;
@@ -812,12 +862,8 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
         goto err;
     }
 
-    if (opts->x_svq) {
-        if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
-            goto err_svq;
-        }
-
-        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
+    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
+        goto err;
     }
 
     ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
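
With the allocation hoisted out to start time, the nested feature check
collapses into a single condition. Nothing subtle here, just the usual
equivalence the hunk relies on:

    /*
     *     if (A) {                      if (A && !B) {
     *         if (!B) {          ==>        goto err;
     *             goto err;             }
     *         }
     *     }
     */
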
@@ -825,7 +871,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
     for (i = 0; i < queue_pairs; i++) {
         ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                      vdpa_device_fd, i, 2, true, opts->x_svq,
-                                     iova_range, iova_tree);
+                                     iova_range);
         if (!ncs[i])
             goto err;
     }
@@ -833,13 +879,11 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
     if (has_cvq) {
         nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                  vdpa_device_fd, i, 1, false,
-                                 opts->x_svq, iova_range, iova_tree);
+                                 opts->x_svq, iova_range);
         if (!nc)
             goto err;
     }
 
-    /* iova_tree ownership belongs to last NetClientState */
-    g_steal_pointer(&iova_tree);
     return 0;
 
 err:
@@ -849,7 +893,6 @@ err:
         }
     }
 
-err_svq:
     qemu_close(vdpa_device_fd);
 
     return -1;
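
A closing note on the two GLib helpers whose roles this patch swaps:
g_steal_pointer() carried the init-time ownership transfer that is now
gone, while g_clear_pointer() does the free-and-NULL at stop time. A
standalone sketch of their semantics (plain GLib, nothing QEMU-specific):

    #include <glib.h>

    int main(void)
    {
        /* Old scheme: allocate under autofree, then steal so the last
         * owner (here: the caller) is responsible for freeing. */
        g_autofree gchar *owned = g_strdup("iova-tree");
        gchar *stolen = g_steal_pointer(&owned);  /* owned is now NULL */

        /* New scheme: free and reset in one step, so a second stop or a
         * later cleanup cannot double-free. */
        g_clear_pointer(&stolen, g_free);
        g_assert(stolen == NULL);
        return 0;
    }
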