Message ID | 20221205170436.2977336-11-eperezma@redhat.com
State      | New, archived
Series     | vDPA-net inflight descriptors migration with SVQ
> From: Eugenio Pérez <eperezma@redhat.com>
> Sent: Monday, December 5, 2022 12:05 PM
>
> There is currently no data to be migrated, since nothing populates or
> reads the fields on virtio-net.
>
> The migration of in-flight descriptors is modelled after the migration
> of requests in virtio-blk.

[...]

How is the CVQ-related mac, vlan and rss replay different from these
inflight descriptors, such that the inflight ones need these callbacks
while CVQ is handled differently?
On Tue, Dec 6, 2022 at 1:05 AM Eugenio Pérez <eperezma@redhat.com> wrote:
>
> There is currently no data to be migrated, since nothing populates or
> reads the fields on virtio-net.
>
> The migration of in-flight descriptors is modelled after the migration
> of requests in virtio-blk.

[...]

> +static const VMStateDescription vmstate_virtio_net_inflight_queue = {
> +    .name = "virtio-net-device/inflight/queue",
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT16(idx, VirtIONetInflightQueue),
> +        VMSTATE_UINT16(num, VirtIONetInflightQueue),
> +
> +        VMSTATE_STRUCT_VARRAY_ALLOC_UINT16(elems, VirtIONetInflightQueue, num,
> +                                           0, vmstate_virtqueue_element_old,
> +                                           VirtQueueElementOld),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};

A dumb question: any reason we need to bother with virtio-net? It looks
to me it's not a must, and it would complicate migration compatibility.

I guess virtio-blk is the better place.

Thanks
On Mon, Dec 5, 2022 at 9:52 PM Parav Pandit <parav@nvidia.com> wrote:
>
> > From: Eugenio Pérez <eperezma@redhat.com>
> > Sent: Monday, December 5, 2022 12:05 PM
>
> [...]
>
> How is the CVQ-related mac, vlan and rss replay different from these
> inflight descriptors, such that the inflight ones need these callbacks
> while CVQ is handled differently?

CVQ is processed by qemu directly, so it is guaranteed its commands will
not be completed out of order. Guest memory is enough to recover in the
destination.
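To make the CVQ/in-flight distinction above concrete, here is a minimal
sketch of in-order control-queue processing, modelled loosely on QEMU's
virtio_net_handle_ctrl(); handle_ctrl_command() is a hypothetical
stand-in for the per-command dispatch, so treat this as an illustration
rather than the actual implementation:

    /* Sketch: why CVQ cannot leave out-of-order in-flight descriptors.
     * Each command is popped, handled synchronously, and completed before
     * the next one is fetched (handle_ctrl_command() is hypothetical). */
    static void handle_ctrl_sketch(VirtIODevice *vdev, VirtQueue *vq)
    {
        VirtIONet *n = VIRTIO_NET(vdev);
        VirtQueueElement *elem;

        while ((elem = virtqueue_pop(vq, sizeof(VirtQueueElement)))) {
            /* The command's effect lands in device state (mac, vlan, rss)
             * that is migrated anyway, so replay can rebuild it. */
            virtio_net_ctrl_ack status = handle_ctrl_command(n, elem);

            /* Completed before the next pop: at most one element is ever
             * outstanding, and never out of order. */
            iov_from_buf(elem->in_sg, elem->in_num, 0, &status, sizeof(status));
            virtqueue_push(vq, elem, sizeof(status));
            g_free(elem);
        }
        virtio_notify(vdev, vq);
    }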
On Tue, Dec 6, 2022 at 4:24 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Tue, Dec 6, 2022 at 1:05 AM Eugenio Pérez <eperezma@redhat.com> wrote:
> >
> > [...]
>
> A dumb question: any reason we need to bother with virtio-net? It looks
> to me it's not a must, and it would complicate migration compatibility.
>
> I guess virtio-blk is the better place.

I'm fine to start with -blk, but if -net devices are processing buffers
out of order we have a chance of losing descriptors too.

We can wait for more feedback to prioritize this correctly, though.

Thanks!
Hi Jason,

> From: Jason Wang <jasowang@redhat.com>
> Sent: Monday, December 5, 2022 10:25 PM
>
> A dumb question: any reason we need to bother with virtio-net? It looks
> to me it's not a must, and it would complicate migration compatibility.

The virtio net vdpa device is processing the descriptors out of order.
This vdpa device doesn't offer the IN_ORDER flag.

And when a VQ is suspended, it can only complete these descriptors as
dummy zero-length completions. The guest VM is then flooded with the
warning at [1].

So it is needed for devices that don't offer the IN_ORDER feature.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/virtio_net.c?h=v6.2-rc3#n1252
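For context, the warning referenced by [1] comes from a check of this
shape in the linked driver (paraphrased from drivers/net/virtio_net.c;
the exact cleanup path is elided): a completion shorter than the
virtio-net header plus an Ethernet header, which includes a zero-length
dummy completion, is logged and counted as an RX length error.

    /* Paraphrased from receive_buf() in drivers/net/virtio_net.c: */
    if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
        pr_debug("%s: short packet %i\n", dev->name, len);
        dev->stats.rx_length_errors++;
        /* ...free the buffer and return without delivering a packet... */
    }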
On Tue, Jan 10, 2023 at 11:02 AM Parav Pandit <parav@nvidia.com> wrote:
>
> Hi Jason,
>
> > A dumb question: any reason we need to bother with virtio-net? It
> > looks to me it's not a must, and it would complicate migration
> > compatibility.
>
> The virtio net vdpa device is processing the descriptors out of order.
> This vdpa device doesn't offer the IN_ORDER flag.
>
> And when a VQ is suspended, it can only complete these descriptors as
> dummy zero-length completions. The guest VM is then flooded with the
> warning at [1].

Yes, but any reason for the device to do out-of-order for RX?

> So it is needed for devices that don't offer the IN_ORDER feature.
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/virtio_net.c?h=v6.2-rc3#n1252

It is only enabled in a debug kernel, which should be harmless?

Thanks
> From: Jason Wang <jasowang@redhat.com>
> Sent: Tuesday, January 10, 2023 11:35 PM
>
> [...]
>
> Yes, but any reason for the device to do out-of-order for RX?

For some devices it is more optimal to process them out of order. And
it's not limited to RX.

> > So it is needed for devices that don't offer the IN_ORDER feature.
> >
> > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/virtio_net.c?h=v6.2-rc3#n1252
>
> It is only enabled in a debug kernel, which should be harmless?

It is the KERN_DEBUG log level. It is not a debug kernel, just the
debug log level.

And regardless, generating zero-length packets for a debug kernel is
even more confusing.
On Wed, Jan 11, 2023 at 12:40 PM Parav Pandit <parav@nvidia.com> wrote:
>
> [...]
>
> > Yes, but any reason for the device to do out-of-order for RX?
>
> For some devices it is more optimal to process them out of order. And
> it's not limited to RX.

TX should be fine, since the device can anyhow pretend to send all
packets, so we won't have any in-flight descriptors.

> > It is only enabled in a debug kernel, which should be harmless?
>
> It is the KERN_DEBUG log level. It is not a debug kernel, just the
> debug log level.

Ok, but the production environment should not use that level anyhow.

> And regardless, generating zero-length packets for a debug kernel is
> even more confusing.

Note that it is allowed in the virtio spec [1] (we probably can fix that
in the driver), and we have pr_debug() all over this driver and other
places. It doesn't cause any side effects except for the debugging
purpose.

So I think having inflight tracking is useful, but I'm not sure it's
worth bothering with for virtio-net (or worth bothering with now):

- zero length is allowed
- it only helps for debugging
- may cause issues for migration compatibility
- requires new infrastructure to be invented

Thanks

[1] The spec says:

"
Note: len is particularly useful for drivers using untrusted buffers: if
a driver does not know exactly how much has been written by the device,
the driver would have to zero the buffer in advance to ensure no data
leakage occurs.
"
> From: Jason Wang <jasowang@redhat.com>
> Sent: Wednesday, January 11, 2023 12:51 AM
>
> [...]
>
> So I think having inflight tracking is useful, but I'm not sure it's
> worth bothering with for virtio-net (or worth bothering with now):
>
> - zero length is allowed

This isn't explicitly called out. It may be worth adding to the spec.

> - it only helps for debugging
> - may cause issues for migration compatibility

We have had this missing for a long time regardless of this feature, so
let's not mix it in here.

The mlx5 vdpa device can do eventual in-order completion before/at the
time of suspend, so I think we can wait for now until more advanced hw
arrives.

> - requires new infrastructure to be invented
>
> [1] The spec says:

This doesn't say that it is a zero-length completion. len is a mandatory
field to tell how many bytes the device wrote.

> "
> Note: len is particularly useful for drivers using untrusted buffers:
> if a driver does not know exactly how much has been written by the
> device, the driver would have to zero the buffer in advance to ensure
> no data leakage occurs.
> "
On Wed, Jan 11, 2023 at 01:51:06PM +0800, Jason Wang wrote:
>
> [...]
>
> TX should be fine, since the device can anyhow pretend to send all
> packets, so we won't have any in-flight descriptors.

And drop them all? You end up with multisecond delays for things like
DHCP. Yes, theoretically packets can be dropped at any time, but
practically people expect this to happen on busy systems, not randomly
out of the blue.

> > It is the KERN_DEBUG log level. It is not a debug kernel, just the
> > debug log level.
>
> Ok, but the production environment should not use that level anyhow.

It's just one example. And it's enough in my eyes to prove we really
can't start sending zero-length RX buffers to drivers and expect all to
be well. If we want to, we need to negotiate a new feature bit.

> [1] The spec says:
>
> "
> Note: len is particularly useful for drivers using untrusted buffers:
> if a driver does not know exactly how much has been written by the
> device, the driver would have to zero the buffer in advance to ensure
> no data leakage occurs.
> "

I don't think this talks about zero length at all. Let me try to explain
what it does talk about, in my opinion.

There are cases where the device does not know exactly how much data it
wrote into the buffer. Should it over-estimate, such that the driver can
be sure that the buffer after the reported length is unchanged? Or
should it instead under-estimate, such that the driver can be sure that
the reported length has been initialized by the device?

What this text in the spec says is that it must always under-estimate
and never over-estimate. And it attempts to explain why this is useful:
imagine a driver that trusts the device and wants to make sure the
buffer is initialized. With the definition in the spec, it only needs to
initialize the data after the reported length. Initialize how? That's up
to the driver, but for example it can zero this buffer.

In short, all the text says is "do not over-report length, only set it
to the part of the buffer you wrote".

Besides that, the text itself is from the original spec and it did not
age well:

1) no one actually relies on this

2) rather than untrusted "buffers", what we commonly have is untrusted
   devices, so the length can't be trusted either

3) writes on PCI are posted, and if your security model depends on the
   buffer being initialized and you want to recover from errors, you
   really can't expect the device to give you this info. Luckily no one
   cares, see 1) above.
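As a concrete illustration of the under-estimate rule described above
(an editor's sketch, not taken from any driver; buf, len and buf_size
are assumed locals):

    /* With the "never over-report" rule, a driver that wants the whole
     * buffer initialized can clear just the tail once the element
     * completes, instead of zeroing buf_size bytes before posting it. */
    if (len > buf_size) {
        len = buf_size;                    /* untrusted device: clamp */
    }
    memset(buf + len, 0, buf_size - len); /* bytes [0, len) were written */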
On Wed, Dec 07, 2022 at 09:56:20AM +0100, Eugenio Perez Martin wrote:
> > A dumb question: any reason we need to bother with virtio-net? It
> > looks to me it's not a must, and it would complicate migration
> > compatibility.
> >
> > I guess virtio-blk is the better place.
>
> I'm fine to start with -blk, but if -net devices are processing buffers
> out of order we have a chance of losing descriptors too.
>
> We can wait for more feedback to prioritize this correctly, though.
>
> Thanks!

Traditionally vhost serialized everything when dropping the VQ. It would
be interesting to hear from hardware vendors on whether it's hard or
easy to do in hardware. But I suspect all devices will want the
capability eventually, because why not: if we have the code, let's just
do it everywhere.
On Tue, Jan 17, 2023 at 5:02 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Dec 07, 2022 at 09:56:20AM +0100, Eugenio Perez Martin wrote:
> > [...]
>
> Traditionally vhost serialized everything when dropping the VQ. It
> would be interesting to hear from hardware vendors on whether it's hard
> or easy to do in hardware.

The feedback is that some vendors will do the serialization while others
will not.

> But I suspect all devices will want the capability eventually, because
> why not: if we have the code, let's just do it everywhere.

Yes, but it's more about priority/compatibility than about whether it is
needed. We want to make migration work as early as possible, for vendors
to test and try.

Thanks
On Tue, Jan 17, 2023 at 4:58 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jan 11, 2023 at 01:51:06PM +0800, Jason Wang wrote:
> > [...]
> >
> > TX should be fine, since the device can anyhow pretend to send all
> > packets, so we won't have any in-flight descriptors.
>
> And drop them all?

Depends on how many inflight descriptors there are. This is the worst
case, and actually this is what software vhost backends did, since they
can't validate whether or not the packet was sent to the wire. And it's
not worse than RX, where a lot of packets will be dropped for sure
during live migration.

> You end up with multisecond delays for things like DHCP.

Well, if DHCP is done during the live migration this is somewhat
unavoidable; a lot of things need to be recovered, not only from the
migration downtime. E.g. it may suffer from the delay of the gARP packet
and others.

> Yes, theoretically packets can be dropped at any time, but practically
> people expect this to happen on busy systems, not randomly out of the
> blue.

The problem is that we never validate whether or not the packet is sent
for a software device. Various layers could be placed under the vhost,
so there's no guarantee that the packet won't be lost.

> It's just one example. And it's enough in my eyes to prove we really
> can't start sending zero-length RX buffers to drivers and expect all to
> be well. If we want to, we need to negotiate a new feature bit.

Ok.

> There are cases where the device does not know exactly how much data it
> wrote into the buffer.

Actually, I think the inflight could be one case. Or do you have any
other case where the device doesn't know how much data it wrote?

> Should it over-estimate, such that the driver can be sure that the
> buffer after the reported length is unchanged?

I can't think of a case where such over-estimation can help any logic.
(Filling a magic value into the buffer and deducing the actual length
written by the device?)

> Or should it instead under-estimate, such that the driver can be sure
> that the reported length has been initialized by the device?
>
> What this text in the spec says is that it must always under-estimate
> and never over-estimate. And it attempts to explain why this is useful:
> imagine a driver that trusts the device and wants to make sure the
> buffer is initialized. With the definition in the spec, it only needs
> to initialize the data after the reported length.

Just to make sure I understand: such initialization can happen only
after the buffer is completed by the device. But after that, the buffer
doesn't belong to the device anymore, so drivers are free to do any
initialization they want. Or does anything make this special?

Thanks
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index ef234ffe7e..ae7c017ef0 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -151,9 +151,11 @@ typedef struct VirtIONetQueue {
     QEMUTimer *tx_timer;
     QEMUBH *tx_bh;
     uint32_t tx_waiting;
+    uint32_t tx_inflight_num, rx_inflight_num;
     struct {
         VirtQueueElement *elem;
     } async_tx;
+    VirtQueueElement **tx_inflight, **rx_inflight;
     struct VirtIONet *n;
 } VirtIONetQueue;

diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 9726d2d09e..9e0dfef9ee 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -626,6 +626,17 @@ extern const VMStateInfo vmstate_info_qlist;
     .offset = vmstate_offset_varray(_state, _field, _type), \
 }

+#define VMSTATE_STRUCT_VARRAY_ALLOC_UINT16(_field, _state, _field_num, \
+                                           _version, _vmsd, _type) { \
+    .name = (stringify(_field)), \
+    .version_id = (_version), \
+    .vmsd = &(_vmsd), \
+    .num_offset = vmstate_offset_value(_state, _field_num, uint16_t), \
+    .size = sizeof(_type), \
+    .flags = VMS_STRUCT | VMS_VARRAY_UINT16 | VMS_ALLOC | VMS_POINTER, \
+    .offset = vmstate_offset_pointer(_state, _field, _type), \
+}
+
 #define VMSTATE_STRUCT_VARRAY_ALLOC(_field, _state, _field_num, _version, _vmsd, _type) {\
     .name = (stringify(_field)), \
     .version_id = (_version), \
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index aba12759d5..ffd7bf1fc7 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3077,6 +3077,13 @@ static bool mac_table_doesnt_fit(void *opaque, int version_id)
     return !mac_table_fits(opaque, version_id);
 }

+typedef struct VirtIONetInflightQueue {
+    uint16_t idx;
+    uint16_t num;
+    QTAILQ_ENTRY(VirtIONetInflightQueue) entry;
+    VirtQueueElementOld *elems;
+} VirtIONetInflightQueue;
+
 /* This temporary type is shared by all the WITH_TMP methods
  * although only some fields are used by each.
  */
@@ -3086,6 +3093,7 @@ struct VirtIONetMigTmp {
     uint16_t curr_queue_pairs_1;
     uint8_t has_ufo;
     uint32_t has_vnet_hdr;
+    QTAILQ_HEAD(, VirtIONetInflightQueue) queues_inflight;
 };

 /* The 2nd and subsequent tx_waiting flags are loaded later than
@@ -3231,6 +3239,124 @@ static const VMStateDescription vmstate_virtio_net_rss = {
     },
 };

+static const VMStateDescription vmstate_virtio_net_inflight_queue = {
+    .name = "virtio-net-device/inflight/queue",
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT16(idx, VirtIONetInflightQueue),
+        VMSTATE_UINT16(num, VirtIONetInflightQueue),
+
+        VMSTATE_STRUCT_VARRAY_ALLOC_UINT16(elems, VirtIONetInflightQueue, num,
+                                           0, vmstate_virtqueue_element_old,
+                                           VirtQueueElementOld),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static int virtio_net_inflight_init(void *opaque)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+
+    QTAILQ_INIT(&tmp->queues_inflight);
+    return 0;
+}
+
+static int virtio_net_inflight_pre_save(void *opaque)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+    VirtIONet *net = tmp->parent;
+    uint16_t curr_queue_pairs = net->multiqueue ? net->curr_queue_pairs : 1;
+    VirtIONetInflightQueue *qi = g_new0(VirtIONetInflightQueue,
+                                        curr_queue_pairs * 2);
+
+    virtio_net_inflight_init(opaque);
+    for (uint16_t i = 0; i < curr_queue_pairs * 2; ++i) {
+        VirtIONetQueue *q = &net->vqs[vq2q(i)];
+        size_t n = i % 2 ? q->tx_inflight_num : q->rx_inflight_num;
+        VirtQueueElement **inflight = i % 2 ? q->tx_inflight : q->rx_inflight;
+
+        if (n == 0) {
+            continue;
+        }
+
+        qi[i].idx = i;
+        qi[i].num = n;
+        qi[i].elems = g_new0(VirtQueueElementOld, n);
+        for (uint16_t j = 0; j < n; ++j) {
+            qemu_put_virtqueue_element_old(inflight[j], &qi[i].elems[j]);
+        }
+        QTAILQ_INSERT_TAIL(&tmp->queues_inflight, &qi[i], entry);
+    }
+
+    return 0;
+}
+
+static int virtio_net_inflight_post_save(void *opaque)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+    VirtIONetInflightQueue *qi;
+
+    while ((qi = QTAILQ_FIRST(&tmp->queues_inflight))) {
+        QTAILQ_REMOVE(&tmp->queues_inflight, qi, entry);
+        g_free(qi->elems);
+        g_free(qi);
+    }
+
+    return 0;
+}
+
+static int virtio_net_inflight_post_load(void *opaque, int version_id)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+    VirtIONet *net = tmp->parent;
+    uint16_t curr_queue_pairs = net->multiqueue ? net->curr_queue_pairs : 1;
+    VirtIONetInflightQueue *qi;
+
+    while ((qi = QTAILQ_FIRST(&tmp->queues_inflight))) {
+        VirtIONetQueue *q = &net->vqs[vq2q(qi->idx)];
+        uint32_t *n = qi->idx % 2 ? &q->tx_inflight_num : &q->rx_inflight_num;
+        VirtQueueElement ***inflight = qi->idx % 2 ?
+                                       &q->tx_inflight : &q->rx_inflight;
+        if (unlikely(qi->num == 0)) {
+            /* TODO: error message */
+            return -1;
+        }
+
+        if (unlikely(qi->idx > curr_queue_pairs * 2)) {
+            /* TODO: error message */
+            return -1;
+        }
+
+        *n = qi->num;
+        *inflight = g_new(VirtQueueElement *, *n);
+        for (uint16_t j = 0; j < *n; ++j) {
+            (*inflight)[j] = qemu_get_virtqueue_element_from_old(
+                &net->parent_obj, &qi->elems[j],
+                sizeof(VirtQueueElement));
+        }
+
+        QTAILQ_REMOVE(&tmp->queues_inflight, qi, entry);
+        g_free(qi->elems);
+        g_free(qi);
+    }
+
+    return 0;
+}
+
+/* TODO: Allocate a temporal per queue / queue element, not all of them! */
+static const VMStateDescription vmstate_virtio_net_inflight = {
+    .name = "virtio-net-device/inflight",
+    .pre_save = virtio_net_inflight_pre_save,
+    .post_save = virtio_net_inflight_post_save,
+    .pre_load = virtio_net_inflight_init,
+    .post_load = virtio_net_inflight_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_QTAILQ_V(queues_inflight, struct VirtIONetMigTmp, 0,
+                         vmstate_virtio_net_inflight_queue,
+                         VirtIONetInflightQueue, entry),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_virtio_net_device = {
     .name = "virtio-net-device",
     .version_id = VIRTIO_NET_VM_VERSION,
@@ -3279,6 +3405,9 @@ static const VMStateDescription vmstate_virtio_net_device = {
                          vmstate_virtio_net_tx_waiting),
         VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
                             has_ctrl_guest_offloads),
+        /* TODO: Move to subsection */
+        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
+                         vmstate_virtio_net_inflight),
         VMSTATE_END_OF_LIST()
     },
     .subsections = (const VMStateDescription * []) {
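As background on how the VMSTATE_WITH_TMP entry above is driven, this is
a rough sketch of the save-side mechanism (simplified from QEMU's
migration code; the real put_tmp() differs in details). It is also why
pre_save must gather the in-flight elements into the temporary struct
and post_save must free them again: the tmp struct only exists for the
duration of the save.

    /* Sketch of the WITH_TMP save path (simplified; not the exact code). */
    static int put_tmp_sketch(QEMUFile *f, void *pv, size_t size,
                              const VMStateField *field, JSONWriter *vmdesc)
    {
        void *tmp = g_malloc(size);

        /* The parent pointer sits at the start of the tmp struct, so the
         * callbacks can reach the real VirtIONet as tmp->parent. */
        *(void **)tmp = pv;

        /* vmstate_save_state() runs .pre_save (which fills queues_inflight),
         * serializes .fields, then .post_save (which frees it again). */
        int ret = vmstate_save_state(f, field->vmsd, tmp, vmdesc);

        g_free(tmp);
        return ret;
    }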
There is currently no data to be migrated, since nothing populates or
reads the fields on virtio-net.

The migration of in-flight descriptors is modelled after the migration
of requests in virtio-blk, with some differences:
* virtio-blk migrates the queue number on each request. Here we only add
  a vq if it has descriptors to migrate, and then we send all of its
  descriptors in an array.
* Use of QTAILQ, since it signals the end of the inflight descriptors in
  a similar way: 1 for more data, 0 at the end. But it does so per vq
  instead of per descriptor.
* Usage of VMState macros.

The fields of the descriptors would be way more complicated if we used
VirtQueueElement directly, since there would be a few levels of
indirection. Using VirtQueueElementOld for the moment, and migrating to
VirtQueueElement in the final patch.

TODO: Proper migration versioning
TODO: Do not embed vhost-vdpa structs
TODO: Migrate the VirtQueueElement, not VirtQueueElementOld.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 include/hw/virtio/virtio-net.h |   2 +
 include/migration/vmstate.h    |  11 +++
 hw/net/virtio-net.c            | 129 +++++++++++++++++++++++++++++++++
 3 files changed, 142 insertions(+)
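For reference, the stream layout this should produce (inferred by the
editor from how VMSTATE_QTAILQ_V serializes tail queues, a marker byte
before each element and a zero byte at the end, matching the commit
message's "1 for more data, 0 at the end"; a sketch only, not
normative):

    [0x01][idx: u16][num: u16][num x VirtQueueElementOld]  <- first vq with in-flight elements
    [0x01][idx: u16][num: u16][num x VirtQueueElementOld]  <- next such vq, if any
    [0x00]                                                 <- end of the queue list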