| Message ID   | 20240509114615.317450-1-jiri@resnulli.us (mailing list archive) |
|--------------|-----------------------------------------------------------------|
| State        | Changes Requested                                               |
| Delegated to | Netdev Maintainers                                              |
| Series       | [net-next] virtio_net: add support for Byte Queue Limits        |
On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@nvidia.com>
>
> Add support for Byte Queue Limits (BQL).
>
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>

Can we get more detail on the benefits you observe etc?
Thanks!

> ---
>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>  1 file changed, 20 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 218a446c4c27..c53d6dc6d332 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>
>  struct virtnet_sq_free_stats {
>  	u64 packets;
> +	u64 xdp_packets;
>  	u64 bytes;
> +	u64 xdp_bytes;
>  };
>
>  struct virtnet_sq_stats {
> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>  	void *ptr;
>
>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> -		++stats->packets;
> -
>  		if (!is_xdp_frame(ptr)) {
>  			struct sk_buff *skb = ptr;
>
>  			pr_debug("Sent skb %p\n", skb);
>
> +			stats->packets++;
>  			stats->bytes += skb->len;
>  			napi_consume_skb(skb, in_napi);
>  		} else {
>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>
> -			stats->bytes += xdp_get_frame_len(frame);
> +			stats->xdp_packets++;
> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>  			xdp_return_frame(frame);
>  		}
>  	}
> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>  	virtnet_rq_free_buf(vi, rq, buf);
>  }
>
> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> +			  bool in_napi)
>  {
>  	struct virtnet_sq_free_stats stats = {0};
>
> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>  	/* Avoid overhead when no packets have been processed
>  	 * happens when called speculatively from start_xmit.
>  	 */
> -	if (!stats.packets)
> +	if (!stats.packets && !stats.xdp_packets)
>  		return;
>
> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
> +
>  	u64_stats_update_begin(&sq->stats.syncp);
>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>  	u64_stats_add(&sq->stats.packets, stats.packets);
> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>  	 * early means 16 slots are typically wasted.
>  	 */
>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
> -		netif_stop_subqueue(dev, qnum);
> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> +
> +		netif_tx_stop_queue(txq);
>  		if (use_napi) {
>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>  			/* More just got used, free them then recheck. */
> -			free_old_xmit(sq, false);
> +			free_old_xmit(sq, txq, false);
>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>  				netif_start_subqueue(dev, qnum);
>  				virtqueue_disable_cb(sq->vq);
> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>
>  	do {
>  		virtqueue_disable_cb(sq->vq);
> -		free_old_xmit(sq, true);
> +		free_old_xmit(sq, txq, true);
>  	} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>
>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>  	txq = netdev_get_tx_queue(vi->dev, index);
>  	__netif_tx_lock(txq, raw_smp_processor_id());
>  	virtqueue_disable_cb(sq->vq);
> -	free_old_xmit(sq, true);
> +	free_old_xmit(sq, txq, true);
>
>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>  		netif_tx_wake_queue(txq);
> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	struct send_queue *sq = &vi->sq[qnum];
>  	int err;
>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> -	bool kick = !netdev_xmit_more();
> +	bool xmit_more = netdev_xmit_more();
>  	bool use_napi = sq->napi.weight;
>
>  	/* Free up any pending old buffers before queueing new ones. */
> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  		if (use_napi)
>  			virtqueue_disable_cb(sq->vq);
>
> -		free_old_xmit(sq, false);
> +		free_old_xmit(sq, txq, false);
>
> -	} while (use_napi && kick &&
> +	} while (use_napi && !xmit_more &&
>  		 unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>
>  	/* timestamp packet in software */
> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>
>  	check_sq_full_and_disable(vi, dev, sq);
>
> -	if (kick || netif_xmit_stopped(txq)) {
> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>  			u64_stats_update_begin(&sq->stats.syncp);
>  			u64_stats_inc(&sq->stats.kicks);
> --
> 2.44.0
Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@nvidia.com>
>>
>> Add support for Byte Queue Limits (BQL).
>>
>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>
>Can we get more detail on the benefits you observe etc?
>Thanks!

More info about the BQL in general is here:
https://lwn.net/Articles/469652/
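For readers coming from the LWN article, the driver-side contract BQL imposes is small and lives in <linux/netdevice.h>: report bytes when they are queued to the device, report completed bytes when the device consumes buffers, and reset the accounting when it can no longer be trusted. Below is a minimal sketch of that generic pattern; it is illustrative only, and the foo_* names and queue-index plumbing are placeholders, not anything from virtio_net or from this patch.

#include <linux/netdevice.h>

/* Illustrative sketch of the generic BQL driver pattern; foo_* names
 * are placeholders, not virtio_net code.
 */
static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
	unsigned int len = skb->len;	/* cache before handing the skb to HW */
	bool kick;

	/* ... post the skb to the device ring here ... */

	/* Account the queued bytes. Returns true when the doorbell must be
	 * rung now (no further skbs pending via xmit_more, or the queue
	 * just got stopped).
	 */
	kick = __netdev_tx_sent_queue(txq, len, netdev_xmit_more());
	if (kick) {
		/* ... ring the device doorbell here ... */
	}

	return NETDEV_TX_OK;
}

/* Called from the tx completion path (NAPI poll or IRQ). */
static void foo_tx_complete(struct net_device *dev, unsigned int qidx,
			    unsigned int pkts, unsigned int bytes)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qidx);

	/* Lets BQL shrink/grow its byte limit and wake the queue if it was
	 * stopped because too many bytes were in flight.
	 */
	netdev_tx_completed_queue(txq, pkts, bytes);
}

/* Called when a queue is torn down or its accounting becomes stale. */
static void foo_tx_reset(struct net_device *dev, unsigned int qidx)
{
	netdev_tx_reset_queue(netdev_get_tx_queue(dev, qidx));
}

This mirrors what the patch does: netdev_tx_completed_queue() is called from free_old_xmit(), and __netdev_tx_sent_queue() replaces the explicit "kick || netif_xmit_stopped(txq)" test in start_xmit().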
On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >Can we get more detail on the benefits you observe etc?
> >Thanks!
>
> More info about the BQL in general is here:
> https://lwn.net/Articles/469652/

I know about BQL in general. We discussed BQL for virtio in the past;
mostly I got the feedback from net core maintainers that it likely won't
benefit virtio.

So I'm asking, what kind of benefit do you observe?
On Thu, May 9, 2024 at 7:46 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> From: Jiri Pirko <jiri@nvidia.com>
>
> Add support for Byte Queue Limits (BQL).
>
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> ---

Does this work for non tx NAPI mode (it seems not obvious to me)?

Thanks
On Thu, May 9, 2024 at 10:28 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> > More info about the BQL in general is here:
> > https://lwn.net/Articles/469652/
>
> I know about BQL in general. We discussed BQL for virtio in the past;
> mostly I got the feedback from net core maintainers that it likely won't
> benefit virtio.
>
> So I'm asking, what kind of benefit do you observe?

Yes, a benchmark is more than welcome.

Thanks
On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> From: Jiri Pirko <jiri@nvidia.com>
>
> Add support for Byte Queue Limits (BQL).

Historically both Jason and Michael have attempted to support BQL
for virtio-net, for example:

https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/

These discussions focus primarily on:

1. BQL is based on napi tx. Therefore, the transfer of statistical information
needs to rely on the judgment of use_napi. When the napi mode is switched to
orphan, some statistical information will be lost, resulting in temporary
inaccuracy in BQL.

2. If tx dim is supported, orphan mode may be removed and tx irq will be more
reasonable. This provides good support for BQL.

Thanks.

> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> ---
>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>  1 file changed, 20 insertions(+), 13 deletions(-)

[...]
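On point 1 above: the generic escape hatch drivers use when BQL's in-flight accounting can no longer be matched against completions (queue reset, a change of completion mode, and similar) is to drop the per-queue state with netdev_tx_reset_queue(). The sketch below only illustrates that core helper under a placeholder name; it is not taken from the posted patch.

#include <linux/netdevice.h>

/* Illustrative only: clear BQL/DQL state for one tx queue when bytes
 * reported via netdev_tx_sent_queue() will never be reported back via
 * netdev_tx_completed_queue(). Stale in-flight bytes would otherwise
 * keep the queue throttled or stopped.
 */
static void example_drop_stale_bql_state(struct net_device *dev, unsigned int qidx)
{
	netdev_tx_reset_queue(netdev_get_tx_queue(dev, qidx));
}

Whether virtio_net needs such a reset when it switches between napi and orphan completion, and where it would go, is exactly the open question raised in the discussion above.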
Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>> From: Jiri Pirko <jiri@nvidia.com>
>>
>> Add support for Byte Queue Limits (BQL).
>
>Historically both Jason and Michael have attempted to support BQL
>for virtio-net, for example:
>
>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>
>These discussions focus primarily on:
>
>1. BQL is based on napi tx. Therefore, the transfer of statistical information
>needs to rely on the judgment of use_napi. When the napi mode is switched to
>orphan, some statistical information will be lost, resulting in temporary
>inaccuracy in BQL.
>
>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
>reasonable. This provides good support for BQL.

Thanks for the pointers, will check that out.

>
>Thanks.

[...]
Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> More info about the BQL in general is here:
>> https://lwn.net/Articles/469652/
>
>I know about BQL in general. We discussed BQL for virtio in the past;
>mostly I got the feedback from net core maintainers that it likely won't
>benefit virtio.

Do you have some link to that, or is it this thread:
https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/

I don't see why virtio should be any different from other
drivers/devices that benefit from BQL. HOL blocking is the same here as
everywhere.

>
>So I'm asking, what kind of benefit do you observe?

I don't have measurements at hand, will attach them to v2.

Thanks!
On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >I know about BQL in general. We discussed BQL for virtio in the past;
> >mostly I got the feedback from net core maintainers that it likely won't
> >benefit virtio.
>
> Do you have some link to that, or is it this thread:
> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/

A quick search on lore turned up this, for example:
https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/

[...]
Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> Do you have some link to that, or is it this thread:
>> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>
>A quick search on lore turned up this, for example:
>https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/

Says:
"Note that NIC with many TX queues make BQL almost useless, only adding extra
overhead."

But virtio can have one tx queue, I guess that could be quite common
configuration in lot of deployments.

[...]
On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> >A quick search on lore turned up this, for example:
> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>
> Says:
> "Note that NIC with many TX queues make BQL almost useless, only adding extra
> overhead."
>
> But virtio can have one tx queue, I guess that could be quite common
> configuration in lot of deployments.

Not sure we should worry about performance for these though.
What I am saying is this should come with some benchmarking
results.

[...]
Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> But virtio can have one tx queue, I guess that could be quite common
>> configuration in lot of deployments.
>
>Not sure we should worry about performance for these though.

Well, queues may be a scarce resource sometimes; even in those cases,
you want to perform.

>What I am saying is this should come with some benchmarking
>results.

Sure, I got the message.

[...]
Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> But virtio can have one tx queue, I guess that could be quite common
>> configuration in lot of deployments.
>
>Not sure we should worry about performance for these though.
>What I am saying is this should come with some benchmarking
>results.

I did some measurements with VDPA, backed by ConnectX6dx NIC, single
queue pair:

super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
nice -n 20 netperf -H $ip -l 10 -t TCP_RR

RR result with no bql:
29.95
32.74
28.77

RR result with bql:
222.98
159.81
197.88

[...]
On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
> queue pair:
>
> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>
> RR result with no bql:
> 29.95
> 32.74
> 28.77
>
> RR result with bql:
> 222.98
> 159.81
> 197.88

Okay. And on the other hand, any measurable degradation with
multiqueue and when testing throughput?

[...]
Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>> RR result with no bql:
>> 29.95
>> 32.74
>> 28.77
>>
>> RR result with bql:
>> 222.98
>> 159.81
>> 197.88
>
>Okay. And on the other hand, any measurable degradation with
>multiqueue and when testing throughput?

With multiqueue it depends if the flows hit the same queue or not. If
they do, the same results will likely be shown.

[...]
Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
>Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>>Okay. And on the other hand, any measurable degradation with
>>multiqueue and when testing throughput?
>
>With multiqueue it depends if the flows hit the same queue or not. If
>they do, the same results will likely be shown.

RR 1q, w/o bql:
29.95
32.74
28.77

RR 1q, with bql:
222.98
159.81
197.88

RR 4q, w/o bql:
355.82
364.58
233.47

RR 4q, with bql:
371.19
255.93
337.77

So the answer to your question is: "no measurable degradation with 4
queues".

[...]
On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> RR 4q, w/o bql:
> 355.82
> 364.58
> 233.47
>
> RR 4q, with bql:
> 371.19
> 255.93
> 337.77
>
> So the answer to your question is: "no measurable degradation with 4
> queues".

Thanks but I think we also need benchmarks in cases other than vDPA.
For example, a simple virtualization setup.
Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote:
>On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
>> So the answer to your question is: "no measurable degradation with 4
>> queues".
>
>Thanks but I think we also need benchmarks in cases other than vDPA.
>For example, a simple virtualization setup.

For virtualization setup, I get this:

VIRT RR 1q, w/o bql:
49.18
49.75
50.07

VIRT RR 1q, with bql:
51.33
47.88
40.40

No measurable/significant difference.
On Thu, May 16, 2024 at 12:54:58PM +0200, Jiri Pirko wrote:
> Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote:
> >Thanks but I think we also need benchmarks in cases other than vDPA.
> >For example, a simple virtualization setup.
>
> For virtualization setup, I get this:
>
> VIRT RR 1q, w/o bql:
> 49.18
> 49.75
> 50.07
>
> VIRT RR 1q, with bql:
> 51.33
> 47.88
> 40.40
>
> No measurable/significant difference.

Seems the results became much noisier? Also
I'd expect a regression if any to be in a streaming benchmark.
Thu, May 16, 2024 at 02:31:59PM CEST, mst@redhat.com wrote:
>On Thu, May 16, 2024 at 12:54:58PM +0200, Jiri Pirko wrote:
>> For virtualization setup, I get this:
>>
>> VIRT RR 1q, w/o bql:
>> 49.18
>> 49.75
>> 50.07
>>
>> VIRT RR 1q, with bql:
>> 51.33
>> 47.88
>> 40.40
>>
>> No measurable/significant difference.
>
>Seems the results became much noisier? Also

Not enough data to assume that, I believe.

>I'd expect a regression if any to be in a streaming benchmark.

Can you elaborate?
On Thu, May 16, 2024 at 05:25:20PM +0200, Jiri Pirko wrote:
> >I'd expect a regression if any to be in a streaming benchmark.
>
> Can you elaborate?

BQL does two things that can hurt throughput:
- limits amount of data in the queue - can limit bandwidth
  if we now get queue underruns
- adds CPU overhead - can limit bandwidth if CPU bound

So checking result of a streaming benchmark seems important.
Thu, May 16, 2024 at 09:04:41PM CEST, mst@redhat.com wrote:
>On Thu, May 16, 2024 at 05:25:20PM +0200, Jiri Pirko wrote:
>> >I'd expect a regression if any to be in a streaming benchmark.
>>
>> Can you elaborate?
>
>BQL does two things that can hurt throughput:
>- limits amount of data in the queue - can limit bandwidth
>  if we now get queue underruns
>- adds CPU overhead - can limit bandwidth if CPU bound
>
>So checking result of a streaming benchmark seems important.

I understand. Didn't see any degradation in TCP_STREAM netperf. But I
will try to extend the testing. Thanks!
Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: >On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: >> From: Jiri Pirko <jiri@nvidia.com> >> >> Add support for Byte Queue Limits (BQL). > >Historically both Jason and Michael have attempted to support BQL >for virtio-net, for example: > >https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > >These discussions focus primarily on: > >1. BQL is based on napi tx. Therefore, the transfer of statistical information >needs to rely on the judgment of use_napi. When the napi mode is switched to >orphan, some statistical information will be lost, resulting in temporary >inaccuracy in BQL. > >2. If tx dim is supported, orphan mode may be removed and tx irq will be more >reasonable. This provides good support for BQL. But when the device does not support dim, the orphan mode is still needed, isn't it? > >Thanks. > >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> --- >> drivers/net/virtio_net.c | 33 ++++++++++++++++++++------------- >> 1 file changed, 20 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >> index 218a446c4c27..c53d6dc6d332 100644 >> --- a/drivers/net/virtio_net.c >> +++ b/drivers/net/virtio_net.c >> @@ -84,7 +84,9 @@ struct virtnet_stat_desc { >> >> struct virtnet_sq_free_stats { >> u64 packets; >> + u64 xdp_packets; >> u64 bytes; >> + u64 xdp_bytes; >> }; >> >> struct virtnet_sq_stats { >> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, >> void *ptr; >> >> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { >> - ++stats->packets; >> - >> if (!is_xdp_frame(ptr)) { >> struct sk_buff *skb = ptr; >> >> pr_debug("Sent skb %p\n", skb); >> >> + stats->packets++; >> stats->bytes += skb->len; >> napi_consume_skb(skb, in_napi); >> } else { >> struct xdp_frame *frame = ptr_to_xdp(ptr); >> >> - stats->bytes += xdp_get_frame_len(frame); >> + stats->xdp_packets++; >> + stats->xdp_bytes += xdp_get_frame_len(frame); >> xdp_return_frame(frame); >> } >> } >> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) >> virtnet_rq_free_buf(vi, rq, buf); >> } >> >> -static void free_old_xmit(struct send_queue *sq, bool in_napi) >> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, >> + bool in_napi) >> { >> struct virtnet_sq_free_stats stats = {0}; >> >> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi) >> /* Avoid overhead when no packets have been processed >> * happens when called speculatively from start_xmit. >> */ >> - if (!stats.packets) >> + if (!stats.packets && !stats.xdp_packets) >> return; >> >> + netdev_tx_completed_queue(txq, stats.packets, stats.bytes); >> + >> u64_stats_update_begin(&sq->stats.syncp); >> u64_stats_add(&sq->stats.bytes, stats.bytes); >> u64_stats_add(&sq->stats.packets, stats.packets); >> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, >> * early means 16 slots are typically wasted. >> */ >> if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { >> - netif_stop_subqueue(dev, qnum); >> + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); >> + >> + netif_tx_stop_queue(txq); >> if (use_napi) { >> if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) >> virtqueue_napi_schedule(&sq->napi, sq->vq); >> } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { >> /* More just got used, free them then recheck. 
*/ >> - free_old_xmit(sq, false); >> + free_old_xmit(sq, txq, false); >> if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { >> netif_start_subqueue(dev, qnum); >> virtqueue_disable_cb(sq->vq); >> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) >> >> do { >> virtqueue_disable_cb(sq->vq); >> - free_old_xmit(sq, true); >> + free_old_xmit(sq, txq, true); >> } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); >> >> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) >> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) >> txq = netdev_get_tx_queue(vi->dev, index); >> __netif_tx_lock(txq, raw_smp_processor_id()); >> virtqueue_disable_cb(sq->vq); >> - free_old_xmit(sq, true); >> + free_old_xmit(sq, txq, true); >> >> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) >> netif_tx_wake_queue(txq); >> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >> struct send_queue *sq = &vi->sq[qnum]; >> int err; >> struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); >> - bool kick = !netdev_xmit_more(); >> + bool xmit_more = netdev_xmit_more(); >> bool use_napi = sq->napi.weight; >> >> /* Free up any pending old buffers before queueing new ones. */ >> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >> if (use_napi) >> virtqueue_disable_cb(sq->vq); >> >> - free_old_xmit(sq, false); >> + free_old_xmit(sq, txq, false); >> >> - } while (use_napi && kick && >> + } while (use_napi && !xmit_more && >> unlikely(!virtqueue_enable_cb_delayed(sq->vq))); >> >> /* timestamp packet in software */ >> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >> >> check_sq_full_and_disable(vi, dev, sq); >> >> - if (kick || netif_xmit_stopped(txq)) { >> + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) { >> if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { >> u64_stats_update_begin(&sq->stats.syncp); >> u64_stats_inc(&sq->stats.kicks); >> -- >> 2.44.0 >> >>
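A way to picture point 1 above: BQL's "completed" accounting naturally lives on the tx napi completion path, while orphan mode detaches skbs from their socket at xmit time and reclaims the ring opportunistically, so a runtime switch between the two modes can leave bytes charged via netdev_tx_sent_queue() that are never returned via netdev_tx_completed_queue(). One hypothetical mitigation, at the cost of briefly losing the learned limit, would be to reset the queue's BQL state whenever the tx mode flips (sketch only, helper name invented):

    /* Hypothetical helper, not part of the posted patch. */
    static void virtnet_tx_mode_switched(struct net_device *dev, unsigned int qnum)
    {
            struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);

            /* Forget in-flight byte accounting so stale counters cannot
             * keep the queue stopped after the reclaim path changes.
             */
            netdev_tx_reset_queue(txq);
    }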
Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: >>> From: Jiri Pirko <jiri@nvidia.com> >>> >>> Add support for Byte Queue Limits (BQL). >> >>Historically both Jason and Michael have attempted to support BQL >>for virtio-net, for example: >> >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> >>These discussions focus primarily on: >> >>1. BQL is based on napi tx. Therefore, the transfer of statistical information >>needs to rely on the judgment of use_napi. When the napi mode is switched to >>orphan, some statistical information will be lost, resulting in temporary >>inaccuracy in BQL. >> >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more >>reasonable. This provides good support for BQL. > >But when the device does not support dim, the orphan mode is still >needed, isn't it? Heng, is my assuption correct here? Thanks! > >> >>Thanks. >> >>> >>> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >>> --- >>> drivers/net/virtio_net.c | 33 ++++++++++++++++++++------------- >>> 1 file changed, 20 insertions(+), 13 deletions(-) >>> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >>> index 218a446c4c27..c53d6dc6d332 100644 >>> --- a/drivers/net/virtio_net.c >>> +++ b/drivers/net/virtio_net.c >>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc { >>> >>> struct virtnet_sq_free_stats { >>> u64 packets; >>> + u64 xdp_packets; >>> u64 bytes; >>> + u64 xdp_bytes; >>> }; >>> >>> struct virtnet_sq_stats { >>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, >>> void *ptr; >>> >>> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { >>> - ++stats->packets; >>> - >>> if (!is_xdp_frame(ptr)) { >>> struct sk_buff *skb = ptr; >>> >>> pr_debug("Sent skb %p\n", skb); >>> >>> + stats->packets++; >>> stats->bytes += skb->len; >>> napi_consume_skb(skb, in_napi); >>> } else { >>> struct xdp_frame *frame = ptr_to_xdp(ptr); >>> >>> - stats->bytes += xdp_get_frame_len(frame); >>> + stats->xdp_packets++; >>> + stats->xdp_bytes += xdp_get_frame_len(frame); >>> xdp_return_frame(frame); >>> } >>> } >>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) >>> virtnet_rq_free_buf(vi, rq, buf); >>> } >>> >>> -static void free_old_xmit(struct send_queue *sq, bool in_napi) >>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, >>> + bool in_napi) >>> { >>> struct virtnet_sq_free_stats stats = {0}; >>> >>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi) >>> /* Avoid overhead when no packets have been processed >>> * happens when called speculatively from start_xmit. >>> */ >>> - if (!stats.packets) >>> + if (!stats.packets && !stats.xdp_packets) >>> return; >>> >>> + netdev_tx_completed_queue(txq, stats.packets, stats.bytes); >>> + >>> u64_stats_update_begin(&sq->stats.syncp); >>> u64_stats_add(&sq->stats.bytes, stats.bytes); >>> u64_stats_add(&sq->stats.packets, stats.packets); >>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, >>> * early means 16 slots are typically wasted. 
>>> */ >>> if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { >>> - netif_stop_subqueue(dev, qnum); >>> + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); >>> + >>> + netif_tx_stop_queue(txq); >>> if (use_napi) { >>> if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) >>> virtqueue_napi_schedule(&sq->napi, sq->vq); >>> } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { >>> /* More just got used, free them then recheck. */ >>> - free_old_xmit(sq, false); >>> + free_old_xmit(sq, txq, false); >>> if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { >>> netif_start_subqueue(dev, qnum); >>> virtqueue_disable_cb(sq->vq); >>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) >>> >>> do { >>> virtqueue_disable_cb(sq->vq); >>> - free_old_xmit(sq, true); >>> + free_old_xmit(sq, txq, true); >>> } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); >>> >>> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) >>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) >>> txq = netdev_get_tx_queue(vi->dev, index); >>> __netif_tx_lock(txq, raw_smp_processor_id()); >>> virtqueue_disable_cb(sq->vq); >>> - free_old_xmit(sq, true); >>> + free_old_xmit(sq, txq, true); >>> >>> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) >>> netif_tx_wake_queue(txq); >>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >>> struct send_queue *sq = &vi->sq[qnum]; >>> int err; >>> struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); >>> - bool kick = !netdev_xmit_more(); >>> + bool xmit_more = netdev_xmit_more(); >>> bool use_napi = sq->napi.weight; >>> >>> /* Free up any pending old buffers before queueing new ones. */ >>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >>> if (use_napi) >>> virtqueue_disable_cb(sq->vq); >>> >>> - free_old_xmit(sq, false); >>> + free_old_xmit(sq, txq, false); >>> >>> - } while (use_napi && kick && >>> + } while (use_napi && !xmit_more && >>> unlikely(!virtqueue_enable_cb_delayed(sq->vq))); >>> >>> /* timestamp packet in software */ >>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >>> >>> check_sq_full_and_disable(vi, dev, sq); >>> >>> - if (kick || netif_xmit_stopped(txq)) { >>> + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) { >>> if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { >>> u64_stats_update_begin(&sq->stats.syncp); >>> u64_stats_inc(&sq->stats.kicks); >>> -- >>> 2.44.0 >>> >>>
On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > >>> From: Jiri Pirko <jiri@nvidia.com> > >>> > >>> Add support for Byte Queue Limits (BQL). > >> > >>Historically both Jason and Michael have attempted to support BQL > >>for virtio-net, for example: > >> > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > >> > >>These discussions focus primarily on: > >> > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information > >>needs to rely on the judgment of use_napi. When the napi mode is switched to > >>orphan, some statistical information will be lost, resulting in temporary > >>inaccuracy in BQL. > >> > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more > >>reasonable. This provides good support for BQL. > > > >But when the device does not support dim, the orphan mode is still > >needed, isn't it? > > Heng, is my assuption correct here? Thanks! > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in most scenarios. Although orphan mode performs better in specific benckmark, perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to support dim for devices that want the best performance. Back to this patch set, I think BQL as a point that affects performance should deserve more comprehensive test data. Thanks. > > > >> > >>Thanks. > >> > >>> > >>> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > >>> --- > >>> drivers/net/virtio_net.c | 33 ++++++++++++++++++++------------- > >>> 1 file changed, 20 insertions(+), 13 deletions(-) > >>> > >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > >>> index 218a446c4c27..c53d6dc6d332 100644 > >>> --- a/drivers/net/virtio_net.c > >>> +++ b/drivers/net/virtio_net.c > >>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc { > >>> > >>> struct virtnet_sq_free_stats { > >>> u64 packets; > >>> + u64 xdp_packets; > >>> u64 bytes; > >>> + u64 xdp_bytes; > >>> }; > >>> > >>> struct virtnet_sq_stats { > >>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, > >>> void *ptr; > >>> > >>> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > >>> - ++stats->packets; > >>> - > >>> if (!is_xdp_frame(ptr)) { > >>> struct sk_buff *skb = ptr; > >>> > >>> pr_debug("Sent skb %p\n", skb); > >>> > >>> + stats->packets++; > >>> stats->bytes += skb->len; > >>> napi_consume_skb(skb, in_napi); > >>> } else { > >>> struct xdp_frame *frame = ptr_to_xdp(ptr); > >>> > >>> - stats->bytes += xdp_get_frame_len(frame); > >>> + stats->xdp_packets++; > >>> + stats->xdp_bytes += xdp_get_frame_len(frame); > >>> xdp_return_frame(frame); > >>> } > >>> } > >>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) > >>> virtnet_rq_free_buf(vi, rq, buf); > >>> } > >>> > >>> -static void free_old_xmit(struct send_queue *sq, bool in_napi) > >>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, > >>> + bool in_napi) > >>> { > >>> struct virtnet_sq_free_stats stats = {0}; > >>> > >>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi) > >>> /* Avoid overhead when no packets have been processed > >>> * happens when called speculatively from start_xmit. 
> >>> */ > >>> - if (!stats.packets) > >>> + if (!stats.packets && !stats.xdp_packets) > >>> return; > >>> > >>> + netdev_tx_completed_queue(txq, stats.packets, stats.bytes); > >>> + > >>> u64_stats_update_begin(&sq->stats.syncp); > >>> u64_stats_add(&sq->stats.bytes, stats.bytes); > >>> u64_stats_add(&sq->stats.packets, stats.packets); > >>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, > >>> * early means 16 slots are typically wasted. > >>> */ > >>> if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { > >>> - netif_stop_subqueue(dev, qnum); > >>> + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > >>> + > >>> + netif_tx_stop_queue(txq); > >>> if (use_napi) { > >>> if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) > >>> virtqueue_napi_schedule(&sq->napi, sq->vq); > >>> } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { > >>> /* More just got used, free them then recheck. */ > >>> - free_old_xmit(sq, false); > >>> + free_old_xmit(sq, txq, false); > >>> if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { > >>> netif_start_subqueue(dev, qnum); > >>> virtqueue_disable_cb(sq->vq); > >>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) > >>> > >>> do { > >>> virtqueue_disable_cb(sq->vq); > >>> - free_old_xmit(sq, true); > >>> + free_old_xmit(sq, txq, true); > >>> } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > >>> > >>> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > >>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) > >>> txq = netdev_get_tx_queue(vi->dev, index); > >>> __netif_tx_lock(txq, raw_smp_processor_id()); > >>> virtqueue_disable_cb(sq->vq); > >>> - free_old_xmit(sq, true); > >>> + free_old_xmit(sq, txq, true); > >>> > >>> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > >>> netif_tx_wake_queue(txq); > >>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > >>> struct send_queue *sq = &vi->sq[qnum]; > >>> int err; > >>> struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > >>> - bool kick = !netdev_xmit_more(); > >>> + bool xmit_more = netdev_xmit_more(); > >>> bool use_napi = sq->napi.weight; > >>> > >>> /* Free up any pending old buffers before queueing new ones. */ > >>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > >>> if (use_napi) > >>> virtqueue_disable_cb(sq->vq); > >>> > >>> - free_old_xmit(sq, false); > >>> + free_old_xmit(sq, txq, false); > >>> > >>> - } while (use_napi && kick && > >>> + } while (use_napi && !xmit_more && > >>> unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > >>> > >>> /* timestamp packet in software */ > >>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > >>> > >>> check_sq_full_and_disable(vi, dev, sq); > >>> > >>> - if (kick || netif_xmit_stopped(txq)) { > >>> + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) { > >>> if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { > >>> u64_stats_update_begin(&sq->stats.syncp); > >>> u64_stats_inc(&sq->stats.kicks); > >>> -- > >>> 2.44.0 > >>> > >>>
On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > >>> From: Jiri Pirko <jiri@nvidia.com> > > >>> > > >>> Add support for Byte Queue Limits (BQL). > > >> > > >>Historically both Jason and Michael have attempted to support BQL > > >>for virtio-net, for example: > > >> > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > >> > > >>These discussions focus primarily on: > > >> > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to > > >>orphan, some statistical information will be lost, resulting in temporary > > >>inaccuracy in BQL. > > >> > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more > > >>reasonable. This provides good support for BQL. > > > > > >But when the device does not support dim, the orphan mode is still > > >needed, isn't it? > > > > Heng, is my assuption correct here? Thanks! > > > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in > most scenarios. Although orphan mode performs better in specific benckmark, For example pktgen (I meant even if the orphan mode can break pktgen, it can finish when there's a new packet that needs to be sent after pktgen is completed). > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to > support dim for devices that want the best performance. Ideally, if we can drop orphan mode, everything would be simplified. > > Back to this patch set, I think BQL as a point that affects performance should > deserve more comprehensive test data. Thanks > > Thanks. > > > > > > >> > > >>Thanks. 
> > >> > > >>> > > >>> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > > >>> --- > > >>> drivers/net/virtio_net.c | 33 ++++++++++++++++++++------------- > > >>> 1 file changed, 20 insertions(+), 13 deletions(-) > > >>> > > >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > >>> index 218a446c4c27..c53d6dc6d332 100644 > > >>> --- a/drivers/net/virtio_net.c > > >>> +++ b/drivers/net/virtio_net.c > > >>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc { > > >>> > > >>> struct virtnet_sq_free_stats { > > >>> u64 packets; > > >>> + u64 xdp_packets; > > >>> u64 bytes; > > >>> + u64 xdp_bytes; > > >>> }; > > >>> > > >>> struct virtnet_sq_stats { > > >>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, > > >>> void *ptr; > > >>> > > >>> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > > >>> - ++stats->packets; > > >>> - > > >>> if (!is_xdp_frame(ptr)) { > > >>> struct sk_buff *skb = ptr; > > >>> > > >>> pr_debug("Sent skb %p\n", skb); > > >>> > > >>> + stats->packets++; > > >>> stats->bytes += skb->len; > > >>> napi_consume_skb(skb, in_napi); > > >>> } else { > > >>> struct xdp_frame *frame = ptr_to_xdp(ptr); > > >>> > > >>> - stats->bytes += xdp_get_frame_len(frame); > > >>> + stats->xdp_packets++; > > >>> + stats->xdp_bytes += xdp_get_frame_len(frame); > > >>> xdp_return_frame(frame); > > >>> } > > >>> } > > >>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) > > >>> virtnet_rq_free_buf(vi, rq, buf); > > >>> } > > >>> > > >>> -static void free_old_xmit(struct send_queue *sq, bool in_napi) > > >>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, > > >>> + bool in_napi) > > >>> { > > >>> struct virtnet_sq_free_stats stats = {0}; > > >>> > > >>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi) > > >>> /* Avoid overhead when no packets have been processed > > >>> * happens when called speculatively from start_xmit. > > >>> */ > > >>> - if (!stats.packets) > > >>> + if (!stats.packets && !stats.xdp_packets) > > >>> return; > > >>> > > >>> + netdev_tx_completed_queue(txq, stats.packets, stats.bytes); > > >>> + > > >>> u64_stats_update_begin(&sq->stats.syncp); > > >>> u64_stats_add(&sq->stats.bytes, stats.bytes); > > >>> u64_stats_add(&sq->stats.packets, stats.packets); > > >>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, > > >>> * early means 16 slots are typically wasted. > > >>> */ > > >>> if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { > > >>> - netif_stop_subqueue(dev, qnum); > > >>> + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > > >>> + > > >>> + netif_tx_stop_queue(txq); > > >>> if (use_napi) { > > >>> if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) > > >>> virtqueue_napi_schedule(&sq->napi, sq->vq); > > >>> } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { > > >>> /* More just got used, free them then recheck. 
*/ > > >>> - free_old_xmit(sq, false); > > >>> + free_old_xmit(sq, txq, false); > > >>> if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { > > >>> netif_start_subqueue(dev, qnum); > > >>> virtqueue_disable_cb(sq->vq); > > >>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) > > >>> > > >>> do { > > >>> virtqueue_disable_cb(sq->vq); > > >>> - free_old_xmit(sq, true); > > >>> + free_old_xmit(sq, txq, true); > > >>> } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > > >>> > > >>> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > > >>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) > > >>> txq = netdev_get_tx_queue(vi->dev, index); > > >>> __netif_tx_lock(txq, raw_smp_processor_id()); > > >>> virtqueue_disable_cb(sq->vq); > > >>> - free_old_xmit(sq, true); > > >>> + free_old_xmit(sq, txq, true); > > >>> > > >>> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > > >>> netif_tx_wake_queue(txq); > > >>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > > >>> struct send_queue *sq = &vi->sq[qnum]; > > >>> int err; > > >>> struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > > >>> - bool kick = !netdev_xmit_more(); > > >>> + bool xmit_more = netdev_xmit_more(); > > >>> bool use_napi = sq->napi.weight; > > >>> > > >>> /* Free up any pending old buffers before queueing new ones. */ > > >>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > > >>> if (use_napi) > > >>> virtqueue_disable_cb(sq->vq); > > >>> > > >>> - free_old_xmit(sq, false); > > >>> + free_old_xmit(sq, txq, false); > > >>> > > >>> - } while (use_napi && kick && > > >>> + } while (use_napi && !xmit_more && > > >>> unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > > >>> > > >>> /* timestamp packet in software */ > > >>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > > >>> > > >>> check_sq_full_and_disable(vi, dev, sq); > > >>> > > >>> - if (kick || netif_xmit_stopped(txq)) { > > >>> + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) { > > >>> if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { > > >>> u64_stats_update_begin(&sq->stats.syncp); > > >>> u64_stats_inc(&sq->stats.kicks); > > >>> -- > > >>> 2.44.0 > > >>> > > >>> >
Hello Jason, On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote: > > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: > > > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: > > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > > >>> From: Jiri Pirko <jiri@nvidia.com> > > > >>> > > > >>> Add support for Byte Queue Limits (BQL). > > > >> > > > >>Historically both Jason and Michael have attempted to support BQL > > > >>for virtio-net, for example: > > > >> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > > >> > > > >>These discussions focus primarily on: > > > >> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to > > > >>orphan, some statistical information will be lost, resulting in temporary > > > >>inaccuracy in BQL. > > > >> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more > > > >>reasonable. This provides good support for BQL. > > > > > > > >But when the device does not support dim, the orphan mode is still > > > >needed, isn't it? > > > > > > Heng, is my assuption correct here? Thanks! > > > > > > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in > > most scenarios. Although orphan mode performs better in specific benckmark, > > For example pktgen (I meant even if the orphan mode can break pktgen, > it can finish when there's a new packet that needs to be sent after > pktgen is completed). > > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to > > support dim for devices that want the best performance. > > Ideally, if we can drop orphan mode, everything would be simplified. Please please don't do this. Orphan mode still has its merits. In some cases which can hardly be reproduced in production, we still choose to turn off the napi_tx mode because the delay of freeing a skb could cause lower performance in the tx path, which is, I know, surely designed on purpose. If the codes of orphan mode don't have an impact when you enable napi_tx mode, please keep it if you can. Thank you.
On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > Hello Jason, > > On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote: > > > > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: > > > > > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: > > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: > > > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > > > >>> From: Jiri Pirko <jiri@nvidia.com> > > > > >>> > > > > >>> Add support for Byte Queue Limits (BQL). > > > > >> > > > > >>Historically both Jason and Michael have attempted to support BQL > > > > >>for virtio-net, for example: > > > > >> > > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > > > >> > > > > >>These discussions focus primarily on: > > > > >> > > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information > > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to > > > > >>orphan, some statistical information will be lost, resulting in temporary > > > > >>inaccuracy in BQL. > > > > >> > > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more > > > > >>reasonable. This provides good support for BQL. > > > > > > > > > >But when the device does not support dim, the orphan mode is still > > > > >needed, isn't it? > > > > > > > > Heng, is my assuption correct here? Thanks! > > > > > > > > > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in > > > most scenarios. Although orphan mode performs better in specific benckmark, > > > > For example pktgen (I meant even if the orphan mode can break pktgen, > > it can finish when there's a new packet that needs to be sent after > > pktgen is completed). > > > > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to > > > support dim for devices that want the best performance. > > > > Ideally, if we can drop orphan mode, everything would be simplified. > > Please please don't do this. Orphan mode still has its merits. In some > cases which can hardly be reproduced in production, we still choose to > turn off the napi_tx mode because the delay of freeing a skb could > cause lower performance in the tx path, Well, it's probably just a side effect and it depends on how to define performance here. > which is, I know, surely > designed on purpose. I don't think so and no modern NIC uses that. It breaks a lot of things. > > If the codes of orphan mode don't have an impact when you enable > napi_tx mode, please keep it if you can. For example, it complicates BQL implementation. Thanks > > Thank you. >
On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: > > If the codes of orphan mode don't have an impact when you enable > > napi_tx mode, please keep it if you can. > > For example, it complicates BQL implementation. > > Thanks I very much doubt sending interrupts to a VM can *on all benchmarks* compete with not sending interrupts. So yea, it's great if napi and hardware are advanced enough that the default can be changed, since this way virtio is closer to a regular nic and more or standard infrastructure can be used. But dropping it will go against *no breaking userspace* rule. Complicated? Tough.
Thu, Jun 06, 2024 at 04:20:48AM CEST, dave.taht@gmail.com wrote: >On Wed, May 15, 2024 at 12:37 AM Jiri Pirko <jiri@resnulli.us> wrote: > >> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: >> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: >> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: >> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: >> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> >> >> >> >> >> >> >> >> Add support for Byte Queue Limits (BQL). >> >> >> >> >> >> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> >> >> >> > >> >> >> >> >Can we get more detail on the benefits you observe etc? >> >> >> >> >Thanks! >> >> >> >> >> >> >> >> More info about the BQL in general is here: >> >> >> >> https://lwn.net/Articles/469652/ >> >> >> > >> >> >> >I know about BQL in general. We discussed BQL for virtio in the past >> >> >> >mostly I got the feedback from net core maintainers that it likely >> won't >> >> >> >benefit virtio. >> >> >> >> >> >> Do you have some link to that, or is it this thread: >> >> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> >> > >> >> > >> >> >A quick search on lore turned up this, for example: >> >> > >> https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ >> >> >> >> Says: >> >> "Note that NIC with many TX queues make BQL almost useless, only adding >> extra >> >> overhead." >> >> >> >> But virtio can have one tx queue, I guess that could be quite common >> >> configuration in lot of deployments. >> > >> >Not sure we should worry about performance for these though. >> >What I am saying is this should come with some benchmarking >> >results. >> >> I did some measurements with VDPA, backed by ConnectX6dx NIC, single >> queue pair: >> >> super_netperf 200 -H $ip -l 45 -t TCP_STREAM & >> nice -n 20 netperf -H $ip -l 10 -t TCP_RR >> >> RR result with no bql: >> 29.95 >> 32.74 >> 28.77 >> >> RR result with bql: >> 222.98 >> 159.81 >> 197.88 >> >> >> >Yay! is that with pfifo_fast or fq_codel as the underlying qdisc? pfifo_fast > >fq_codel, please? > > >> >> > >> > >> >> >> >> > >> >> > >> >> > >> >> > >> >> >> I don't see why virtio should be any different from other >> >> >> drivers/devices that benefit from bql. HOL blocking is the same here >> are >> >> >> everywhere. >> >> >> >> >> >> > >> >> >> >So I'm asking, what kind of benefit do you observe? >> >> >> >> >> >> I don't have measurements at hand, will attach them to v2. >> >> >> >> >> >> Thanks! >> >> >> >> >> >> > >> >> >> >-- >> >> >> >MST >> >> >> > >> >> > >> > >> >> > >-- >https://www.youtube.com/watch?v=BVFWSyMp3xg&t=1098s Waves Podcast >Dave Täht CSO, LibreQos
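To put those numbers in rough perspective: averaging the three runs gives about (29.95 + 32.74 + 28.77) / 3 ≈ 30.5 TCP_RR transactions/s without BQL versus (222.98 + 159.81 + 197.88) / 3 ≈ 193.6 with BQL, roughly a 6x improvement in request/response rate while the 200 TCP_STREAM flows keep the single queue pair saturated. That is consistent with BQL reducing head-of-line blocking in the device queue rather than raising raw throughput.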
On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: > > > If the codes of orphan mode don't have an impact when you enable > > > napi_tx mode, please keep it if you can. > > > > For example, it complicates BQL implementation. > > > > Thanks > > I very much doubt sending interrupts to a VM can > *on all benchmarks* compete with not sending interrupts. It should not differ too much from the physical NIC. We can have one more round of benchmarks to see the difference. But if NAPI mode needs to win all of the benchmarks in order to get rid of orphan, that would be very difficult. Considering various bugs will be fixed by dropping skb_orphan(), it would be sufficient if most of the benchmark doesn't show obvious differences. Looking at git history, there're commits that removes skb_orphan(), for example: commit 8112ec3b8722680251aecdcc23dfd81aa7af6340 Author: Eric Dumazet <edumazet@google.com> Date: Fri Sep 28 07:53:26 2012 +0000 mlx4: dont orphan skbs in mlx4_en_xmit() After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX completions) we no longer need to orphan skbs in mlx4_en_xmit() since skb wont stay a long time in TX ring before their release. Orphaning skbs in ndo_start_xmit() should be avoided as much as possible, since it breaks TCP Small Queue or other flow control mechanisms (per socket limits) Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com> Cc: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net> > > So yea, it's great if napi and hardware are advanced enough > that the default can be changed, since this way virtio > is closer to a regular nic and more or standard > infrastructure can be used. > > But dropping it will go against *no breaking userspace* rule. > Complicated? Tough. I don't know what kind of userspace is broken by this. Or why it is not broken since the day we enable NAPI mode by default. Thanks > > -- > MST >
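For context on the commit quoted above: skb_orphan() runs the skb's destructor early, which returns the bytes charged to the sending socket before the packet has actually left the device, so mechanisms such as TCP Small Queues lose sight of what is really in flight. Simplified from include/linux/skbuff.h (illustration only; the real helper also sanity-checks skb->sk):

    /* Simplified sketch of skb_orphan(). */
    static inline void skb_orphan(struct sk_buff *skb)
    {
            if (skb->destructor) {
                    skb->destructor(skb);   /* e.g. tcp_wfree()/sock_wfree(): credits the socket now */
                    skb->destructor = NULL;
                    skb->sk = NULL;
            }
    }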
On Thu, Jun 6, 2024 at 12:25 PM Jason Wang <jasowang@redhat.com> wrote: > > On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > Hello Jason, > > > > On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: > > > > > > > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: > > > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: > > > > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > > > > >>> From: Jiri Pirko <jiri@nvidia.com> > > > > > >>> > > > > > >>> Add support for Byte Queue Limits (BQL). > > > > > >> > > > > > >>Historically both Jason and Michael have attempted to support BQL > > > > > >>for virtio-net, for example: > > > > > >> > > > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > > > > >> > > > > > >>These discussions focus primarily on: > > > > > >> > > > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information > > > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to > > > > > >>orphan, some statistical information will be lost, resulting in temporary > > > > > >>inaccuracy in BQL. > > > > > >> > > > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more > > > > > >>reasonable. This provides good support for BQL. > > > > > > > > > > > >But when the device does not support dim, the orphan mode is still > > > > > >needed, isn't it? > > > > > > > > > > Heng, is my assuption correct here? Thanks! > > > > > > > > > > > > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in > > > > most scenarios. Although orphan mode performs better in specific benckmark, > > > > > > For example pktgen (I meant even if the orphan mode can break pktgen, > > > it can finish when there's a new packet that needs to be sent after > > > pktgen is completed). > > > > > > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to > > > > support dim for devices that want the best performance. > > > > > > Ideally, if we can drop orphan mode, everything would be simplified. > > > > Please please don't do this. Orphan mode still has its merits. In some > > cases which can hardly be reproduced in production, we still choose to > > turn off the napi_tx mode because the delay of freeing a skb could > > cause lower performance in the tx path, > > Well, it's probably just a side effect and it depends on how to define > performance here. Yes. > > > which is, I know, surely > > designed on purpose. > > I don't think so and no modern NIC uses that. It breaks a lot of things. To avoid confusion, I meant napi_tx mode can delay/slow down the speed in the tx path and no modern nic uses skb_orphan(). I think I will have some time to test BQL in virtio_net. Thanks, Jason
On Thu, Jun 06, 2024 at 07:42:35PM +0800, Jason Xing wrote: > On Thu, Jun 6, 2024 at 12:25 PM Jason Wang <jasowang@redhat.com> wrote: > > > > On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > > > Hello Jason, > > > > > > On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: > > > > > > > > > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > > > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: > > > > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: > > > > > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > > > > > >>> From: Jiri Pirko <jiri@nvidia.com> > > > > > > >>> > > > > > > >>> Add support for Byte Queue Limits (BQL). > > > > > > >> > > > > > > >>Historically both Jason and Michael have attempted to support BQL > > > > > > >>for virtio-net, for example: > > > > > > >> > > > > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > > > > > >> > > > > > > >>These discussions focus primarily on: > > > > > > >> > > > > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information > > > > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to > > > > > > >>orphan, some statistical information will be lost, resulting in temporary > > > > > > >>inaccuracy in BQL. > > > > > > >> > > > > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more > > > > > > >>reasonable. This provides good support for BQL. > > > > > > > > > > > > > >But when the device does not support dim, the orphan mode is still > > > > > > >needed, isn't it? > > > > > > > > > > > > Heng, is my assuption correct here? Thanks! > > > > > > > > > > > > > > > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in > > > > > most scenarios. Although orphan mode performs better in specific benckmark, > > > > > > > > For example pktgen (I meant even if the orphan mode can break pktgen, > > > > it can finish when there's a new packet that needs to be sent after > > > > pktgen is completed). > > > > > > > > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to > > > > > support dim for devices that want the best performance. > > > > > > > > Ideally, if we can drop orphan mode, everything would be simplified. > > > > > > Please please don't do this. Orphan mode still has its merits. In some > > > cases which can hardly be reproduced in production, we still choose to > > > turn off the napi_tx mode because the delay of freeing a skb could > > > cause lower performance in the tx path, > > > > Well, it's probably just a side effect and it depends on how to define > > performance here. > > Yes. > > > > > > which is, I know, surely > > > designed on purpose. > > > > I don't think so and no modern NIC uses that. It breaks a lot of things. > > To avoid confusion, I meant napi_tx mode can delay/slow down the speed > in the tx path and no modern nic uses skb_orphan(). Clearly it's been designed for software NICs and when the cost of interrupts is very high. > I think I will have some time to test BQL in virtio_net. > > Thanks, > Jason
Thu, Jun 06, 2024 at 06:25:15AM CEST, jasowang@redhat.com wrote: >On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote: >> >> Hello Jason, >> >> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote: >> > >> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: >> > > >> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: >> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: >> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: >> > > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: >> > > > >>> From: Jiri Pirko <jiri@nvidia.com> >> > > > >>> >> > > > >>> Add support for Byte Queue Limits (BQL). >> > > > >> >> > > > >>Historically both Jason and Michael have attempted to support BQL >> > > > >>for virtio-net, for example: >> > > > >> >> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> > > > >> >> > > > >>These discussions focus primarily on: >> > > > >> >> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information >> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to >> > > > >>orphan, some statistical information will be lost, resulting in temporary >> > > > >>inaccuracy in BQL. >> > > > >> >> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more >> > > > >>reasonable. This provides good support for BQL. >> > > > > >> > > > >But when the device does not support dim, the orphan mode is still >> > > > >needed, isn't it? >> > > > >> > > > Heng, is my assuption correct here? Thanks! >> > > > >> > > >> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in >> > > most scenarios. Although orphan mode performs better in specific benckmark, >> > >> > For example pktgen (I meant even if the orphan mode can break pktgen, >> > it can finish when there's a new packet that needs to be sent after >> > pktgen is completed). >> > >> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to >> > > support dim for devices that want the best performance. >> > >> > Ideally, if we can drop orphan mode, everything would be simplified. >> >> Please please don't do this. Orphan mode still has its merits. In some >> cases which can hardly be reproduced in production, we still choose to >> turn off the napi_tx mode because the delay of freeing a skb could >> cause lower performance in the tx path, > >Well, it's probably just a side effect and it depends on how to define >performance here. > >> which is, I know, surely >> designed on purpose. > >I don't think so and no modern NIC uses that. It breaks a lot of things. > >> >> If the codes of orphan mode don't have an impact when you enable >> napi_tx mode, please keep it if you can. > >For example, it complicates BQL implementation. Well, bql could be disabled when napi is not used. It is just a matter of one "if" in the xmit path. > >Thanks > >> >> Thank you. >> >
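A minimal sketch of the "one if" Jiri has in mind, assuming the accounting from this patch is simply skipped when tx napi is off (hypothetical and untested; the completion side would need the mirror-image check on sq->napi.weight, and skbs already charged before a napi-to-orphan switch would still have to be drained, which is what the follow-up messages discuss):

    /* Hypothetical variant of the start_xmit() hunk, not the posted patch:
     * feed BQL only when tx napi is in use, keep the old kick logic otherwise.
     */
            bool kick;

            if (use_napi)
                    kick = __netdev_tx_sent_queue(txq, skb->len, xmit_more);
            else
                    kick = !xmit_more || netif_xmit_stopped(txq);

            if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
                    u64_stats_update_begin(&sq->stats.syncp);
                    u64_stats_inc(&sq->stats.kicks);
                    u64_stats_update_end(&sq->stats.syncp);
            }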
Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote: >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote: >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: >> > > If the codes of orphan mode don't have an impact when you enable >> > > napi_tx mode, please keep it if you can. >> > >> > For example, it complicates BQL implementation. >> > >> > Thanks >> >> I very much doubt sending interrupts to a VM can >> *on all benchmarks* compete with not sending interrupts. > >It should not differ too much from the physical NIC. We can have one >more round of benchmarks to see the difference. > >But if NAPI mode needs to win all of the benchmarks in order to get >rid of orphan, that would be very difficult. Considering various bugs >will be fixed by dropping skb_orphan(), it would be sufficient if most >of the benchmark doesn't show obvious differences. > >Looking at git history, there're commits that removes skb_orphan(), for example: > >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340 >Author: Eric Dumazet <edumazet@google.com> >Date: Fri Sep 28 07:53:26 2012 +0000 > > mlx4: dont orphan skbs in mlx4_en_xmit() > > After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX > completions) we no longer need to orphan skbs in mlx4_en_xmit() > since skb wont stay a long time in TX ring before their release. > > Orphaning skbs in ndo_start_xmit() should be avoided as much as > possible, since it breaks TCP Small Queue or other flow control > mechanisms (per socket limits) > > Signed-off-by: Eric Dumazet <edumazet@google.com> > Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com> > Cc: Or Gerlitz <ogerlitz@mellanox.com> > Signed-off-by: David S. Miller <davem@davemloft.net> > >> >> So yea, it's great if napi and hardware are advanced enough >> that the default can be changed, since this way virtio >> is closer to a regular nic and more or standard >> infrastructure can be used. >> >> But dropping it will go against *no breaking userspace* rule. >> Complicated? Tough. > >I don't know what kind of userspace is broken by this. Or why it is >not broken since the day we enable NAPI mode by default. There is a module option that explicitly allows user to set napi_tx=false or napi_weight=0 So if you remove this option or ignore it, both breaks the user expectation. I personally would vote for this breakage. To carry ancient things like this one forever does not make sense to me. While at it, let's remove all virtio net module params. Thoughts? > >Thanks > >> >> -- >> MST >> >
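For reference, the knobs being talked about are virtio_net's existing module parameters; trimmed to the two relevant ones, they look roughly like the following in drivers/net/virtio_net.c (recalled from the source, so treat the exact permissions as approximate):

    static int napi_weight = NAPI_POLL_WEIGHT;
    module_param(napi_weight, int, 0444);

    static bool napi_tx = true;
    module_param(napi_tx, bool, 0644);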
On Thu, Jun 6, 2024 at 9:41 PM Jiri Pirko <jiri@resnulli.us> wrote: > > Thu, Jun 06, 2024 at 06:25:15AM CEST, jasowang@redhat.com wrote: > >On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > >> > >> Hello Jason, > >> > >> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote: > >> > > >> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: > >> > > > >> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > >> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: > >> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: > >> > > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > >> > > > >>> From: Jiri Pirko <jiri@nvidia.com> > >> > > > >>> > >> > > > >>> Add support for Byte Queue Limits (BQL). > >> > > > >> > >> > > > >>Historically both Jason and Michael have attempted to support BQL > >> > > > >>for virtio-net, for example: > >> > > > >> > >> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > >> > > > >> > >> > > > >>These discussions focus primarily on: > >> > > > >> > >> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information > >> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to > >> > > > >>orphan, some statistical information will be lost, resulting in temporary > >> > > > >>inaccuracy in BQL. > >> > > > >> > >> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more > >> > > > >>reasonable. This provides good support for BQL. > >> > > > > > >> > > > >But when the device does not support dim, the orphan mode is still > >> > > > >needed, isn't it? > >> > > > > >> > > > Heng, is my assuption correct here? Thanks! > >> > > > > >> > > > >> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in > >> > > most scenarios. Although orphan mode performs better in specific benckmark, > >> > > >> > For example pktgen (I meant even if the orphan mode can break pktgen, > >> > it can finish when there's a new packet that needs to be sent after > >> > pktgen is completed). > >> > > >> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to > >> > > support dim for devices that want the best performance. > >> > > >> > Ideally, if we can drop orphan mode, everything would be simplified. > >> > >> Please please don't do this. Orphan mode still has its merits. In some > >> cases which can hardly be reproduced in production, we still choose to > >> turn off the napi_tx mode because the delay of freeing a skb could > >> cause lower performance in the tx path, > > > >Well, it's probably just a side effect and it depends on how to define > >performance here. > > > >> which is, I know, surely > >> designed on purpose. > > > >I don't think so and no modern NIC uses that. It breaks a lot of things. > > > >> > >> If the codes of orphan mode don't have an impact when you enable > >> napi_tx mode, please keep it if you can. > > > >For example, it complicates BQL implementation. > > Well, bql could be disabled when napi is not used. It is just a matter > of one "if" in the xmit path. Maybe, care to post a patch? The trick part is, a skb is queued when BQL is enabled but sent when BQL is disabled as discussed here: https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ Thanks > > > > > >Thanks > > > >> > >> Thank you. > >> > > >
On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote: > > Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote: > >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote: > >> > >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: > >> > > If the codes of orphan mode don't have an impact when you enable > >> > > napi_tx mode, please keep it if you can. > >> > > >> > For example, it complicates BQL implementation. > >> > > >> > Thanks > >> > >> I very much doubt sending interrupts to a VM can > >> *on all benchmarks* compete with not sending interrupts. > > > >It should not differ too much from the physical NIC. We can have one > >more round of benchmarks to see the difference. > > > >But if NAPI mode needs to win all of the benchmarks in order to get > >rid of orphan, that would be very difficult. Considering various bugs > >will be fixed by dropping skb_orphan(), it would be sufficient if most > >of the benchmark doesn't show obvious differences. > > > >Looking at git history, there're commits that removes skb_orphan(), for example: > > > >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340 > >Author: Eric Dumazet <edumazet@google.com> > >Date: Fri Sep 28 07:53:26 2012 +0000 > > > > mlx4: dont orphan skbs in mlx4_en_xmit() > > > > After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX > > completions) we no longer need to orphan skbs in mlx4_en_xmit() > > since skb wont stay a long time in TX ring before their release. > > > > Orphaning skbs in ndo_start_xmit() should be avoided as much as > > possible, since it breaks TCP Small Queue or other flow control > > mechanisms (per socket limits) > > > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com> > > Cc: Or Gerlitz <ogerlitz@mellanox.com> > > Signed-off-by: David S. Miller <davem@davemloft.net> > > > >> > >> So yea, it's great if napi and hardware are advanced enough > >> that the default can be changed, since this way virtio > >> is closer to a regular nic and more or standard > >> infrastructure can be used. > >> > >> But dropping it will go against *no breaking userspace* rule. > >> Complicated? Tough. > > > >I don't know what kind of userspace is broken by this. Or why it is > >not broken since the day we enable NAPI mode by default. > > There is a module option that explicitly allows user to set > napi_tx=false > or > napi_weight=0 > > So if you remove this option or ignore it, both breaks the user > expectation. We can keep them, but I wonder what's the expectation of the user here? The only thing so far I can imagine is the performance difference. > I personally would vote for this breakage. To carry ancient > things like this one forever does not make sense to me. Exactly. > While at it, > let's remove all virtio net module params. Thoughts? I tend to 1) drop the orphan mode, but we can have some benchmarks first 2) keep the module parameters Thanks > > > > > > >Thanks > > > >> > >> -- > >> MST > >> > > >
Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote: >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote: >> >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote: >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote: >> >> >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: >> >> > > If the codes of orphan mode don't have an impact when you enable >> >> > > napi_tx mode, please keep it if you can. >> >> > >> >> > For example, it complicates BQL implementation. >> >> > >> >> > Thanks >> >> >> >> I very much doubt sending interrupts to a VM can >> >> *on all benchmarks* compete with not sending interrupts. >> > >> >It should not differ too much from the physical NIC. We can have one >> >more round of benchmarks to see the difference. >> > >> >But if NAPI mode needs to win all of the benchmarks in order to get >> >rid of orphan, that would be very difficult. Considering various bugs >> >will be fixed by dropping skb_orphan(), it would be sufficient if most >> >of the benchmark doesn't show obvious differences. >> > >> >Looking at git history, there're commits that removes skb_orphan(), for example: >> > >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340 >> >Author: Eric Dumazet <edumazet@google.com> >> >Date: Fri Sep 28 07:53:26 2012 +0000 >> > >> > mlx4: dont orphan skbs in mlx4_en_xmit() >> > >> > After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX >> > completions) we no longer need to orphan skbs in mlx4_en_xmit() >> > since skb wont stay a long time in TX ring before their release. >> > >> > Orphaning skbs in ndo_start_xmit() should be avoided as much as >> > possible, since it breaks TCP Small Queue or other flow control >> > mechanisms (per socket limits) >> > >> > Signed-off-by: Eric Dumazet <edumazet@google.com> >> > Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com> >> > Cc: Or Gerlitz <ogerlitz@mellanox.com> >> > Signed-off-by: David S. Miller <davem@davemloft.net> >> > >> >> >> >> So yea, it's great if napi and hardware are advanced enough >> >> that the default can be changed, since this way virtio >> >> is closer to a regular nic and more or standard >> >> infrastructure can be used. >> >> >> >> But dropping it will go against *no breaking userspace* rule. >> >> Complicated? Tough. >> > >> >I don't know what kind of userspace is broken by this. Or why it is >> >not broken since the day we enable NAPI mode by default. >> >> There is a module option that explicitly allows user to set >> napi_tx=false >> or >> napi_weight=0 >> >> So if you remove this option or ignore it, both breaks the user >> expectation. > >We can keep them, but I wonder what's the expectation of the user >here? The only thing so far I can imagine is the performance >difference. True. > >> I personally would vote for this breakage. To carry ancient >> things like this one forever does not make sense to me. > >Exactly. > >> While at it, >> let's remove all virtio net module params. Thoughts? > >I tend to > >1) drop the orphan mode, but we can have some benchmarks first Any idea which? That would be really tricky to find the ones where orphan mode makes difference I assume. >2) keep the module parameters and ignore them, correct? Perhaps a warning would be good. > >Thanks > >> >> >> >> > >> >Thanks >> > >> >> >> >> -- >> >> MST >> >> >> > >> >
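If the parameters were kept but ignored, the warning Jiri suggests could be as small as the following (purely hypothetical placement and wording):

    /* hypothetical: somewhere in module init or virtnet_probe() */
    if (!napi_tx || !napi_weight)
            pr_warn("virtio_net: napi_tx/napi_weight are deprecated and ignored; tx napi is always enabled\n");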
On Fri, Jun 07, 2024 at 02:22:31PM +0800, Jason Wang wrote: > On Thu, Jun 6, 2024 at 9:41 PM Jiri Pirko <jiri@resnulli.us> wrote: > > > > Thu, Jun 06, 2024 at 06:25:15AM CEST, jasowang@redhat.com wrote: > > >On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > >> > > >> Hello Jason, > > >> > > >> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote: > > >> > > > >> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: > > >> > > > > >> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > >> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: > > >> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: > > >> > > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > > >> > > > >>> From: Jiri Pirko <jiri@nvidia.com> > > >> > > > >>> > > >> > > > >>> Add support for Byte Queue Limits (BQL). > > >> > > > >> > > >> > > > >>Historically both Jason and Michael have attempted to support BQL > > >> > > > >>for virtio-net, for example: > > >> > > > >> > > >> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > >> > > > >> > > >> > > > >>These discussions focus primarily on: > > >> > > > >> > > >> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information > > >> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to > > >> > > > >>orphan, some statistical information will be lost, resulting in temporary > > >> > > > >>inaccuracy in BQL. > > >> > > > >> > > >> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more > > >> > > > >>reasonable. This provides good support for BQL. > > >> > > > > > > >> > > > >But when the device does not support dim, the orphan mode is still > > >> > > > >needed, isn't it? > > >> > > > > > >> > > > Heng, is my assuption correct here? Thanks! > > >> > > > > > >> > > > > >> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in > > >> > > most scenarios. Although orphan mode performs better in specific benckmark, > > >> > > > >> > For example pktgen (I meant even if the orphan mode can break pktgen, > > >> > it can finish when there's a new packet that needs to be sent after > > >> > pktgen is completed). > > >> > > > >> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to > > >> > > support dim for devices that want the best performance. > > >> > > > >> > Ideally, if we can drop orphan mode, everything would be simplified. > > >> > > >> Please please don't do this. Orphan mode still has its merits. In some > > >> cases which can hardly be reproduced in production, we still choose to > > >> turn off the napi_tx mode because the delay of freeing a skb could > > >> cause lower performance in the tx path, > > > > > >Well, it's probably just a side effect and it depends on how to define > > >performance here. > > > > > >> which is, I know, surely > > >> designed on purpose. > > > > > >I don't think so and no modern NIC uses that. It breaks a lot of things. > > > > > >> > > >> If the codes of orphan mode don't have an impact when you enable > > >> napi_tx mode, please keep it if you can. > > > > > >For example, it complicates BQL implementation. > > > > Well, bql could be disabled when napi is not used. It is just a matter > > of one "if" in the xmit path. > > Maybe, care to post a patch? 
> > The trick part is, a skb is queued when BQL is enabled but sent when > BQL is disabled as discussed here: > > https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > Thanks Yes of course. Or we can stick a dummy value in skb->destructor after we orphan, maybe that's easier. > > > > > > > > > >Thanks > > > > > >> > > >> Thank you. > > >> > > > > >
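As a rough illustration of the "one if in the xmit path" idea and of why the toggle is tricky: a sketch along these lines (variable names follow the patch at the top of the thread, but this is not the posted code) would feed BQL only on the TX NAPI path, and the two sides have to stay symmetric. If napi_tx could change while packets are in flight, a packet reported by __netdev_tx_sent_queue() might never reach netdev_tx_completed_queue(), or vice versa, and the BQL state gets stuck.

	/* start_xmit() side: report to BQL only when TX NAPI is in use */
	if (use_napi)
		kick = __netdev_tx_sent_queue(txq, skb->len, xmit_more);
	else
		kick = !xmit_more || netif_xmit_stopped(txq);

	/* free_old_xmit() side: mirror the same condition, otherwise BQL
	 * sees completions for bytes it never accounted (or the reverse)
	 */
	if (use_napi)
		netdev_tx_completed_queue(txq, stats.packets, stats.bytes);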
Fri, Jun 07, 2024 at 08:22:31AM CEST, jasowang@redhat.com wrote: >On Thu, Jun 6, 2024 at 9:41 PM Jiri Pirko <jiri@resnulli.us> wrote: >> >> Thu, Jun 06, 2024 at 06:25:15AM CEST, jasowang@redhat.com wrote: >> >On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote: >> >> >> >> Hello Jason, >> >> >> >> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote: >> >> > >> >> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote: >> >> > > >> >> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote: >> >> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote: >> >> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: >> >> > > > >>On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: >> >> > > > >>> From: Jiri Pirko <jiri@nvidia.com> >> >> > > > >>> >> >> > > > >>> Add support for Byte Queue Limits (BQL). >> >> > > > >> >> >> > > > >>Historically both Jason and Michael have attempted to support BQL >> >> > > > >>for virtio-net, for example: >> >> > > > >> >> >> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> >> > > > >> >> >> > > > >>These discussions focus primarily on: >> >> > > > >> >> >> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information >> >> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to >> >> > > > >>orphan, some statistical information will be lost, resulting in temporary >> >> > > > >>inaccuracy in BQL. >> >> > > > >> >> >> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more >> >> > > > >>reasonable. This provides good support for BQL. >> >> > > > > >> >> > > > >But when the device does not support dim, the orphan mode is still >> >> > > > >needed, isn't it? >> >> > > > >> >> > > > Heng, is my assuption correct here? Thanks! >> >> > > > >> >> > > >> >> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in >> >> > > most scenarios. Although orphan mode performs better in specific benckmark, >> >> > >> >> > For example pktgen (I meant even if the orphan mode can break pktgen, >> >> > it can finish when there's a new packet that needs to be sent after >> >> > pktgen is completed). >> >> > >> >> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to >> >> > > support dim for devices that want the best performance. >> >> > >> >> > Ideally, if we can drop orphan mode, everything would be simplified. >> >> >> >> Please please don't do this. Orphan mode still has its merits. In some >> >> cases which can hardly be reproduced in production, we still choose to >> >> turn off the napi_tx mode because the delay of freeing a skb could >> >> cause lower performance in the tx path, >> > >> >Well, it's probably just a side effect and it depends on how to define >> >performance here. >> > >> >> which is, I know, surely >> >> designed on purpose. >> > >> >I don't think so and no modern NIC uses that. It breaks a lot of things. >> > >> >> >> >> If the codes of orphan mode don't have an impact when you enable >> >> napi_tx mode, please keep it if you can. >> > >> >For example, it complicates BQL implementation. >> >> Well, bql could be disabled when napi is not used. It is just a matter >> of one "if" in the xmit path. > >Maybe, care to post a patch? 
> >The trick part is, a skb is queued when BQL is enabled but sent when >BQL is disabled as discussed here: > >https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > >Thanks Will try to go in the orphan removal direction first. > >> >> >> > >> >Thanks >> > >> >> >> >> Thank you. >> >> >> > >> >

On Fri, Jun 07, 2024 at 08:39:20AM +0200, Jiri Pirko wrote: > Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote: > >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote: > >> > >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote: > >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote: > >> >> > >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: > >> >> > > If the codes of orphan mode don't have an impact when you enable > >> >> > > napi_tx mode, please keep it if you can. > >> >> > > >> >> > For example, it complicates BQL implementation. > >> >> > > >> >> > Thanks > >> >> > >> >> I very much doubt sending interrupts to a VM can > >> >> *on all benchmarks* compete with not sending interrupts. > >> > > >> >It should not differ too much from the physical NIC. We can have one > >> >more round of benchmarks to see the difference. > >> > > >> >But if NAPI mode needs to win all of the benchmarks in order to get > >> >rid of orphan, that would be very difficult. Considering various bugs > >> >will be fixed by dropping skb_orphan(), it would be sufficient if most > >> >of the benchmark doesn't show obvious differences. > >> > > >> >Looking at git history, there're commits that removes skb_orphan(), for example: > >> > > >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340 > >> >Author: Eric Dumazet <edumazet@google.com> > >> >Date: Fri Sep 28 07:53:26 2012 +0000 > >> > > >> > mlx4: dont orphan skbs in mlx4_en_xmit() > >> > > >> > After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX > >> > completions) we no longer need to orphan skbs in mlx4_en_xmit() > >> > since skb wont stay a long time in TX ring before their release. > >> > > >> > Orphaning skbs in ndo_start_xmit() should be avoided as much as > >> > possible, since it breaks TCP Small Queue or other flow control > >> > mechanisms (per socket limits) > >> > > >> > Signed-off-by: Eric Dumazet <edumazet@google.com> > >> > Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com> > >> > Cc: Or Gerlitz <ogerlitz@mellanox.com> > >> > Signed-off-by: David S. Miller <davem@davemloft.net> > >> > > >> >> > >> >> So yea, it's great if napi and hardware are advanced enough > >> >> that the default can be changed, since this way virtio > >> >> is closer to a regular nic and more or standard > >> >> infrastructure can be used. > >> >> > >> >> But dropping it will go against *no breaking userspace* rule. > >> >> Complicated? Tough. > >> > > >> >I don't know what kind of userspace is broken by this. Or why it is > >> >not broken since the day we enable NAPI mode by default. > >> > >> There is a module option that explicitly allows user to set > >> napi_tx=false > >> or > >> napi_weight=0 > >> > >> So if you remove this option or ignore it, both breaks the user > >> expectation. > > > >We can keep them, but I wonder what's the expectation of the user > >here? The only thing so far I can imagine is the performance > >difference. > > True. > > > > >> I personally would vote for this breakage. To carry ancient > >> things like this one forever does not make sense to me. > > > >Exactly. > > > >> While at it, > >> let's remove all virtio net module params. Thoughts? > > > >I tend to > > > >1) drop the orphan mode, but we can have some benchmarks first > > Any idea which? That would be really tricky to find the ones where > orphan mode makes difference I assume. Exactly. We are kind of stuck with it I think. 
I would just do this:

	void orphan_destructor(struct sk_buff *skb)
	{
	}

	skb_orphan(skb);
	skb->destructor = orphan_destructor;
	/* skip BQL */
	return;

and then later

	/* skip BQL accounting if we orphaned on xmit path */
	if (skb->destructor == orphan_destructor)
		return;

Hmm?

> > >2) keep the module parameters > > and ignore them, correct? Perhaps a warning would be good. > > > > >Thanks > > > >> > >> > >> > >> > > >> >Thanks > >> > > >> >> > >> >> -- > >> >> MST > >> >> > >> > > >> > >
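A sketch of how the completion side could consume that marker, so that only skbs sent through the BQL-accounted path are reported back; the bql_packets/bql_bytes counters are invented for the illustration and are not part of the posted patch.

	/* inside the virtqueue_get_buf() loop of __free_old_xmit() */
	if (!is_xdp_frame(ptr)) {
		struct sk_buff *skb = ptr;

		/* skip BQL accounting if we orphaned on the xmit path */
		if (skb->destructor != orphan_destructor) {
			bql_packets++;
			bql_bytes += skb->len;
		}
		stats->packets++;
		stats->bytes += skb->len;
		napi_consume_skb(skb, in_napi);
	}

	/* ... then, after the loop returns to free_old_xmit() */
	netdev_tx_completed_queue(txq, bql_packets, bql_bytes);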
On Fri, Jun 7, 2024 at 2:39 PM Jiri Pirko <jiri@resnulli.us> wrote: > > Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote: > >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote: > >> > >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote: > >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote: > >> >> > >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: > >> >> > > If the codes of orphan mode don't have an impact when you enable > >> >> > > napi_tx mode, please keep it if you can. > >> >> > > >> >> > For example, it complicates BQL implementation. > >> >> > > >> >> > Thanks > >> >> > >> >> I very much doubt sending interrupts to a VM can > >> >> *on all benchmarks* compete with not sending interrupts. > >> > > >> >It should not differ too much from the physical NIC. We can have one > >> >more round of benchmarks to see the difference. > >> > > >> >But if NAPI mode needs to win all of the benchmarks in order to get > >> >rid of orphan, that would be very difficult. Considering various bugs > >> >will be fixed by dropping skb_orphan(), it would be sufficient if most > >> >of the benchmark doesn't show obvious differences. > >> > > >> >Looking at git history, there're commits that removes skb_orphan(), for example: > >> > > >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340 > >> >Author: Eric Dumazet <edumazet@google.com> > >> >Date: Fri Sep 28 07:53:26 2012 +0000 > >> > > >> > mlx4: dont orphan skbs in mlx4_en_xmit() > >> > > >> > After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX > >> > completions) we no longer need to orphan skbs in mlx4_en_xmit() > >> > since skb wont stay a long time in TX ring before their release. > >> > > >> > Orphaning skbs in ndo_start_xmit() should be avoided as much as > >> > possible, since it breaks TCP Small Queue or other flow control > >> > mechanisms (per socket limits) > >> > > >> > Signed-off-by: Eric Dumazet <edumazet@google.com> > >> > Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com> > >> > Cc: Or Gerlitz <ogerlitz@mellanox.com> > >> > Signed-off-by: David S. Miller <davem@davemloft.net> > >> > > >> >> > >> >> So yea, it's great if napi and hardware are advanced enough > >> >> that the default can be changed, since this way virtio > >> >> is closer to a regular nic and more or standard > >> >> infrastructure can be used. > >> >> > >> >> But dropping it will go against *no breaking userspace* rule. > >> >> Complicated? Tough. > >> > > >> >I don't know what kind of userspace is broken by this. Or why it is > >> >not broken since the day we enable NAPI mode by default. > >> > >> There is a module option that explicitly allows user to set > >> napi_tx=false > >> or > >> napi_weight=0 > >> > >> So if you remove this option or ignore it, both breaks the user > >> expectation. > > > >We can keep them, but I wonder what's the expectation of the user > >here? The only thing so far I can imagine is the performance > >difference. > > True. > > > > >> I personally would vote for this breakage. To carry ancient > >> things like this one forever does not make sense to me. > > > >Exactly. > > > >> While at it, > >> let's remove all virtio net module params. Thoughts? > > > >I tend to > > > >1) drop the orphan mode, but we can have some benchmarks first > > Any idea which? That would be really tricky to find the ones where > orphan mode makes difference I assume. True. Personally, I would like to just drop orphan mode. But I'm not sure others are happy with this. 
Thanks > > > >2) keep the module parameters > > and ignore them, correct? Perhaps a warning would be good. > > > > > >Thanks > > > >> > >> > >> > >> > > >> >Thanks > >> > > >> >> > >> >> -- > >> >> MST > >> >> > >> > > >> > > >
Fri, Jun 07, 2024 at 08:47:43AM CEST, jasowang@redhat.com wrote: >On Fri, Jun 7, 2024 at 2:39 PM Jiri Pirko <jiri@resnulli.us> wrote: >> >> Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote: >> >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote: >> >> >> >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote: >> >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote: >> >> >> >> >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: >> >> >> > > If the codes of orphan mode don't have an impact when you enable >> >> >> > > napi_tx mode, please keep it if you can. >> >> >> > >> >> >> > For example, it complicates BQL implementation. >> >> >> > >> >> >> > Thanks >> >> >> >> >> >> I very much doubt sending interrupts to a VM can >> >> >> *on all benchmarks* compete with not sending interrupts. >> >> > >> >> >It should not differ too much from the physical NIC. We can have one >> >> >more round of benchmarks to see the difference. >> >> > >> >> >But if NAPI mode needs to win all of the benchmarks in order to get >> >> >rid of orphan, that would be very difficult. Considering various bugs >> >> >will be fixed by dropping skb_orphan(), it would be sufficient if most >> >> >of the benchmark doesn't show obvious differences. >> >> > >> >> >Looking at git history, there're commits that removes skb_orphan(), for example: >> >> > >> >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340 >> >> >Author: Eric Dumazet <edumazet@google.com> >> >> >Date: Fri Sep 28 07:53:26 2012 +0000 >> >> > >> >> > mlx4: dont orphan skbs in mlx4_en_xmit() >> >> > >> >> > After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX >> >> > completions) we no longer need to orphan skbs in mlx4_en_xmit() >> >> > since skb wont stay a long time in TX ring before their release. >> >> > >> >> > Orphaning skbs in ndo_start_xmit() should be avoided as much as >> >> > possible, since it breaks TCP Small Queue or other flow control >> >> > mechanisms (per socket limits) >> >> > >> >> > Signed-off-by: Eric Dumazet <edumazet@google.com> >> >> > Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com> >> >> > Cc: Or Gerlitz <ogerlitz@mellanox.com> >> >> > Signed-off-by: David S. Miller <davem@davemloft.net> >> >> > >> >> >> >> >> >> So yea, it's great if napi and hardware are advanced enough >> >> >> that the default can be changed, since this way virtio >> >> >> is closer to a regular nic and more or standard >> >> >> infrastructure can be used. >> >> >> >> >> >> But dropping it will go against *no breaking userspace* rule. >> >> >> Complicated? Tough. >> >> > >> >> >I don't know what kind of userspace is broken by this. Or why it is >> >> >not broken since the day we enable NAPI mode by default. >> >> >> >> There is a module option that explicitly allows user to set >> >> napi_tx=false >> >> or >> >> napi_weight=0 >> >> >> >> So if you remove this option or ignore it, both breaks the user >> >> expectation. >> > >> >We can keep them, but I wonder what's the expectation of the user >> >here? The only thing so far I can imagine is the performance >> >difference. >> >> True. >> >> > >> >> I personally would vote for this breakage. To carry ancient >> >> things like this one forever does not make sense to me. >> > >> >Exactly. >> > >> >> While at it, >> >> let's remove all virtio net module params. Thoughts? >> > >> >I tend to >> > >> >1) drop the orphan mode, but we can have some benchmarks first >> >> Any idea which? 
That would be really tricky to find the ones where >> orphan mode makes difference I assume. > >True. Personally, I would like to just drop orphan mode. But I'm not >sure others are happy with this. How about to do it other way around. I will take a stab at sending patch removing it. If anyone is against and has solid data to prove orphan mode is needed, let them provide those. > >Thanks > >> >> >> >2) keep the module parameters >> >> and ignore them, correct? Perhaps a warning would be good. >> >> >> > >> >Thanks >> > >> >> >> >> >> >> >> >> > >> >> >Thanks >> >> > >> >> >> >> >> >> -- >> >> >> MST >> >> >> >> >> > >> >> >> > >> >
On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: > >True. Personally, I would like to just drop orphan mode. But I'm not > >sure others are happy with this. > > How about to do it other way around. I will take a stab at sending patch > removing it. If anyone is against and has solid data to prove orphan > mode is needed, let them provide those. Break it with no warning and see if anyone complains? No, this is not how we handle userspace compatibility, normally.
On Fri, Jun 7, 2024 at 5:57 PM Jiri Pirko <jiri@resnulli.us> wrote: > > Fri, Jun 07, 2024 at 08:47:43AM CEST, jasowang@redhat.com wrote: > >On Fri, Jun 7, 2024 at 2:39 PM Jiri Pirko <jiri@resnulli.us> wrote: > >> > >> Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote: > >> >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote: > >> >> > >> >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote: > >> >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote: > >> >> >> > >> >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote: > >> >> >> > > If the codes of orphan mode don't have an impact when you enable > >> >> >> > > napi_tx mode, please keep it if you can. > >> >> >> > > >> >> >> > For example, it complicates BQL implementation. > >> >> >> > > >> >> >> > Thanks > >> >> >> > >> >> >> I very much doubt sending interrupts to a VM can > >> >> >> *on all benchmarks* compete with not sending interrupts. > >> >> > > >> >> >It should not differ too much from the physical NIC. We can have one > >> >> >more round of benchmarks to see the difference. > >> >> > > >> >> >But if NAPI mode needs to win all of the benchmarks in order to get > >> >> >rid of orphan, that would be very difficult. Considering various bugs > >> >> >will be fixed by dropping skb_orphan(), it would be sufficient if most > >> >> >of the benchmark doesn't show obvious differences. > >> >> > > >> >> >Looking at git history, there're commits that removes skb_orphan(), for example: > >> >> > > >> >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340 > >> >> >Author: Eric Dumazet <edumazet@google.com> > >> >> >Date: Fri Sep 28 07:53:26 2012 +0000 > >> >> > > >> >> > mlx4: dont orphan skbs in mlx4_en_xmit() > >> >> > > >> >> > After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX > >> >> > completions) we no longer need to orphan skbs in mlx4_en_xmit() > >> >> > since skb wont stay a long time in TX ring before their release. > >> >> > > >> >> > Orphaning skbs in ndo_start_xmit() should be avoided as much as > >> >> > possible, since it breaks TCP Small Queue or other flow control > >> >> > mechanisms (per socket limits) > >> >> > > >> >> > Signed-off-by: Eric Dumazet <edumazet@google.com> > >> >> > Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com> > >> >> > Cc: Or Gerlitz <ogerlitz@mellanox.com> > >> >> > Signed-off-by: David S. Miller <davem@davemloft.net> > >> >> > > >> >> >> > >> >> >> So yea, it's great if napi and hardware are advanced enough > >> >> >> that the default can be changed, since this way virtio > >> >> >> is closer to a regular nic and more or standard > >> >> >> infrastructure can be used. > >> >> >> > >> >> >> But dropping it will go against *no breaking userspace* rule. > >> >> >> Complicated? Tough. > >> >> > > >> >> >I don't know what kind of userspace is broken by this. Or why it is > >> >> >not broken since the day we enable NAPI mode by default. > >> >> > >> >> There is a module option that explicitly allows user to set > >> >> napi_tx=false > >> >> or > >> >> napi_weight=0 > >> >> > >> >> So if you remove this option or ignore it, both breaks the user > >> >> expectation. > >> > > >> >We can keep them, but I wonder what's the expectation of the user > >> >here? The only thing so far I can imagine is the performance > >> >difference. > >> > >> True. > >> > >> > > >> >> I personally would vote for this breakage. To carry ancient > >> >> things like this one forever does not make sense to me. > >> > > >> >Exactly. 
> >> > > >> >> While at it, > >> >> let's remove all virtio net module params. Thoughts? > >> > > >> >I tend to > >> > > >> >1) drop the orphan mode, but we can have some benchmarks first > >> > >> Any idea which? That would be really tricky to find the ones where > >> orphan mode makes difference I assume. > > > >True. Personally, I would like to just drop orphan mode. But I'm not > >sure others are happy with this. > > How about to do it other way around. I will take a stab at sending patch > removing it. If anyone is against and has solid data to prove orphan > mode is needed, let them provide those. Honestly, we have to use skb orphan mode in production. I cannot provide strong enough evidence for why the default mode causes performance degradation in some cases; I don't know the root cause. The only thing I can tell is that if I enable skb orphan mode, then 1) more skbs are allowed to pass through the TCP layer, and 2) some key members like snd_cwnd in TCP behave normally. I know that without orphan mode the skbs are controlled/limited by the combination of TSO and napi_tx mode thanks to sk_wmem_alloc. So I _guess_ the root cause is: a possible delay of the interrupts generated by the host machine delays the freeing of skbs, which slows down the tx speed. If the interval between two interrupts were very short, as on a real NIC, I think the issue would disappear. That's all I can tell. Just for the record; I hope this information is also useful to other readers. Thanks, Jason > > > > > >Thanks > > > >> > >> > >> >2) keep the module parameters > >> > >> and ignore them, correct? Perhaps a warning would be good. > >> > >> > >> > > >> >Thanks > >> > > >> >> > >> >> > >> >> > >> >> > > >> >> >Thanks > >> >> > > >> >> >> > >> >> >> -- > >> >> >> MST > >> >> >> > >> >> > > >> >> > >> > > >> > >
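For readers following the sk_wmem_alloc point: skb_orphan() (include/linux/skbuff.h) is roughly the helper below. Calling it on the xmit path runs the socket's write destructor immediately (typically tcp_wfree() for TCP data), which returns the skb's truesize to sk_wmem_alloc and detaches the skb from its socket, so TCP Small Queues and the per-socket write limit stop throttling the flow while the buffer still sits in the TX ring. That matches the behaviour described above: more skbs get through the TCP layer, at the cost of breaking the back-pressure signal.

	static inline void skb_orphan(struct sk_buff *skb)
	{
		if (skb->destructor) {
			/* for TCP data this is typically tcp_wfree() */
			skb->destructor(skb);
			skb->destructor = NULL;
			skb->sk = NULL;
		} else {
			BUG_ON(skb->sk);
		}
	}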
Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote: >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: >> >True. Personally, I would like to just drop orphan mode. But I'm not >> >sure others are happy with this. >> >> How about to do it other way around. I will take a stab at sending patch >> removing it. If anyone is against and has solid data to prove orphan >> mode is needed, let them provide those. > >Break it with no warning and see if anyone complains? This is not what I suggested at all. >No, this is not how we handle userspace compatibility, normally. Sure. Again: I would send an orphan removal patch containing: 1) no removal of module options; warn if someone sets them 2) the module option to disable napi is ignored 3) orphan mode is removed from the code There is no breakage. Only, hypothetically, a performance downgrade in some hypothetical use case nobody knows of. My point was, if someone presents solid data during the patch review to prove orphan mode is needed, let's toss out the patch. Makes sense? > >-- >MST >
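A hedged sketch of what points 1) and 2) could look like in virtio_net.c: the existing napi_tx knob stays so loading scripts keep working, but the value no longer switches the datapath and setting it only triggers a warning. The helper name is made up for the illustration.

	/* existing knob, kept only for compatibility */
	static bool napi_tx = true;
	module_param(napi_tx, bool, 0644);

	/* hypothetical helper, called once from virtnet_probe() */
	static void virtnet_warn_ignored_params(struct virtio_device *vdev)
	{
		if (!napi_tx)
			dev_warn(&vdev->dev,
				 "napi_tx=false is deprecated and ignored; TX NAPI is always used\n");
	}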
On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote: > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote: > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: > >> >True. Personally, I would like to just drop orphan mode. But I'm not > >> >sure others are happy with this. > >> > >> How about to do it other way around. I will take a stab at sending patch > >> removing it. If anyone is against and has solid data to prove orphan > >> mode is needed, let them provide those. > > > >Break it with no warning and see if anyone complains? > > This is now what I suggested at all. > > >No, this is not how we handle userspace compatibility, normally. > > Sure. > > Again: > > I would send orphan removal patch containing: > 1) no module options removal. Warn if someone sets it up > 2) module option to disable napi is ignored > 3) orphan mode is removed from code > > There is no breakage. Only, hypotetically performance downgrade in some > hypotetical usecase nobody knows of. Performance is why people use virtio. It's as much a breakage as any other bug. The main difference is, with other types of breakage, they are typically binary and we can not tolerate them at all. A tiny, negligeable performance regression might be tolarable if it brings other benefits. I very much doubt avoiding interrupts is negligeable though. And making code simpler isn't a big benefit, users do not care. > My point was, if someone presents > solid data to prove orphan is needed during the patch review, let's toss > out the patch. > > Makes sense? It's not hypothetical - if anything, it's hypothetical that performance does not regress. And we just got a report from users that see a regression without. So, not really. > > > > >-- > >MST > >
On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote: > > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote: > > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: > > >> >True. Personally, I would like to just drop orphan mode. But I'm not > > >> >sure others are happy with this. > > >> > > >> How about to do it other way around. I will take a stab at sending patch > > >> removing it. If anyone is against and has solid data to prove orphan > > >> mode is needed, let them provide those. > > > > > >Break it with no warning and see if anyone complains? > > > > This is now what I suggested at all. > > > > >No, this is not how we handle userspace compatibility, normally. > > > > Sure. > > > > Again: > > > > I would send orphan removal patch containing: > > 1) no module options removal. Warn if someone sets it up > > 2) module option to disable napi is ignored > > 3) orphan mode is removed from code > > > > There is no breakage. Only, hypotetically performance downgrade in some > > hypotetical usecase nobody knows of. > > Performance is why people use virtio. It's as much a breakage as any > other bug. The main difference is, with other types of breakage, they > are typically binary and we can not tolerate them at all. A tiny, > negligeable performance regression might be tolarable if it brings > other benefits. I very much doubt avoiding interrupts is > negligeable though. And making code simpler isn't a big benefit, > users do not care. It's not just making code simpler. As discussed in the past, it also fixes real bugs. > > > My point was, if someone presents > > solid data to prove orphan is needed during the patch review, let's toss > > out the patch. > > > > Makes sense? > > It's not hypothetical - if anything, it's hypothetical that performance > does not regress. And we just got a report from users that see a > regression without. So, not really. Probably, but do we need to define a bar here? Looking at git history, we didn't ask a full benchmark for a lot of commits that may touch performance. Thanks > > > > > > > > >-- > > >MST > > > >
Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote: >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote: >> >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote: >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote: >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not >> > >> >sure others are happy with this. >> > >> >> > >> How about to do it other way around. I will take a stab at sending patch >> > >> removing it. If anyone is against and has solid data to prove orphan >> > >> mode is needed, let them provide those. >> > > >> > >Break it with no warning and see if anyone complains? >> > >> > This is now what I suggested at all. >> > >> > >No, this is not how we handle userspace compatibility, normally. >> > >> > Sure. >> > >> > Again: >> > >> > I would send orphan removal patch containing: >> > 1) no module options removal. Warn if someone sets it up >> > 2) module option to disable napi is ignored >> > 3) orphan mode is removed from code >> > >> > There is no breakage. Only, hypotetically performance downgrade in some >> > hypotetical usecase nobody knows of. >> >> Performance is why people use virtio. It's as much a breakage as any >> other bug. The main difference is, with other types of breakage, they >> are typically binary and we can not tolerate them at all. A tiny, >> negligeable performance regression might be tolarable if it brings >> other benefits. I very much doubt avoiding interrupts is >> negligeable though. And making code simpler isn't a big benefit, >> users do not care. > >It's not just making code simpler. As discussed in the past, it also >fixes real bugs. > >> >> > My point was, if someone presents >> > solid data to prove orphan is needed during the patch review, let's toss >> > out the patch. >> > >> > Makes sense? >> >> It's not hypothetical - if anything, it's hypothetical that performance >> does not regress. And we just got a report from users that see a >> regression without. So, not really. > >Probably, but do we need to define a bar here? Looking at git history, >we didn't ask a full benchmark for a lot of commits that may touch Moreover, there is no "benchmark" to run anyway, is it? >performance. > >Thanks > >> >> > >> > > >> > >-- >> > >MST >> > > >> >
On Mon, Jun 17, 2024 at 11:30:36AM +0200, Jiri Pirko wrote: > Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote: > >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote: > >> > >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote: > >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote: > >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: > >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not > >> > >> >sure others are happy with this. > >> > >> > >> > >> How about to do it other way around. I will take a stab at sending patch > >> > >> removing it. If anyone is against and has solid data to prove orphan > >> > >> mode is needed, let them provide those. > >> > > > >> > >Break it with no warning and see if anyone complains? > >> > > >> > This is now what I suggested at all. > >> > > >> > >No, this is not how we handle userspace compatibility, normally. > >> > > >> > Sure. > >> > > >> > Again: > >> > > >> > I would send orphan removal patch containing: > >> > 1) no module options removal. Warn if someone sets it up > >> > 2) module option to disable napi is ignored > >> > 3) orphan mode is removed from code > >> > > >> > There is no breakage. Only, hypotetically performance downgrade in some > >> > hypotetical usecase nobody knows of. > >> > >> Performance is why people use virtio. It's as much a breakage as any > >> other bug. The main difference is, with other types of breakage, they > >> are typically binary and we can not tolerate them at all. A tiny, > >> negligeable performance regression might be tolarable if it brings > >> other benefits. I very much doubt avoiding interrupts is > >> negligeable though. And making code simpler isn't a big benefit, > >> users do not care. > > > >It's not just making code simpler. As discussed in the past, it also > >fixes real bugs. > > > >> > >> > My point was, if someone presents > >> > solid data to prove orphan is needed during the patch review, let's toss > >> > out the patch. > >> > > >> > Makes sense? > >> > >> It's not hypothetical - if anything, it's hypothetical that performance > >> does not regress. And we just got a report from users that see a > >> regression without. So, not really. > > > >Probably, but do we need to define a bar here? Looking at git history, > >we didn't ask a full benchmark for a lot of commits that may touch It's patently obvious that not getting interrupts is better than getting interrupts. The onus of proof would be on people who claim otherwise. > Moreover, there is no "benchmark" to run anyway, is it? > Tought. Talk to users that report regressions. > >performance. > > > >Thanks > > > >> > >> > > >> > > > >> > >-- > >> > >MST > >> > > > >> > >
On Mon, Jun 17, 2024 at 09:44:55AM +0800, Jason Wang wrote: > Probably, but do we need to define a bar here? Looking at git history, > we didn't ask a full benchmark for a lot of commits that may touch > performance. There's no "may" here; we even got a report from a real user.
On Mon, Jun 17, 2024 at 5:30 PM Jiri Pirko <jiri@resnulli.us> wrote: > > Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote: > >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote: > >> > >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote: > >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote: > >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: > >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not > >> > >> >sure others are happy with this. > >> > >> > >> > >> How about to do it other way around. I will take a stab at sending patch > >> > >> removing it. If anyone is against and has solid data to prove orphan > >> > >> mode is needed, let them provide those. > >> > > > >> > >Break it with no warning and see if anyone complains? > >> > > >> > This is now what I suggested at all. > >> > > >> > >No, this is not how we handle userspace compatibility, normally. > >> > > >> > Sure. > >> > > >> > Again: > >> > > >> > I would send orphan removal patch containing: > >> > 1) no module options removal. Warn if someone sets it up > >> > 2) module option to disable napi is ignored > >> > 3) orphan mode is removed from code > >> > > >> > There is no breakage. Only, hypotetically performance downgrade in some > >> > hypotetical usecase nobody knows of. > >> > >> Performance is why people use virtio. It's as much a breakage as any > >> other bug. The main difference is, with other types of breakage, they > >> are typically binary and we can not tolerate them at all. A tiny, > >> negligeable performance regression might be tolarable if it brings > >> other benefits. I very much doubt avoiding interrupts is > >> negligeable though. And making code simpler isn't a big benefit, > >> users do not care. > > > >It's not just making code simpler. As discussed in the past, it also > >fixes real bugs. > > > >> > >> > My point was, if someone presents > >> > solid data to prove orphan is needed during the patch review, let's toss > >> > out the patch. > >> > > >> > Makes sense? > >> > >> It's not hypothetical - if anything, it's hypothetical that performance > >> does not regress. And we just got a report from users that see a > >> regression without. So, not really. > > > >Probably, but do we need to define a bar here? Looking at git history, > >we didn't ask a full benchmark for a lot of commits that may touch > > Moreover, there is no "benchmark" to run anyway, is it? Yes, so my point is to have some agreement on 1) what kind of test needs to be run for a patch like this. 2) what numbers are ok or not Thanks > > > >performance. > > > >Thanks > > > >> > >> > > >> > > > >> > >-- > >> > >MST > >> > > > >> > > >
On Tue, Jun 18, 2024 at 12:16 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Jun 17, 2024 at 11:30:36AM +0200, Jiri Pirko wrote: > > Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote: > > >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > >> > > >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote: > > >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote: > > >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: > > >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not > > >> > >> >sure others are happy with this. > > >> > >> > > >> > >> How about to do it other way around. I will take a stab at sending patch > > >> > >> removing it. If anyone is against and has solid data to prove orphan > > >> > >> mode is needed, let them provide those. > > >> > > > > >> > >Break it with no warning and see if anyone complains? > > >> > > > >> > This is now what I suggested at all. > > >> > > > >> > >No, this is not how we handle userspace compatibility, normally. > > >> > > > >> > Sure. > > >> > > > >> > Again: > > >> > > > >> > I would send orphan removal patch containing: > > >> > 1) no module options removal. Warn if someone sets it up > > >> > 2) module option to disable napi is ignored > > >> > 3) orphan mode is removed from code > > >> > > > >> > There is no breakage. Only, hypotetically performance downgrade in some > > >> > hypotetical usecase nobody knows of. > > >> > > >> Performance is why people use virtio. It's as much a breakage as any > > >> other bug. The main difference is, with other types of breakage, they > > >> are typically binary and we can not tolerate them at all. A tiny, > > >> negligeable performance regression might be tolarable if it brings > > >> other benefits. I very much doubt avoiding interrupts is > > >> negligeable though. And making code simpler isn't a big benefit, > > >> users do not care. > > > > > >It's not just making code simpler. As discussed in the past, it also > > >fixes real bugs. > > > > > >> > > >> > My point was, if someone presents > > >> > solid data to prove orphan is needed during the patch review, let's toss > > >> > out the patch. > > >> > > > >> > Makes sense? > > >> > > >> It's not hypothetical - if anything, it's hypothetical that performance > > >> does not regress. And we just got a report from users that see a > > >> regression without. So, not really. > > > > > >Probably, but do we need to define a bar here? Looking at git history, > > >we didn't ask a full benchmark for a lot of commits that may touch > > It's patently obvious that not getting interrupts is better than > getting interrupts. Exactly. But dropping orphan mode seems less intrusive than the commit that makes NAPI default. Thanks > The onus of proof would be on people who claim > otherwise. > > > > Moreover, there is no "benchmark" to run anyway, is it? > > > > Tought. Talk to users that report regressions. > > > > > >performance. > > > > > >Thanks > > > > > >> > > >> > > > >> > > > > >> > >-- > > >> > >MST > > >> > > > > >> > > > >
On Tue, Jun 18, 2024 at 08:52:38AM +0800, Jason Wang wrote: > On Mon, Jun 17, 2024 at 5:30 PM Jiri Pirko <jiri@resnulli.us> wrote: > > > > Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote: > > >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > >> > > >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote: > > >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote: > > >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote: > > >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not > > >> > >> >sure others are happy with this. > > >> > >> > > >> > >> How about to do it other way around. I will take a stab at sending patch > > >> > >> removing it. If anyone is against and has solid data to prove orphan > > >> > >> mode is needed, let them provide those. > > >> > > > > >> > >Break it with no warning and see if anyone complains? > > >> > > > >> > This is now what I suggested at all. > > >> > > > >> > >No, this is not how we handle userspace compatibility, normally. > > >> > > > >> > Sure. > > >> > > > >> > Again: > > >> > > > >> > I would send orphan removal patch containing: > > >> > 1) no module options removal. Warn if someone sets it up > > >> > 2) module option to disable napi is ignored > > >> > 3) orphan mode is removed from code > > >> > > > >> > There is no breakage. Only, hypotetically performance downgrade in some > > >> > hypotetical usecase nobody knows of. > > >> > > >> Performance is why people use virtio. It's as much a breakage as any > > >> other bug. The main difference is, with other types of breakage, they > > >> are typically binary and we can not tolerate them at all. A tiny, > > >> negligeable performance regression might be tolarable if it brings > > >> other benefits. I very much doubt avoiding interrupts is > > >> negligeable though. And making code simpler isn't a big benefit, > > >> users do not care. > > > > > >It's not just making code simpler. As discussed in the past, it also > > >fixes real bugs. > > > > > >> > > >> > My point was, if someone presents > > >> > solid data to prove orphan is needed during the patch review, let's toss > > >> > out the patch. > > >> > > > >> > Makes sense? > > >> > > >> It's not hypothetical - if anything, it's hypothetical that performance > > >> does not regress. And we just got a report from users that see a > > >> regression without. So, not really. > > > > > >Probably, but do we need to define a bar here? Looking at git history, > > >we didn't ask a full benchmark for a lot of commits that may touch > > > > Moreover, there is no "benchmark" to run anyway, is it? > > Yes, so my point is to have some agreement on > > 1) what kind of test needs to be run for a patch like this. > 2) what numbers are ok or not > > Thanks That's a $1mln question and the difficulty is why we don't change behaviour drastically for users without a fallback even if we think we did a bunch of testing. > > > > > > >performance. > > > > > >Thanks > > > > > >> > > >> > > > >> > > > > >> > >-- > > >> > >MST > > >> > > > > >> > > > > >