
[net-next] virtio_net: add support for Byte Queue Limits

Message ID: 20240509114615.317450-1-jiri@resnulli.us (mailing list archive)
State: Changes Requested
Delegated to: Netdev Maintainers
Series [net-next] virtio_net: add support for Byte Queue Limits

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 926 this patch: 926
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: bpf@vger.kernel.org
netdev/build_clang success Errors and warnings before: 937 this patch: 937
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 937 this patch: 937
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 112 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-05-10--18-00 (tests: 1014)

Commit Message

Jiri Pirko May 9, 2024, 11:46 a.m. UTC
From: Jiri Pirko <jiri@nvidia.com>

Add support for Byte Queue Limits (BQL).

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
---
 drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

Comments

Michael S. Tsirkin May 9, 2024, 12:41 p.m. UTC | #1
On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@nvidia.com>
> 
> Add support for Byte Queue Limits (BQL).
> 
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>

Can we get more detail on the benefits you observe etc?
Thanks!

> ---
>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>  1 file changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 218a446c4c27..c53d6dc6d332 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>  
>  struct virtnet_sq_free_stats {
>  	u64 packets;
> +	u64 xdp_packets;
>  	u64 bytes;
> +	u64 xdp_bytes;
>  };
>  
>  struct virtnet_sq_stats {
> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>  	void *ptr;
>  
>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> -		++stats->packets;
> -
>  		if (!is_xdp_frame(ptr)) {
>  			struct sk_buff *skb = ptr;
>  
>  			pr_debug("Sent skb %p\n", skb);
>  
> +			stats->packets++;
>  			stats->bytes += skb->len;
>  			napi_consume_skb(skb, in_napi);
>  		} else {
>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>  
> -			stats->bytes += xdp_get_frame_len(frame);
> +			stats->xdp_packets++;
> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>  			xdp_return_frame(frame);
>  		}
>  	}
> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>  	virtnet_rq_free_buf(vi, rq, buf);
>  }
>  
> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> +			  bool in_napi)
>  {
>  	struct virtnet_sq_free_stats stats = {0};
>  
> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>  	/* Avoid overhead when no packets have been processed
>  	 * happens when called speculatively from start_xmit.
>  	 */
> -	if (!stats.packets)
> +	if (!stats.packets && !stats.xdp_packets)
>  		return;
>  
> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
> +
>  	u64_stats_update_begin(&sq->stats.syncp);
>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>  	u64_stats_add(&sq->stats.packets, stats.packets);
> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>  	 * early means 16 slots are typically wasted.
>  	 */
>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
> -		netif_stop_subqueue(dev, qnum);
> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> +
> +		netif_tx_stop_queue(txq);
>  		if (use_napi) {
>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>  			/* More just got used, free them then recheck. */
> -			free_old_xmit(sq, false);
> +			free_old_xmit(sq, txq, false);
>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>  				netif_start_subqueue(dev, qnum);
>  				virtqueue_disable_cb(sq->vq);
> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>  
>  		do {
>  			virtqueue_disable_cb(sq->vq);
> -			free_old_xmit(sq, true);
> +			free_old_xmit(sq, txq, true);
>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>  	txq = netdev_get_tx_queue(vi->dev, index);
>  	__netif_tx_lock(txq, raw_smp_processor_id());
>  	virtqueue_disable_cb(sq->vq);
> -	free_old_xmit(sq, true);
> +	free_old_xmit(sq, txq, true);
>  
>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>  		netif_tx_wake_queue(txq);
> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	struct send_queue *sq = &vi->sq[qnum];
>  	int err;
>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> -	bool kick = !netdev_xmit_more();
> +	bool xmit_more = netdev_xmit_more();
>  	bool use_napi = sq->napi.weight;
>  
>  	/* Free up any pending old buffers before queueing new ones. */
> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  		if (use_napi)
>  			virtqueue_disable_cb(sq->vq);
>  
> -		free_old_xmit(sq, false);
> +		free_old_xmit(sq, txq, false);
>  
> -	} while (use_napi && kick &&
> +	} while (use_napi && !xmit_more &&
>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  	/* timestamp packet in software */
> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  	check_sq_full_and_disable(vi, dev, sq);
>  
> -	if (kick || netif_xmit_stopped(txq)) {
> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>  			u64_stats_update_begin(&sq->stats.syncp);
>  			u64_stats_inc(&sq->stats.kicks);
> -- 
> 2.44.0
Jiri Pirko May 9, 2024, 1:31 p.m. UTC | #2
Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@nvidia.com>
>> 
>> Add support for Byte Queue Limits (BQL).
>> 
>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>
>Can we get more detail on the benefits you observe etc?
>Thanks!

More info about the BQL in general is here:
https://lwn.net/Articles/469652/
Michael S. Tsirkin May 9, 2024, 2:28 p.m. UTC | #3
On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> From: Jiri Pirko <jiri@nvidia.com>
> >> 
> >> Add support for Byte Queue Limits (BQL).
> >> 
> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >
> >Can we get more detail on the benefits you observe etc?
> >Thanks!
> 
> More info about the BQL in general is here:
> https://lwn.net/Articles/469652/

I know about BQL in general. We discussed BQL for virtio in the past;
mostly I got the feedback from net core maintainers that it likely won't
benefit virtio.

So I'm asking, what kind of benefit do you observe?
Jason Wang May 10, 2024, 4:25 a.m. UTC | #4
On Thu, May 9, 2024 at 7:46 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> From: Jiri Pirko <jiri@nvidia.com>
>
> Add support for Byte Queue Limits (BQL).
>
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> ---

Does this work for non-tx NAPI mode (it seems not obvious to me)?

Thanks
Jason Wang May 10, 2024, 4:25 a.m. UTC | #5
On Thu, May 9, 2024 at 10:28 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> > Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> > >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> > >> From: Jiri Pirko <jiri@nvidia.com>
> > >>
> > >> Add support for Byte Queue Limits (BQL).
> > >>
> > >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> > >
> > >Can we get more detail on the benefits you observe etc?
> > >Thanks!
> >
> > More info about the BQL in general is here:
> > https://lwn.net/Articles/469652/
>
> I know about BQL in general. We discussed BQL for virtio in the past
> mostly I got the feedback from net core maintainers that it likely won't
> benefit virtio.
>
> So I'm asking, what kind of benefit do you observe?

Yes, a benchmark is more than welcome.

Thanks

>
> --
> MST
>
Heng Qi May 10, 2024, 7:11 a.m. UTC | #6
On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> From: Jiri Pirko <jiri@nvidia.com>
> 
> Add support for Byte Queue Limits (BQL).

Historically both Jason and Michael have attempted to support BQL
for virtio-net, for example:

https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/

These discussions focus primarily on:

1. BQL is based on napi tx. Therefore, reporting the completion statistics
to BQL needs to depend on the use_napi check. When napi mode is switched to
orphan, some of those statistics are lost, resulting in temporary
inaccuracy in BQL (see the sketch below).

2. If tx dim is supported, orphan mode may be removed and the tx irq rate
will be more reasonable. This provides good support for BQL.

Thanks.
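
A minimal sketch of the sent/completed pairing that point 1 refers to, for
illustration only: the foo_* names are made up, only the netdev_tx_*()
helpers are the existing kernel API, and this is not the code of the patch
below.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct netdev_queue *txq;

	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	/* Producer side: account the bytes handed to the device. */
	netdev_tx_sent_queue(txq, skb->len);

	/* ... post the skb to the device queue, kick if needed ... */
	return NETDEV_TX_OK;
}

/* Consumer side: runs from the TX completion path (TX NAPI here). */
static void foo_tx_complete(struct netdev_queue *txq,
			    unsigned int pkts, unsigned int bytes)
{
	netdev_tx_completed_queue(txq, pkts, bytes);
}

If the consumer side stops running, e.g. because completions are only
counted under use_napi and the device switches to skb-orphan mode, the two
counters go out of sync and BQL may stall the queue until something like
netdev_tx_reset_queue() brings the state back.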

> 
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> ---
>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>  1 file changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 218a446c4c27..c53d6dc6d332 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>  
>  struct virtnet_sq_free_stats {
>  	u64 packets;
> +	u64 xdp_packets;
>  	u64 bytes;
> +	u64 xdp_bytes;
>  };
>  
>  struct virtnet_sq_stats {
> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>  	void *ptr;
>  
>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> -		++stats->packets;
> -
>  		if (!is_xdp_frame(ptr)) {
>  			struct sk_buff *skb = ptr;
>  
>  			pr_debug("Sent skb %p\n", skb);
>  
> +			stats->packets++;
>  			stats->bytes += skb->len;
>  			napi_consume_skb(skb, in_napi);
>  		} else {
>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>  
> -			stats->bytes += xdp_get_frame_len(frame);
> +			stats->xdp_packets++;
> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>  			xdp_return_frame(frame);
>  		}
>  	}
> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>  	virtnet_rq_free_buf(vi, rq, buf);
>  }
>  
> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> +			  bool in_napi)
>  {
>  	struct virtnet_sq_free_stats stats = {0};
>  
> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>  	/* Avoid overhead when no packets have been processed
>  	 * happens when called speculatively from start_xmit.
>  	 */
> -	if (!stats.packets)
> +	if (!stats.packets && !stats.xdp_packets)
>  		return;
>  
> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
> +
>  	u64_stats_update_begin(&sq->stats.syncp);
>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>  	u64_stats_add(&sq->stats.packets, stats.packets);
> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>  	 * early means 16 slots are typically wasted.
>  	 */
>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
> -		netif_stop_subqueue(dev, qnum);
> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> +
> +		netif_tx_stop_queue(txq);
>  		if (use_napi) {
>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>  			/* More just got used, free them then recheck. */
> -			free_old_xmit(sq, false);
> +			free_old_xmit(sq, txq, false);
>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>  				netif_start_subqueue(dev, qnum);
>  				virtqueue_disable_cb(sq->vq);
> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>  
>  		do {
>  			virtqueue_disable_cb(sq->vq);
> -			free_old_xmit(sq, true);
> +			free_old_xmit(sq, txq, true);
>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>  	txq = netdev_get_tx_queue(vi->dev, index);
>  	__netif_tx_lock(txq, raw_smp_processor_id());
>  	virtqueue_disable_cb(sq->vq);
> -	free_old_xmit(sq, true);
> +	free_old_xmit(sq, txq, true);
>  
>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>  		netif_tx_wake_queue(txq);
> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	struct send_queue *sq = &vi->sq[qnum];
>  	int err;
>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> -	bool kick = !netdev_xmit_more();
> +	bool xmit_more = netdev_xmit_more();
>  	bool use_napi = sq->napi.weight;
>  
>  	/* Free up any pending old buffers before queueing new ones. */
> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  		if (use_napi)
>  			virtqueue_disable_cb(sq->vq);
>  
> -		free_old_xmit(sq, false);
> +		free_old_xmit(sq, txq, false);
>  
> -	} while (use_napi && kick &&
> +	} while (use_napi && !xmit_more &&
>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  	/* timestamp packet in software */
> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  	check_sq_full_and_disable(vi, dev, sq);
>  
> -	if (kick || netif_xmit_stopped(txq)) {
> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>  			u64_stats_update_begin(&sq->stats.syncp);
>  			u64_stats_inc(&sq->stats.kicks);
> -- 
> 2.44.0
> 
>
Jiri Pirko May 10, 2024, 10:35 a.m. UTC | #7
Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>> From: Jiri Pirko <jiri@nvidia.com>
>> 
>> Add support for Byte Queue Limits (BQL).
>
>Historically both Jason and Michael have attempted to support BQL
>for virtio-net, for example:
>
>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>
>These discussions focus primarily on:
>
>1. BQL is based on napi tx. Therefore, the transfer of statistical information
>needs to rely on the judgment of use_napi. When the napi mode is switched to
>orphan, some statistical information will be lost, resulting in temporary
>inaccuracy in BQL.
>
>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
>reasonable. This provides good support for BQL.

Thanks for the pointers, will check that out.


>
>Thanks.
>
>> 
>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> ---
>>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>>  1 file changed, 20 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 218a446c4c27..c53d6dc6d332 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>>  
>>  struct virtnet_sq_free_stats {
>>  	u64 packets;
>> +	u64 xdp_packets;
>>  	u64 bytes;
>> +	u64 xdp_bytes;
>>  };
>>  
>>  struct virtnet_sq_stats {
>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>>  	void *ptr;
>>  
>>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>> -		++stats->packets;
>> -
>>  		if (!is_xdp_frame(ptr)) {
>>  			struct sk_buff *skb = ptr;
>>  
>>  			pr_debug("Sent skb %p\n", skb);
>>  
>> +			stats->packets++;
>>  			stats->bytes += skb->len;
>>  			napi_consume_skb(skb, in_napi);
>>  		} else {
>>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>>  
>> -			stats->bytes += xdp_get_frame_len(frame);
>> +			stats->xdp_packets++;
>> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>>  			xdp_return_frame(frame);
>>  		}
>>  	}
>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>>  	virtnet_rq_free_buf(vi, rq, buf);
>>  }
>>  
>> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
>> +			  bool in_napi)
>>  {
>>  	struct virtnet_sq_free_stats stats = {0};
>>  
>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>>  	/* Avoid overhead when no packets have been processed
>>  	 * happens when called speculatively from start_xmit.
>>  	 */
>> -	if (!stats.packets)
>> +	if (!stats.packets && !stats.xdp_packets)
>>  		return;
>>  
>> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
>> +
>>  	u64_stats_update_begin(&sq->stats.syncp);
>>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>>  	u64_stats_add(&sq->stats.packets, stats.packets);
>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>>  	 * early means 16 slots are typically wasted.
>>  	 */
>>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
>> -		netif_stop_subqueue(dev, qnum);
>> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>> +
>> +		netif_tx_stop_queue(txq);
>>  		if (use_napi) {
>>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>>  			/* More just got used, free them then recheck. */
>> -			free_old_xmit(sq, false);
>> +			free_old_xmit(sq, txq, false);
>>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>>  				netif_start_subqueue(dev, qnum);
>>  				virtqueue_disable_cb(sq->vq);
>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>>  
>>  		do {
>>  			virtqueue_disable_cb(sq->vq);
>> -			free_old_xmit(sq, true);
>> +			free_old_xmit(sq, txq, true);
>>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>>  
>>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>>  	txq = netdev_get_tx_queue(vi->dev, index);
>>  	__netif_tx_lock(txq, raw_smp_processor_id());
>>  	virtqueue_disable_cb(sq->vq);
>> -	free_old_xmit(sq, true);
>> +	free_old_xmit(sq, txq, true);
>>  
>>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>>  		netif_tx_wake_queue(txq);
>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  	struct send_queue *sq = &vi->sq[qnum];
>>  	int err;
>>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>> -	bool kick = !netdev_xmit_more();
>> +	bool xmit_more = netdev_xmit_more();
>>  	bool use_napi = sq->napi.weight;
>>  
>>  	/* Free up any pending old buffers before queueing new ones. */
>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  		if (use_napi)
>>  			virtqueue_disable_cb(sq->vq);
>>  
>> -		free_old_xmit(sq, false);
>> +		free_old_xmit(sq, txq, false);
>>  
>> -	} while (use_napi && kick &&
>> +	} while (use_napi && !xmit_more &&
>>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>>  
>>  	/* timestamp packet in software */
>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  
>>  	check_sq_full_and_disable(vi, dev, sq);
>>  
>> -	if (kick || netif_xmit_stopped(txq)) {
>> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>>  			u64_stats_update_begin(&sq->stats.syncp);
>>  			u64_stats_inc(&sq->stats.kicks);
>> -- 
>> 2.44.0
>> 
>>
Jiri Pirko May 10, 2024, 10:37 a.m. UTC | #8
Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> 
>> >> Add support for Byte Queue Limits (BQL).
>> >> 
>> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >
>> >Can we get more detail on the benefits you observe etc?
>> >Thanks!
>> 
>> More info about the BQL in general is here:
>> https://lwn.net/Articles/469652/
>
>I know about BQL in general. We discussed BQL for virtio in the past
>mostly I got the feedback from net core maintainers that it likely won't
>benefit virtio.

Do you have some link to that, or is it this thread:
https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/

I don't see why virtio should be any different from other
drivers/devices that benefit from bql. HOL blocking is the same here as
everywhere.

>
>So I'm asking, what kind of benefit do you observe?

I don't have measurements at hand, will attach them to v2.

Thanks!

>
>-- 
>MST
>
Michael S. Tsirkin May 10, 2024, 10:52 a.m. UTC | #9
On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >> >> 
> >> >> Add support for Byte Queue Limits (BQL).
> >> >> 
> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >> >
> >> >Can we get more detail on the benefits you observe etc?
> >> >Thanks!
> >> 
> >> More info about the BQL in general is here:
> >> https://lwn.net/Articles/469652/
> >
> >I know about BQL in general. We discussed BQL for virtio in the past
> >mostly I got the feedback from net core maintainers that it likely won't
> >benefit virtio.
> 
> Do you have some link to that, or is it this thread:
> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/


A quick search on lore turned up this, for example:
https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/




> I don't see why virtio should be any different from other
> drivers/devices that benefit from bql. HOL blocking is the same here are
> everywhere.
> 
> >
> >So I'm asking, what kind of benefit do you observe?
> 
> I don't have measurements at hand, will attach them to v2.
> 
> Thanks!
> 
> >
> >-- 
> >MST
> >
Jiri Pirko May 10, 2024, 11:11 a.m. UTC | #10
Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> 
>> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> 
>> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >
>> >> >Can we get more detail on the benefits you observe etc?
>> >> >Thanks!
>> >> 
>> >> More info about the BQL in general is here:
>> >> https://lwn.net/Articles/469652/
>> >
>> >I know about BQL in general. We discussed BQL for virtio in the past
>> >mostly I got the feedback from net core maintainers that it likely won't
>> >benefit virtio.
>> 
>> Do you have some link to that, or is it this thread:
>> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>
>
>A quick search on lore turned up this, for example:
>https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/

Says:
"Note that NIC with many TX queues make BQL almost useless, only adding extra
 overhead."

But virtio can have one tx queue; I guess that could be quite a common
configuration in a lot of deployments.


>
>
>
>
>> I don't see why virtio should be any different from other
>> drivers/devices that benefit from bql. HOL blocking is the same here are
>> everywhere.
>> 
>> >
>> >So I'm asking, what kind of benefit do you observe?
>> 
>> I don't have measurements at hand, will attach them to v2.
>> 
>> Thanks!
>> 
>> >
>> >-- 
>> >MST
>> >
>
Michael S. Tsirkin May 10, 2024, 11:27 a.m. UTC | #11
On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >> >> >> 
> >> >> >> Add support for Byte Queue Limits (BQL).
> >> >> >> 
> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >> >> >
> >> >> >Can we get more detail on the benefits you observe etc?
> >> >> >Thanks!
> >> >> 
> >> >> More info about the BQL in general is here:
> >> >> https://lwn.net/Articles/469652/
> >> >
> >> >I know about BQL in general. We discussed BQL for virtio in the past
> >> >mostly I got the feedback from net core maintainers that it likely won't
> >> >benefit virtio.
> >> 
> >> Do you have some link to that, or is it this thread:
> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >
> >
> >A quick search on lore turned up this, for example:
> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
> 
> Says:
> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>  overhead."
> 
> But virtio can have one tx queue, I guess that could be quite common
> configuration in lot of deployments.

Not sure we should worry about performance for these though.
What I am saying is this should come with some benchmarking
results.


> 
> >
> >
> >
> >
> >> I don't see why virtio should be any different from other
> >> drivers/devices that benefit from bql. HOL blocking is the same here are
> >> everywhere.
> >> 
> >> >
> >> >So I'm asking, what kind of benefit do you observe?
> >> 
> >> I don't have measurements at hand, will attach them to v2.
> >> 
> >> Thanks!
> >> 
> >> >
> >> >-- 
> >> >MST
> >> >
> >
Jiri Pirko May 10, 2024, 11:36 a.m. UTC | #12
Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> 
>> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> >> 
>> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >> >
>> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >> >Thanks!
>> >> >> 
>> >> >> More info about the BQL in general is here:
>> >> >> https://lwn.net/Articles/469652/
>> >> >
>> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >> >benefit virtio.
>> >> 
>> >> Do you have some link to that, or is it this thread:
>> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >
>> >
>> >A quick search on lore turned up this, for example:
>> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> 
>> Says:
>> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>>  overhead."
>> 
>> But virtio can have one tx queue, I guess that could be quite common
>> configuration in lot of deployments.
>
>Not sure we should worry about performance for these though.

Well, queues may be a scarce resource sometimes; even in those cases, you
want to perform well.

>What I am saying is this should come with some benchmarking
>results.

Sure, I got the message.

>
>
>> 
>> >
>> >
>> >
>> >
>> >> I don't see why virtio should be any different from other
>> >> drivers/devices that benefit from bql. HOL blocking is the same here are
>> >> everywhere.
>> >> 
>> >> >
>> >> >So I'm asking, what kind of benefit do you observe?
>> >> 
>> >> I don't have measurements at hand, will attach them to v2.
>> >> 
>> >> Thanks!
>> >> 
>> >> >
>> >> >-- 
>> >> >MST
>> >> >
>> >
>
Jiri Pirko May 15, 2024, 7:34 a.m. UTC | #13
Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> 
>> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> >> 
>> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >> >
>> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >> >Thanks!
>> >> >> 
>> >> >> More info about the BQL in general is here:
>> >> >> https://lwn.net/Articles/469652/
>> >> >
>> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >> >benefit virtio.
>> >> 
>> >> Do you have some link to that, or is it this thread:
>> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >
>> >
>> >A quick search on lore turned up this, for example:
>> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> 
>> Says:
>> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>>  overhead."
>> 
>> But virtio can have one tx queue, I guess that could be quite common
>> configuration in lot of deployments.
>
>Not sure we should worry about performance for these though.
>What I am saying is this should come with some benchmarking
>results.

I did some measurements with VDPA, backed by ConnectX6dx NIC, single
queue pair:

super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
nice -n 20 netperf -H $ip -l 10 -t TCP_RR

RR result with no bql:
29.95
32.74
28.77

RR result with bql:
222.98
159.81
197.88



>
>
>> 
>> >
>> >
>> >
>> >
>> >> I don't see why virtio should be any different from other
>> >> drivers/devices that benefit from bql. HOL blocking is the same here are
>> >> everywhere.
>> >> 
>> >> >
>> >> >So I'm asking, what kind of benefit do you observe?
>> >> 
>> >> I don't have measurements at hand, will attach them to v2.
>> >> 
>> >> Thanks!
>> >> 
>> >> >
>> >> >-- 
>> >> >MST
>> >> >
>> >
>
Michael S. Tsirkin May 15, 2024, 8:20 a.m. UTC | #14
On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >> >> >> >> 
> >> >> >> >> Add support for Byte Queue Limits (BQL).
> >> >> >> >> 
> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >> >> >> >
> >> >> >> >Can we get more detail on the benefits you observe etc?
> >> >> >> >Thanks!
> >> >> >> 
> >> >> >> More info about the BQL in general is here:
> >> >> >> https://lwn.net/Articles/469652/
> >> >> >
> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
> >> >> >mostly I got the feedback from net core maintainers that it likely won't
> >> >> >benefit virtio.
> >> >> 
> >> >> Do you have some link to that, or is it this thread:
> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >> >
> >> >
> >> >A quick search on lore turned up this, for example:
> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
> >> 
> >> Says:
> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
> >>  overhead."
> >> 
> >> But virtio can have one tx queue, I guess that could be quite common
> >> configuration in lot of deployments.
> >
> >Not sure we should worry about performance for these though.
> >What I am saying is this should come with some benchmarking
> >results.
> 
> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
> queue pair:
> 
> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
> 
> RR result with no bql:
> 29.95
> 32.74
> 28.77
> 
> RR result with bql:
> 222.98
> 159.81
> 197.88
> 

Okay. And on the other hand, any measurable degradation with
multiqueue and when testing throughput?


> 
> >
> >
> >> 
> >> >
> >> >
> >> >
> >> >
> >> >> I don't see why virtio should be any different from other
> >> >> drivers/devices that benefit from bql. HOL blocking is the same here are
> >> >> everywhere.
> >> >> 
> >> >> >
> >> >> >So I'm asking, what kind of benefit do you observe?
> >> >> 
> >> >> I don't have measurements at hand, will attach them to v2.
> >> >> 
> >> >> Thanks!
> >> >> 
> >> >> >
> >> >> >-- 
> >> >> >MST
> >> >> >
> >> >
> >
Jiri Pirko May 15, 2024, 10:12 a.m. UTC | #15
Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> >> 
>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> >> >> 
>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> >
>> >> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >> >> >Thanks!
>> >> >> >> 
>> >> >> >> More info about the BQL in general is here:
>> >> >> >> https://lwn.net/Articles/469652/
>> >> >> >
>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >> >> >benefit virtio.
>> >> >> 
>> >> >> Do you have some link to that, or is it this thread:
>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >> >
>> >> >
>> >> >A quick search on lore turned up this, for example:
>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> >> 
>> >> Says:
>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>> >>  overhead."
>> >> 
>> >> But virtio can have one tx queue, I guess that could be quite common
>> >> configuration in lot of deployments.
>> >
>> >Not sure we should worry about performance for these though.
>> >What I am saying is this should come with some benchmarking
>> >results.
>> 
>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>> queue pair:
>> 
>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>> 
>> RR result with no bql:
>> 29.95
>> 32.74
>> 28.77
>> 
>> RR result with bql:
>> 222.98
>> 159.81
>> 197.88
>> 
>
>Okay. And on the other hand, any measureable degradation with
>multiqueue and when testing throughput?

With multiqueue it depends on whether the flows hit the same queue or not. If
they do, the same results will likely be shown.


>
>
>> 
>> >
>> >
>> >> 
>> >> >
>> >> >
>> >> >
>> >> >
>> >> >> I don't see why virtio should be any different from other
>> >> >> drivers/devices that benefit from bql. HOL blocking is the same here are
>> >> >> everywhere.
>> >> >> 
>> >> >> >
>> >> >> >So I'm asking, what kind of benefit do you observe?
>> >> >> 
>> >> >> I don't have measurements at hand, will attach them to v2.
>> >> >> 
>> >> >> Thanks!
>> >> >> 
>> >> >> >
>> >> >> >-- 
>> >> >> >MST
>> >> >> >
>> >> >
>> >
>
Jiri Pirko May 15, 2024, 12:54 p.m. UTC | #16
Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
>Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>>> >> >> >> >> 
>>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>>> >> >> >> >> 
>>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>>> >> >> >> >
>>> >> >> >> >Can we get more detail on the benefits you observe etc?
>>> >> >> >> >Thanks!
>>> >> >> >> 
>>> >> >> >> More info about the BQL in general is here:
>>> >> >> >> https://lwn.net/Articles/469652/
>>> >> >> >
>>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
>>> >> >> >benefit virtio.
>>> >> >> 
>>> >> >> Do you have some link to that, or is it this thread:
>>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>>> >> >
>>> >> >
>>> >> >A quick search on lore turned up this, for example:
>>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>>> >> 
>>> >> Says:
>>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>>> >>  overhead."
>>> >> 
>>> >> But virtio can have one tx queue, I guess that could be quite common
>>> >> configuration in lot of deployments.
>>> >
>>> >Not sure we should worry about performance for these though.
>>> >What I am saying is this should come with some benchmarking
>>> >results.
>>> 
>>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>>> queue pair:
>>> 
>>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>>> 
>>> RR result with no bql:
>>> 29.95
>>> 32.74
>>> 28.77
>>> 
>>> RR result with bql:
>>> 222.98
>>> 159.81
>>> 197.88
>>> 
>>
>>Okay. And on the other hand, any measureable degradation with
>>multiqueue and when testing throughput?
>
>With multiqueue it depends if the flows hits the same queue or not. If
>they do, the same results will likely be shown.

RR 1q, w/o bql:
29.95
32.74
28.77

RR 1q, with bql:
222.98
159.81
197.88

RR 4q, w/o bql:
355.82
364.58
233.47

RR 4q, with bql:
371.19
255.93
337.77

So the answer to your question is: "no measurable degradation with 4
queues".


>
>
>>
>>
>>> 
>>> >
>>> >
>>> >> 
>>> >> >
>>> >> >
>>> >> >
>>> >> >
>>> >> >> I don't see why virtio should be any different from other
>>> >> >> drivers/devices that benefit from bql. HOL blocking is the same here are
>>> >> >> everywhere.
>>> >> >> 
>>> >> >> >
>>> >> >> >So I'm asking, what kind of benefit do you observe?
>>> >> >> 
>>> >> >> I don't have measurements at hand, will attach them to v2.
>>> >> >> 
>>> >> >> Thanks!
>>> >> >> 
>>> >> >> >
>>> >> >> >-- 
>>> >> >> >MST
>>> >> >> >
>>> >> >
>>> >
>>
Jason Wang May 16, 2024, 4:48 a.m. UTC | #17
On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >>> >> >> >> >>
> >>> >> >> >> >> Add support for Byte Queue Limits (BQL).
> >>> >> >> >> >>
> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >>> >> >> >> >
> >>> >> >> >> >Can we get more detail on the benefits you observe etc?
> >>> >> >> >> >Thanks!
> >>> >> >> >>
> >>> >> >> >> More info about the BQL in general is here:
> >>> >> >> >> https://lwn.net/Articles/469652/
> >>> >> >> >
> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
> >>> >> >> >benefit virtio.
> >>> >> >>
> >>> >> >> Do you have some link to that, or is it this thread:
> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >>> >> >
> >>> >> >
> >>> >> >A quick search on lore turned up this, for example:
> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
> >>> >>
> >>> >> Says:
> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
> >>> >>  overhead."
> >>> >>
> >>> >> But virtio can have one tx queue, I guess that could be quite common
> >>> >> configuration in lot of deployments.
> >>> >
> >>> >Not sure we should worry about performance for these though.
> >>> >What I am saying is this should come with some benchmarking
> >>> >results.
> >>>
> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
> >>> queue pair:
> >>>
> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
> >>>
> >>> RR result with no bql:
> >>> 29.95
> >>> 32.74
> >>> 28.77
> >>>
> >>> RR result with bql:
> >>> 222.98
> >>> 159.81
> >>> 197.88
> >>>
> >>
> >>Okay. And on the other hand, any measureable degradation with
> >>multiqueue and when testing throughput?
> >
> >With multiqueue it depends if the flows hits the same queue or not. If
> >they do, the same results will likely be shown.
>
> RR 1q, w/o bql:
> 29.95
> 32.74
> 28.77
>
> RR 1q, with bql:
> 222.98
> 159.81
> 197.88
>
> RR 4q, w/o bql:
> 355.82
> 364.58
> 233.47
>
> RR 4q, with bql:
> 371.19
> 255.93
> 337.77
>
> So answer to your question is: "no measurable degradation with 4
> queues".

Thanks but I think we also need benchmarks in cases other than vDPA.
For example, a simple virtualization setup.
Jiri Pirko May 16, 2024, 10:54 a.m. UTC | #18
Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote:
>On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
>>
>> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
>> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >>> >> >> >> >>
>> >>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>> >>> >> >> >> >>
>> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >>> >> >> >> >
>> >>> >> >> >> >Can we get more detail on the benefits you observe etc?
>> >>> >> >> >> >Thanks!
>> >>> >> >> >>
>> >>> >> >> >> More info about the BQL in general is here:
>> >>> >> >> >> https://lwn.net/Articles/469652/
>> >>> >> >> >
>> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >>> >> >> >benefit virtio.
>> >>> >> >>
>> >>> >> >> Do you have some link to that, or is it this thread:
>> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >>> >> >
>> >>> >> >
>> >>> >> >A quick search on lore turned up this, for example:
>> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> >>> >>
>> >>> >> Says:
>> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>> >>> >>  overhead."
>> >>> >>
>> >>> >> But virtio can have one tx queue, I guess that could be quite common
>> >>> >> configuration in lot of deployments.
>> >>> >
>> >>> >Not sure we should worry about performance for these though.
>> >>> >What I am saying is this should come with some benchmarking
>> >>> >results.
>> >>>
>> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>> >>> queue pair:
>> >>>
>> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>> >>>
>> >>> RR result with no bql:
>> >>> 29.95
>> >>> 32.74
>> >>> 28.77
>> >>>
>> >>> RR result with bql:
>> >>> 222.98
>> >>> 159.81
>> >>> 197.88
>> >>>
>> >>
>> >>Okay. And on the other hand, any measureable degradation with
>> >>multiqueue and when testing throughput?
>> >
>> >With multiqueue it depends if the flows hits the same queue or not. If
>> >they do, the same results will likely be shown.
>>
>> RR 1q, w/o bql:
>> 29.95
>> 32.74
>> 28.77
>>
>> RR 1q, with bql:
>> 222.98
>> 159.81
>> 197.88
>>
>> RR 4q, w/o bql:
>> 355.82
>> 364.58
>> 233.47
>>
>> RR 4q, with bql:
>> 371.19
>> 255.93
>> 337.77
>>
>> So answer to your question is: "no measurable degradation with 4
>> queues".
>
>Thanks but I think we also need benchmarks in cases other than vDPA.
>For example, a simple virtualization setup.

For a virtualization setup, I get this:

VIRT RR 1q, w/o bql:
49.18
49.75
50.07

VIRT RR 1q, with bql:
51.33
47.88
40.40

No measurable/significant difference.

>
Michael S. Tsirkin May 16, 2024, 12:31 p.m. UTC | #19
On Thu, May 16, 2024 at 12:54:58PM +0200, Jiri Pirko wrote:
> Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote:
> >On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
> >>
> >> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
> >> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
> >> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
> >> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
> >> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> >> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
> >> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> >> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >> >>> >> >> >> >>
> >> >>> >> >> >> >> Add support for Byte Queue Limits (BQL).
> >> >>> >> >> >> >>
> >> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >> >>> >> >> >> >
> >> >>> >> >> >> >Can we get more detail on the benefits you observe etc?
> >> >>> >> >> >> >Thanks!
> >> >>> >> >> >>
> >> >>> >> >> >> More info about the BQL in general is here:
> >> >>> >> >> >> https://lwn.net/Articles/469652/
> >> >>> >> >> >
> >> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
> >> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
> >> >>> >> >> >benefit virtio.
> >> >>> >> >>
> >> >>> >> >> Do you have some link to that, or is it this thread:
> >> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >> >>> >> >
> >> >>> >> >
> >> >>> >> >A quick search on lore turned up this, for example:
> >> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
> >> >>> >>
> >> >>> >> Says:
> >> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
> >> >>> >>  overhead."
> >> >>> >>
> >> >>> >> But virtio can have one tx queue, I guess that could be quite common
> >> >>> >> configuration in lot of deployments.
> >> >>> >
> >> >>> >Not sure we should worry about performance for these though.
> >> >>> >What I am saying is this should come with some benchmarking
> >> >>> >results.
> >> >>>
> >> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
> >> >>> queue pair:
> >> >>>
> >> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
> >> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
> >> >>>
> >> >>> RR result with no bql:
> >> >>> 29.95
> >> >>> 32.74
> >> >>> 28.77
> >> >>>
> >> >>> RR result with bql:
> >> >>> 222.98
> >> >>> 159.81
> >> >>> 197.88
> >> >>>
> >> >>
> >> >>Okay. And on the other hand, any measureable degradation with
> >> >>multiqueue and when testing throughput?
> >> >
> >> >With multiqueue it depends if the flows hits the same queue or not. If
> >> >they do, the same results will likely be shown.
> >>
> >> RR 1q, w/o bql:
> >> 29.95
> >> 32.74
> >> 28.77
> >>
> >> RR 1q, with bql:
> >> 222.98
> >> 159.81
> >> 197.88
> >>
> >> RR 4q, w/o bql:
> >> 355.82
> >> 364.58
> >> 233.47
> >>
> >> RR 4q, with bql:
> >> 371.19
> >> 255.93
> >> 337.77
> >>
> >> So answer to your question is: "no measurable degradation with 4
> >> queues".
> >
> >Thanks but I think we also need benchmarks in cases other than vDPA.
> >For example, a simple virtualization setup.
> 
> For virtualization setup, I get this:
> 
> VIRT RR 1q, w/0 bql:
> 49.18
> 49.75
> 50.07
> 
> VIRT RR 1q, with bql:
> 51.33
> 47.88
> 40.40
> 
> No measurable/significant difference.

Seems the results became much noisier? Also,
I'd expect a regression, if any, to be in a streaming benchmark.
Jiri Pirko May 16, 2024, 3:25 p.m. UTC | #20
Thu, May 16, 2024 at 02:31:59PM CEST, mst@redhat.com wrote:
>On Thu, May 16, 2024 at 12:54:58PM +0200, Jiri Pirko wrote:
>> Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote:
>> >On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
>> >>
>> >> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
>> >> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>> >> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>> >> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>> >> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> >> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >>> >> >> >> >>
>> >> >>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >>> >> >> >> >>
>> >> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >>> >> >> >> >
>> >> >>> >> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >>> >> >> >> >Thanks!
>> >> >>> >> >> >>
>> >> >>> >> >> >> More info about the BQL in general is here:
>> >> >>> >> >> >> https://lwn.net/Articles/469652/
>> >> >>> >> >> >
>> >> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >> >>> >> >> >benefit virtio.
>> >> >>> >> >>
>> >> >>> >> >> Do you have some link to that, or is it this thread:
>> >> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >> >>> >> >
>> >> >>> >> >
>> >> >>> >> >A quick search on lore turned up this, for example:
>> >> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> >> >>> >>
>> >> >>> >> Says:
>> >> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>> >> >>> >>  overhead."
>> >> >>> >>
>> >> >>> >> But virtio can have one tx queue, I guess that could be quite common
>> >> >>> >> configuration in lot of deployments.
>> >> >>> >
>> >> >>> >Not sure we should worry about performance for these though.
>> >> >>> >What I am saying is this should come with some benchmarking
>> >> >>> >results.
>> >> >>>
>> >> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>> >> >>> queue pair:
>> >> >>>
>> >> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>> >> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>> >> >>>
>> >> >>> RR result with no bql:
>> >> >>> 29.95
>> >> >>> 32.74
>> >> >>> 28.77
>> >> >>>
>> >> >>> RR result with bql:
>> >> >>> 222.98
>> >> >>> 159.81
>> >> >>> 197.88
>> >> >>>
>> >> >>
>> >> >>Okay. And on the other hand, any measureable degradation with
>> >> >>multiqueue and when testing throughput?
>> >> >
>> >> >With multiqueue it depends if the flows hits the same queue or not. If
>> >> >they do, the same results will likely be shown.
>> >>
>> >> RR 1q, w/o bql:
>> >> 29.95
>> >> 32.74
>> >> 28.77
>> >>
>> >> RR 1q, with bql:
>> >> 222.98
>> >> 159.81
>> >> 197.88
>> >>
>> >> RR 4q, w/o bql:
>> >> 355.82
>> >> 364.58
>> >> 233.47
>> >>
>> >> RR 4q, with bql:
>> >> 371.19
>> >> 255.93
>> >> 337.77
>> >>
>> >> So answer to your question is: "no measurable degradation with 4
>> >> queues".
>> >
>> >Thanks but I think we also need benchmarks in cases other than vDPA.
>> >For example, a simple virtualization setup.
>> 
>> For virtualization setup, I get this:
>> 
>> VIRT RR 1q, w/0 bql:
>> 49.18
>> 49.75
>> 50.07
>> 
>> VIRT RR 1q, with bql:
>> 51.33
>> 47.88
>> 40.40
>> 
>> No measurable/significant difference.
>
>Seems the results became much noisier? Also

Not enough data to assume that, I believe.


>I'd expect a regression if any to be in a streaming benchmark.

Can you elaborate?


>
>-- 
>MST
>
Michael S. Tsirkin May 16, 2024, 7:04 p.m. UTC | #21
On Thu, May 16, 2024 at 05:25:20PM +0200, Jiri Pirko wrote:
> 
> >I'd expect a regression if any to be in a streaming benchmark.
> 
> Can you elaborate?

BQL does two things that can hurt throughput:
- limits the amount of data in the queue - can limit bandwidth
  if we now get queue underruns
- adds CPU overhead - can limit bandwidth if CPU bound

So checking the result of a streaming benchmark seems important.
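
For example, a streaming counterpart to the RR runs earlier in the thread
(just an illustration, mirroring the netperf invocations already used, with
$ip being the peer address) could be:

netperf -H $ip -l 30 -t TCP_STREAM
super_netperf 8 -H $ip -l 30 -t TCP_STREAM

run with and without the patch, comparing throughput and CPU utilization.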
Jiri Pirko May 17, 2024, 7:52 a.m. UTC | #22
Thu, May 16, 2024 at 09:04:41PM CEST, mst@redhat.com wrote:
>On Thu, May 16, 2024 at 05:25:20PM +0200, Jiri Pirko wrote:
>> 
>> >I'd expect a regression if any to be in a streaming benchmark.
>> 
>> Can you elaborate?
>
>BQL does two things that can hurt throughput:
>- limits amount of data in the queue - can limit bandwidth
>  if we now get queue underruns
>- adds CPU overhead - can limit bandwidth if CPU bound
>
>So checking result of a streaming benchmark seems important.

I understand. Didn't see any degradation in TCP_STREAM netperf. But I
will try to extend the testing.

Thanks!

>
>
>-- 
>MST
>
Jiri Pirko May 20, 2024, 12:48 p.m. UTC | #23
Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>> From: Jiri Pirko <jiri@nvidia.com>
>> 
>> Add support for Byte Queue Limits (BQL).
>
>Historically both Jason and Michael have attempted to support BQL
>for virtio-net, for example:
>
>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>
>These discussions focus primarily on:
>
>1. BQL is based on napi tx. Therefore, the transfer of statistical information
>needs to rely on the judgment of use_napi. When the napi mode is switched to
>orphan, some statistical information will be lost, resulting in temporary
>inaccuracy in BQL.
>
>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
>reasonable. This provides good support for BQL.

But when the device does not support dim, the orphan mode is still
needed, isn't it?

>
>Thanks.
>
>> 
>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> ---
>>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>>  1 file changed, 20 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 218a446c4c27..c53d6dc6d332 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>>  
>>  struct virtnet_sq_free_stats {
>>  	u64 packets;
>> +	u64 xdp_packets;
>>  	u64 bytes;
>> +	u64 xdp_bytes;
>>  };
>>  
>>  struct virtnet_sq_stats {
>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>>  	void *ptr;
>>  
>>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>> -		++stats->packets;
>> -
>>  		if (!is_xdp_frame(ptr)) {
>>  			struct sk_buff *skb = ptr;
>>  
>>  			pr_debug("Sent skb %p\n", skb);
>>  
>> +			stats->packets++;
>>  			stats->bytes += skb->len;
>>  			napi_consume_skb(skb, in_napi);
>>  		} else {
>>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>>  
>> -			stats->bytes += xdp_get_frame_len(frame);
>> +			stats->xdp_packets++;
>> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>>  			xdp_return_frame(frame);
>>  		}
>>  	}
>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>>  	virtnet_rq_free_buf(vi, rq, buf);
>>  }
>>  
>> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
>> +			  bool in_napi)
>>  {
>>  	struct virtnet_sq_free_stats stats = {0};
>>  
>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>>  	/* Avoid overhead when no packets have been processed
>>  	 * happens when called speculatively from start_xmit.
>>  	 */
>> -	if (!stats.packets)
>> +	if (!stats.packets && !stats.xdp_packets)
>>  		return;
>>  
>> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
>> +
>>  	u64_stats_update_begin(&sq->stats.syncp);
>>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>>  	u64_stats_add(&sq->stats.packets, stats.packets);
>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>>  	 * early means 16 slots are typically wasted.
>>  	 */
>>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
>> -		netif_stop_subqueue(dev, qnum);
>> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>> +
>> +		netif_tx_stop_queue(txq);
>>  		if (use_napi) {
>>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>>  			/* More just got used, free them then recheck. */
>> -			free_old_xmit(sq, false);
>> +			free_old_xmit(sq, txq, false);
>>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>>  				netif_start_subqueue(dev, qnum);
>>  				virtqueue_disable_cb(sq->vq);
>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>>  
>>  		do {
>>  			virtqueue_disable_cb(sq->vq);
>> -			free_old_xmit(sq, true);
>> +			free_old_xmit(sq, txq, true);
>>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>>  
>>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>>  	txq = netdev_get_tx_queue(vi->dev, index);
>>  	__netif_tx_lock(txq, raw_smp_processor_id());
>>  	virtqueue_disable_cb(sq->vq);
>> -	free_old_xmit(sq, true);
>> +	free_old_xmit(sq, txq, true);
>>  
>>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>>  		netif_tx_wake_queue(txq);
>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  	struct send_queue *sq = &vi->sq[qnum];
>>  	int err;
>>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>> -	bool kick = !netdev_xmit_more();
>> +	bool xmit_more = netdev_xmit_more();
>>  	bool use_napi = sq->napi.weight;
>>  
>>  	/* Free up any pending old buffers before queueing new ones. */
>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  		if (use_napi)
>>  			virtqueue_disable_cb(sq->vq);
>>  
>> -		free_old_xmit(sq, false);
>> +		free_old_xmit(sq, txq, false);
>>  
>> -	} while (use_napi && kick &&
>> +	} while (use_napi && !xmit_more &&
>>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>>  
>>  	/* timestamp packet in software */
>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  
>>  	check_sq_full_and_disable(vi, dev, sq);
>>  
>> -	if (kick || netif_xmit_stopped(txq)) {
>> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>>  			u64_stats_update_begin(&sq->stats.syncp);
>>  			u64_stats_inc(&sq->stats.kicks);
>> -- 
>> 2.44.0
>> 
>>
Jiri Pirko June 5, 2024, 11:30 a.m. UTC | #24
Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
>Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
>>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>>> From: Jiri Pirko <jiri@nvidia.com>
>>> 
>>> Add support for Byte Queue Limits (BQL).
>>
>>Historically both Jason and Michael have attempted to support BQL
>>for virtio-net, for example:
>>
>>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>>
>>These discussions focus primarily on:
>>
>>1. BQL is based on napi tx. Therefore, the transfer of statistical information
>>needs to rely on the judgment of use_napi. When the napi mode is switched to
>>orphan, some statistical information will be lost, resulting in temporary
>>inaccuracy in BQL.
>>
>>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
>>reasonable. This provides good support for BQL.
>
>But when the device does not support dim, the orphan mode is still
>needed, isn't it?

Heng, is my assumption correct here? Thanks!

>
>>
>>Thanks.
>>
>>> 
>>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>>> ---
>>>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>>>  1 file changed, 20 insertions(+), 13 deletions(-)
>>> 
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 218a446c4c27..c53d6dc6d332 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>>>  
>>>  struct virtnet_sq_free_stats {
>>>  	u64 packets;
>>> +	u64 xdp_packets;
>>>  	u64 bytes;
>>> +	u64 xdp_bytes;
>>>  };
>>>  
>>>  struct virtnet_sq_stats {
>>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>>>  	void *ptr;
>>>  
>>>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>>> -		++stats->packets;
>>> -
>>>  		if (!is_xdp_frame(ptr)) {
>>>  			struct sk_buff *skb = ptr;
>>>  
>>>  			pr_debug("Sent skb %p\n", skb);
>>>  
>>> +			stats->packets++;
>>>  			stats->bytes += skb->len;
>>>  			napi_consume_skb(skb, in_napi);
>>>  		} else {
>>>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>>>  
>>> -			stats->bytes += xdp_get_frame_len(frame);
>>> +			stats->xdp_packets++;
>>> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>>>  			xdp_return_frame(frame);
>>>  		}
>>>  	}
>>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>>>  	virtnet_rq_free_buf(vi, rq, buf);
>>>  }
>>>  
>>> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
>>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
>>> +			  bool in_napi)
>>>  {
>>>  	struct virtnet_sq_free_stats stats = {0};
>>>  
>>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>>>  	/* Avoid overhead when no packets have been processed
>>>  	 * happens when called speculatively from start_xmit.
>>>  	 */
>>> -	if (!stats.packets)
>>> +	if (!stats.packets && !stats.xdp_packets)
>>>  		return;
>>>  
>>> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
>>> +
>>>  	u64_stats_update_begin(&sq->stats.syncp);
>>>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>>>  	u64_stats_add(&sq->stats.packets, stats.packets);
>>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>>>  	 * early means 16 slots are typically wasted.
>>>  	 */
>>>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
>>> -		netif_stop_subqueue(dev, qnum);
>>> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>>> +
>>> +		netif_tx_stop_queue(txq);
>>>  		if (use_napi) {
>>>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>>>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>>>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>>>  			/* More just got used, free them then recheck. */
>>> -			free_old_xmit(sq, false);
>>> +			free_old_xmit(sq, txq, false);
>>>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>>>  				netif_start_subqueue(dev, qnum);
>>>  				virtqueue_disable_cb(sq->vq);
>>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>>>  
>>>  		do {
>>>  			virtqueue_disable_cb(sq->vq);
>>> -			free_old_xmit(sq, true);
>>> +			free_old_xmit(sq, txq, true);
>>>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>>>  
>>>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>>>  	txq = netdev_get_tx_queue(vi->dev, index);
>>>  	__netif_tx_lock(txq, raw_smp_processor_id());
>>>  	virtqueue_disable_cb(sq->vq);
>>> -	free_old_xmit(sq, true);
>>> +	free_old_xmit(sq, txq, true);
>>>  
>>>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>>>  		netif_tx_wake_queue(txq);
>>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>>  	struct send_queue *sq = &vi->sq[qnum];
>>>  	int err;
>>>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>>> -	bool kick = !netdev_xmit_more();
>>> +	bool xmit_more = netdev_xmit_more();
>>>  	bool use_napi = sq->napi.weight;
>>>  
>>>  	/* Free up any pending old buffers before queueing new ones. */
>>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>>  		if (use_napi)
>>>  			virtqueue_disable_cb(sq->vq);
>>>  
>>> -		free_old_xmit(sq, false);
>>> +		free_old_xmit(sq, txq, false);
>>>  
>>> -	} while (use_napi && kick &&
>>> +	} while (use_napi && !xmit_more &&
>>>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>>>  
>>>  	/* timestamp packet in software */
>>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>>  
>>>  	check_sq_full_and_disable(vi, dev, sq);
>>>  
>>> -	if (kick || netif_xmit_stopped(txq)) {
>>> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>>>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>>>  			u64_stats_update_begin(&sq->stats.syncp);
>>>  			u64_stats_inc(&sq->stats.kicks);
>>> -- 
>>> 2.44.0
>>> 
>>>
Heng Qi June 5, 2024, 11:42 a.m. UTC | #25
On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
> >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
> >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> >>> From: Jiri Pirko <jiri@nvidia.com>
> >>> 
> >>> Add support for Byte Queue Limits (BQL).
> >>
> >>Historically both Jason and Michael have attempted to support BQL
> >>for virtio-net, for example:
> >>
> >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >>
> >>These discussions focus primarily on:
> >>
> >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
> >>needs to rely on the judgment of use_napi. When the napi mode is switched to
> >>orphan, some statistical information will be lost, resulting in temporary
> >>inaccuracy in BQL.
> >>
> >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
> >>reasonable. This provides good support for BQL.
> >
> >But when the device does not support dim, the orphan mode is still
> >needed, isn't it?
> 
> Heng, is my assuption correct here? Thanks!
> 

Maybe. According to our cloud data, napi_tx=on works better than orphan mode in
most scenarios. Although orphan mode performs better in specific benchmarks,
the perf of napi_tx can be enhanced through tx dim. Then there is no reason not
to support dim for devices that want the best performance.

Back to this patch set, I think BQL, as something that affects performance,
deserves more comprehensive test data.

Thanks.

> >
> >>
> >>Thanks.
> >>
> >>> 
> >>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >>> ---
> >>>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
> >>>  1 file changed, 20 insertions(+), 13 deletions(-)
> >>> 
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index 218a446c4c27..c53d6dc6d332 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
> >>>  
> >>>  struct virtnet_sq_free_stats {
> >>>  	u64 packets;
> >>> +	u64 xdp_packets;
> >>>  	u64 bytes;
> >>> +	u64 xdp_bytes;
> >>>  };
> >>>  
> >>>  struct virtnet_sq_stats {
> >>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> >>>  	void *ptr;
> >>>  
> >>>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> >>> -		++stats->packets;
> >>> -
> >>>  		if (!is_xdp_frame(ptr)) {
> >>>  			struct sk_buff *skb = ptr;
> >>>  
> >>>  			pr_debug("Sent skb %p\n", skb);
> >>>  
> >>> +			stats->packets++;
> >>>  			stats->bytes += skb->len;
> >>>  			napi_consume_skb(skb, in_napi);
> >>>  		} else {
> >>>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
> >>>  
> >>> -			stats->bytes += xdp_get_frame_len(frame);
> >>> +			stats->xdp_packets++;
> >>> +			stats->xdp_bytes += xdp_get_frame_len(frame);
> >>>  			xdp_return_frame(frame);
> >>>  		}
> >>>  	}
> >>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
> >>>  	virtnet_rq_free_buf(vi, rq, buf);
> >>>  }
> >>>  
> >>> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
> >>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> >>> +			  bool in_napi)
> >>>  {
> >>>  	struct virtnet_sq_free_stats stats = {0};
> >>>  
> >>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
> >>>  	/* Avoid overhead when no packets have been processed
> >>>  	 * happens when called speculatively from start_xmit.
> >>>  	 */
> >>> -	if (!stats.packets)
> >>> +	if (!stats.packets && !stats.xdp_packets)
> >>>  		return;
> >>>  
> >>> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
> >>> +
> >>>  	u64_stats_update_begin(&sq->stats.syncp);
> >>>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
> >>>  	u64_stats_add(&sq->stats.packets, stats.packets);
> >>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
> >>>  	 * early means 16 slots are typically wasted.
> >>>  	 */
> >>>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
> >>> -		netif_stop_subqueue(dev, qnum);
> >>> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> >>> +
> >>> +		netif_tx_stop_queue(txq);
> >>>  		if (use_napi) {
> >>>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
> >>>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
> >>>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
> >>>  			/* More just got used, free them then recheck. */
> >>> -			free_old_xmit(sq, false);
> >>> +			free_old_xmit(sq, txq, false);
> >>>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
> >>>  				netif_start_subqueue(dev, qnum);
> >>>  				virtqueue_disable_cb(sq->vq);
> >>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
> >>>  
> >>>  		do {
> >>>  			virtqueue_disable_cb(sq->vq);
> >>> -			free_old_xmit(sq, true);
> >>> +			free_old_xmit(sq, txq, true);
> >>>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> >>>  
> >>>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> >>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> >>>  	txq = netdev_get_tx_queue(vi->dev, index);
> >>>  	__netif_tx_lock(txq, raw_smp_processor_id());
> >>>  	virtqueue_disable_cb(sq->vq);
> >>> -	free_old_xmit(sq, true);
> >>> +	free_old_xmit(sq, txq, true);
> >>>  
> >>>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> >>>  		netif_tx_wake_queue(txq);
> >>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>  	struct send_queue *sq = &vi->sq[qnum];
> >>>  	int err;
> >>>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> >>> -	bool kick = !netdev_xmit_more();
> >>> +	bool xmit_more = netdev_xmit_more();
> >>>  	bool use_napi = sq->napi.weight;
> >>>  
> >>>  	/* Free up any pending old buffers before queueing new ones. */
> >>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>  		if (use_napi)
> >>>  			virtqueue_disable_cb(sq->vq);
> >>>  
> >>> -		free_old_xmit(sq, false);
> >>> +		free_old_xmit(sq, txq, false);
> >>>  
> >>> -	} while (use_napi && kick &&
> >>> +	} while (use_napi && !xmit_more &&
> >>>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> >>>  
> >>>  	/* timestamp packet in software */
> >>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>  
> >>>  	check_sq_full_and_disable(vi, dev, sq);
> >>>  
> >>> -	if (kick || netif_xmit_stopped(txq)) {
> >>> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
> >>>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
> >>>  			u64_stats_update_begin(&sq->stats.syncp);
> >>>  			u64_stats_inc(&sq->stats.kicks);
> >>> -- 
> >>> 2.44.0
> >>> 
> >>>
Jason Wang June 6, 2024, 12:20 a.m. UTC | #26
On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
>
> On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
> > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
> > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > >>> From: Jiri Pirko <jiri@nvidia.com>
> > >>>
> > >>> Add support for Byte Queue Limits (BQL).
> > >>
> > >>Historically both Jason and Michael have attempted to support BQL
> > >>for virtio-net, for example:
> > >>
> > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> > >>
> > >>These discussions focus primarily on:
> > >>
> > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
> > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
> > >>orphan, some statistical information will be lost, resulting in temporary
> > >>inaccuracy in BQL.
> > >>
> > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
> > >>reasonable. This provides good support for BQL.
> > >
> > >But when the device does not support dim, the orphan mode is still
> > >needed, isn't it?
> >
> > Heng, is my assuption correct here? Thanks!
> >
>
> Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
> most scenarios. Although orphan mode performs better in specific benckmark,

For example pktgen (I meant that even if orphan mode can break pktgen,
it can still finish when there's a new packet that needs to be sent after
pktgen is completed).

> perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
> support dim for devices that want the best performance.

Ideally, if we can drop orphan mode, everything would be simplified.

>
> Back to this patch set, I think BQL as a point that affects performance should
> deserve more comprehensive test data.

Thanks

>
> Thanks.
>
> > >
> > >>
> > >>Thanks.
> > >>
> > >>>
> > >>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> > >>> ---
> > >>>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
> > >>>  1 file changed, 20 insertions(+), 13 deletions(-)
> > >>>
> > >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > >>> index 218a446c4c27..c53d6dc6d332 100644
> > >>> --- a/drivers/net/virtio_net.c
> > >>> +++ b/drivers/net/virtio_net.c
> > >>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
> > >>>
> > >>>  struct virtnet_sq_free_stats {
> > >>>   u64 packets;
> > >>> + u64 xdp_packets;
> > >>>   u64 bytes;
> > >>> + u64 xdp_bytes;
> > >>>  };
> > >>>
> > >>>  struct virtnet_sq_stats {
> > >>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
> > >>>   void *ptr;
> > >>>
> > >>>   while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > >>> -         ++stats->packets;
> > >>> -
> > >>>           if (!is_xdp_frame(ptr)) {
> > >>>                   struct sk_buff *skb = ptr;
> > >>>
> > >>>                   pr_debug("Sent skb %p\n", skb);
> > >>>
> > >>> +                 stats->packets++;
> > >>>                   stats->bytes += skb->len;
> > >>>                   napi_consume_skb(skb, in_napi);
> > >>>           } else {
> > >>>                   struct xdp_frame *frame = ptr_to_xdp(ptr);
> > >>>
> > >>> -                 stats->bytes += xdp_get_frame_len(frame);
> > >>> +                 stats->xdp_packets++;
> > >>> +                 stats->xdp_bytes += xdp_get_frame_len(frame);
> > >>>                   xdp_return_frame(frame);
> > >>>           }
> > >>>   }
> > >>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
> > >>>   virtnet_rq_free_buf(vi, rq, buf);
> > >>>  }
> > >>>
> > >>> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
> > >>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > >>> +                   bool in_napi)
> > >>>  {
> > >>>   struct virtnet_sq_free_stats stats = {0};
> > >>>
> > >>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
> > >>>   /* Avoid overhead when no packets have been processed
> > >>>    * happens when called speculatively from start_xmit.
> > >>>    */
> > >>> - if (!stats.packets)
> > >>> + if (!stats.packets && !stats.xdp_packets)
> > >>>           return;
> > >>>
> > >>> + netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
> > >>> +
> > >>>   u64_stats_update_begin(&sq->stats.syncp);
> > >>>   u64_stats_add(&sq->stats.bytes, stats.bytes);
> > >>>   u64_stats_add(&sq->stats.packets, stats.packets);
> > >>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
> > >>>    * early means 16 slots are typically wasted.
> > >>>    */
> > >>>   if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
> > >>> -         netif_stop_subqueue(dev, qnum);
> > >>> +         struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> > >>> +
> > >>> +         netif_tx_stop_queue(txq);
> > >>>           if (use_napi) {
> > >>>                   if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
> > >>>                           virtqueue_napi_schedule(&sq->napi, sq->vq);
> > >>>           } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
> > >>>                   /* More just got used, free them then recheck. */
> > >>> -                 free_old_xmit(sq, false);
> > >>> +                 free_old_xmit(sq, txq, false);
> > >>>                   if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
> > >>>                           netif_start_subqueue(dev, qnum);
> > >>>                           virtqueue_disable_cb(sq->vq);
> > >>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
> > >>>
> > >>>           do {
> > >>>                   virtqueue_disable_cb(sq->vq);
> > >>> -                 free_old_xmit(sq, true);
> > >>> +                 free_old_xmit(sq, txq, true);
> > >>>           } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> > >>>
> > >>>           if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> > >>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > >>>   txq = netdev_get_tx_queue(vi->dev, index);
> > >>>   __netif_tx_lock(txq, raw_smp_processor_id());
> > >>>   virtqueue_disable_cb(sq->vq);
> > >>> - free_old_xmit(sq, true);
> > >>> + free_old_xmit(sq, txq, true);
> > >>>
> > >>>   if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> > >>>           netif_tx_wake_queue(txq);
> > >>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > >>>   struct send_queue *sq = &vi->sq[qnum];
> > >>>   int err;
> > >>>   struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> > >>> - bool kick = !netdev_xmit_more();
> > >>> + bool xmit_more = netdev_xmit_more();
> > >>>   bool use_napi = sq->napi.weight;
> > >>>
> > >>>   /* Free up any pending old buffers before queueing new ones. */
> > >>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > >>>           if (use_napi)
> > >>>                   virtqueue_disable_cb(sq->vq);
> > >>>
> > >>> -         free_old_xmit(sq, false);
> > >>> +         free_old_xmit(sq, txq, false);
> > >>>
> > >>> - } while (use_napi && kick &&
> > >>> + } while (use_napi && !xmit_more &&
> > >>>          unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> > >>>
> > >>>   /* timestamp packet in software */
> > >>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > >>>
> > >>>   check_sq_full_and_disable(vi, dev, sq);
> > >>>
> > >>> - if (kick || netif_xmit_stopped(txq)) {
> > >>> + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
> > >>>           if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
> > >>>                   u64_stats_update_begin(&sq->stats.syncp);
> > >>>                   u64_stats_inc(&sq->stats.kicks);
> > >>> --
> > >>> 2.44.0
> > >>>
> > >>>
>
Jason Xing June 6, 2024, 2:58 a.m. UTC | #27
Hello Jason,

On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> >
> > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
> > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
> > > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > > >>> From: Jiri Pirko <jiri@nvidia.com>
> > > >>>
> > > >>> Add support for Byte Queue Limits (BQL).
> > > >>
> > > >>Historically both Jason and Michael have attempted to support BQL
> > > >>for virtio-net, for example:
> > > >>
> > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> > > >>
> > > >>These discussions focus primarily on:
> > > >>
> > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
> > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
> > > >>orphan, some statistical information will be lost, resulting in temporary
> > > >>inaccuracy in BQL.
> > > >>
> > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
> > > >>reasonable. This provides good support for BQL.
> > > >
> > > >But when the device does not support dim, the orphan mode is still
> > > >needed, isn't it?
> > >
> > > Heng, is my assuption correct here? Thanks!
> > >
> >
> > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
> > most scenarios. Although orphan mode performs better in specific benckmark,
>
> For example pktgen (I meant even if the orphan mode can break pktgen,
> it can finish when there's a new packet that needs to be sent after
> pktgen is completed).
>
> > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
> > support dim for devices that want the best performance.
>
> Ideally, if we can drop orphan mode, everything would be simplified.

Please, please don't do this. Orphan mode still has its merits. In some
cases that can hardly be reproduced in production, we still choose to
turn off napi_tx mode because the delay in freeing an skb can
cause lower performance in the tx path, which is, I know, surely
designed on purpose.

If the orphan mode code doesn't have an impact when you enable
napi_tx mode, please keep it if you can.

Thank you.
Jason Wang June 6, 2024, 4:25 a.m. UTC | #28
On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> Hello Jason,
>
> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > >
> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
> > > > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > > > >>> From: Jiri Pirko <jiri@nvidia.com>
> > > > >>>
> > > > >>> Add support for Byte Queue Limits (BQL).
> > > > >>
> > > > >>Historically both Jason and Michael have attempted to support BQL
> > > > >>for virtio-net, for example:
> > > > >>
> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> > > > >>
> > > > >>These discussions focus primarily on:
> > > > >>
> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
> > > > >>orphan, some statistical information will be lost, resulting in temporary
> > > > >>inaccuracy in BQL.
> > > > >>
> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
> > > > >>reasonable. This provides good support for BQL.
> > > > >
> > > > >But when the device does not support dim, the orphan mode is still
> > > > >needed, isn't it?
> > > >
> > > > Heng, is my assuption correct here? Thanks!
> > > >
> > >
> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
> > > most scenarios. Although orphan mode performs better in specific benckmark,
> >
> > For example pktgen (I meant even if the orphan mode can break pktgen,
> > it can finish when there's a new packet that needs to be sent after
> > pktgen is completed).
> >
> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
> > > support dim for devices that want the best performance.
> >
> > Ideally, if we can drop orphan mode, everything would be simplified.
>
> Please please don't do this. Orphan mode still has its merits. In some
> cases which can hardly be reproduced in production, we still choose to
> turn off the napi_tx mode because the delay of freeing a skb could
> cause lower performance in the tx path,

Well, it's probably just a side effect and it depends on how to define
performance here.

> which is, I know, surely
> designed on purpose.

I don't think so and no modern NIC uses that. It breaks a lot of things.

>
> If the codes of orphan mode don't have an impact when you enable
> napi_tx mode, please keep it if you can.

For example, it complicates BQL implementation.

Thanks

>
> Thank you.
>
Michael S. Tsirkin June 6, 2024, 6:05 a.m. UTC | #29
On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
> > If the codes of orphan mode don't have an impact when you enable
> > napi_tx mode, please keep it if you can.
> 
> For example, it complicates BQL implementation.
> 
> Thanks

I very much doubt sending interrupts to a VM can
*on all benchmarks* compete with not sending interrupts.

So yea, it's great if napi and hardware are advanced enough
that the default can be changed, since this way virtio
is closer to a regular NIC and more of the standard
infrastructure can be used.

But dropping it will go against *no breaking userspace* rule.
Complicated? Tough.
Jiri Pirko June 6, 2024, 7:30 a.m. UTC | #30
Thu, Jun 06, 2024 at 04:20:48AM CEST, dave.taht@gmail.com wrote:
>On Wed, May 15, 2024 at 12:37 AM Jiri Pirko <jiri@resnulli.us> wrote:
>
>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> >>
>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> >> >>
>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> >
>> >> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >> >> >Thanks!
>> >> >> >>
>> >> >> >> More info about the BQL in general is here:
>> >> >> >> https://lwn.net/Articles/469652/
>> >> >> >
>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >> >mostly I got the feedback from net core maintainers that it likely
>> won't
>> >> >> >benefit virtio.
>> >> >>
>> >> >> Do you have some link to that, or is it this thread:
>> >> >>
>> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >> >
>> >> >
>> >> >A quick search on lore turned up this, for example:
>> >> >
>> https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> >>
>> >> Says:
>> >> "Note that NIC with many TX queues make BQL almost useless, only adding
>> extra
>> >>  overhead."
>> >>
>> >> But virtio can have one tx queue, I guess that could be quite common
>> >> configuration in lot of deployments.
>> >
>> >Not sure we should worry about performance for these though.
>> >What I am saying is this should come with some benchmarking
>> >results.
>>
>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>> queue pair:
>>
>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>>
>> RR result with no bql:
>> 29.95
>> 32.74
>> 28.77
>>
>> RR result with bql:
>> 222.98
>> 159.81
>> 197.88
>>
>>
>>
>Yay! is that with pfifo_fast or fq_codel as the underlying qdisc?

pfifo_fast


>
>fq_codel, please?
>
>
>>
>> >
>> >
>> >>
>> >> >
>> >> >
>> >> >
>> >> >
>> >> >> I don't see why virtio should be any different from other
>> >> >> drivers/devices that benefit from bql. HOL blocking is the same here
>> are
>> >> >> everywhere.
>> >> >>
>> >> >> >
>> >> >> >So I'm asking, what kind of benefit do you observe?
>> >> >>
>> >> >> I don't have measurements at hand, will attach them to v2.
>> >> >>
>> >> >> Thanks!
>> >> >>
>> >> >> >
>> >> >> >--
>> >> >> >MST
>> >> >> >
>> >> >
>> >
>>
>>
>
>-- 
>https://www.youtube.com/watch?v=BVFWSyMp3xg&t=1098s Waves Podcast
>Dave Täht CSO, LibreQos
Jason Wang June 6, 2024, 7:56 a.m. UTC | #31
On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
> > > If the codes of orphan mode don't have an impact when you enable
> > > napi_tx mode, please keep it if you can.
> >
> > For example, it complicates BQL implementation.
> >
> > Thanks
>
> I very much doubt sending interrupts to a VM can
> *on all benchmarks* compete with not sending interrupts.

It should not differ too much from the physical NIC. We can have one
more round of benchmarks to see the difference.

But if NAPI mode needs to win all of the benchmarks in order to get
rid of orphan, that would be very difficult. Considering various bugs
will be fixed by dropping skb_orphan(), it would be sufficient if most
of the benchmarks don't show obvious differences.

Looking at git history, there're commits that removes skb_orphan(), for example:

commit 8112ec3b8722680251aecdcc23dfd81aa7af6340
Author: Eric Dumazet <edumazet@google.com>
Date:   Fri Sep 28 07:53:26 2012 +0000

    mlx4: dont orphan skbs in mlx4_en_xmit()

    After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX
    completions) we no longer need to orphan skbs in mlx4_en_xmit()
    since skb wont stay a long time in TX ring before their release.

    Orphaning skbs in ndo_start_xmit() should be avoided as much as
    possible, since it breaks TCP Small Queue or other flow control
    mechanisms (per socket limits)

    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com>
    Cc: Or Gerlitz <ogerlitz@mellanox.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

>
> So yea, it's great if napi and hardware are advanced enough
> that the default can be changed, since this way virtio
> is closer to a regular nic and more or standard
> infrastructure can be used.
>
> But dropping it will go against *no breaking userspace* rule.
> Complicated? Tough.

I don't know what kind of userspace is broken by this, or why it has
not been broken since the day we enabled NAPI mode by default.

Thanks

>
> --
> MST
>
Jason Xing June 6, 2024, 11:42 a.m. UTC | #32
On Thu, Jun 6, 2024 at 12:25 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > Hello Jason,
> >
> > On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > > >
> > > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
> > > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
> > > > > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > > > > >>> From: Jiri Pirko <jiri@nvidia.com>
> > > > > >>>
> > > > > >>> Add support for Byte Queue Limits (BQL).
> > > > > >>
> > > > > >>Historically both Jason and Michael have attempted to support BQL
> > > > > >>for virtio-net, for example:
> > > > > >>
> > > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> > > > > >>
> > > > > >>These discussions focus primarily on:
> > > > > >>
> > > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
> > > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
> > > > > >>orphan, some statistical information will be lost, resulting in temporary
> > > > > >>inaccuracy in BQL.
> > > > > >>
> > > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
> > > > > >>reasonable. This provides good support for BQL.
> > > > > >
> > > > > >But when the device does not support dim, the orphan mode is still
> > > > > >needed, isn't it?
> > > > >
> > > > > Heng, is my assuption correct here? Thanks!
> > > > >
> > > >
> > > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
> > > > most scenarios. Although orphan mode performs better in specific benckmark,
> > >
> > > For example pktgen (I meant even if the orphan mode can break pktgen,
> > > it can finish when there's a new packet that needs to be sent after
> > > pktgen is completed).
> > >
> > > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
> > > > support dim for devices that want the best performance.
> > >
> > > Ideally, if we can drop orphan mode, everything would be simplified.
> >
> > Please please don't do this. Orphan mode still has its merits. In some
> > cases which can hardly be reproduced in production, we still choose to
> > turn off the napi_tx mode because the delay of freeing a skb could
> > cause lower performance in the tx path,
>
> Well, it's probably just a side effect and it depends on how to define
> performance here.

Yes.

>
> > which is, I know, surely
> > designed on purpose.
>
> I don't think so and no modern NIC uses that. It breaks a lot of things.

To avoid confusion, I meant that napi_tx mode can delay/slow down the
tx path, and that no modern NIC uses skb_orphan().

I think I will have some time to test BQL in virtio_net.

Thanks,
Jason
Michael S. Tsirkin June 6, 2024, noon UTC | #33
On Thu, Jun 06, 2024 at 07:42:35PM +0800, Jason Xing wrote:
> On Thu, Jun 6, 2024 at 12:25 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> > >
> > > Hello Jason,
> > >
> > > On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > > > >
> > > > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > > > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
> > > > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
> > > > > > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > > > > > >>> From: Jiri Pirko <jiri@nvidia.com>
> > > > > > >>>
> > > > > > >>> Add support for Byte Queue Limits (BQL).
> > > > > > >>
> > > > > > >>Historically both Jason and Michael have attempted to support BQL
> > > > > > >>for virtio-net, for example:
> > > > > > >>
> > > > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> > > > > > >>
> > > > > > >>These discussions focus primarily on:
> > > > > > >>
> > > > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
> > > > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
> > > > > > >>orphan, some statistical information will be lost, resulting in temporary
> > > > > > >>inaccuracy in BQL.
> > > > > > >>
> > > > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
> > > > > > >>reasonable. This provides good support for BQL.
> > > > > > >
> > > > > > >But when the device does not support dim, the orphan mode is still
> > > > > > >needed, isn't it?
> > > > > >
> > > > > > Heng, is my assuption correct here? Thanks!
> > > > > >
> > > > >
> > > > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
> > > > > most scenarios. Although orphan mode performs better in specific benckmark,
> > > >
> > > > For example pktgen (I meant even if the orphan mode can break pktgen,
> > > > it can finish when there's a new packet that needs to be sent after
> > > > pktgen is completed).
> > > >
> > > > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
> > > > > support dim for devices that want the best performance.
> > > >
> > > > Ideally, if we can drop orphan mode, everything would be simplified.
> > >
> > > Please please don't do this. Orphan mode still has its merits. In some
> > > cases which can hardly be reproduced in production, we still choose to
> > > turn off the napi_tx mode because the delay of freeing a skb could
> > > cause lower performance in the tx path,
> >
> > Well, it's probably just a side effect and it depends on how to define
> > performance here.
> 
> Yes.
> 
> >
> > > which is, I know, surely
> > > designed on purpose.
> >
> > I don't think so and no modern NIC uses that. It breaks a lot of things.
> 
> To avoid confusion, I meant napi_tx mode can delay/slow down the speed
> in the tx path and no modern nic uses skb_orphan().

Clearly it's been designed for software NICs and when the
cost of interrupts is very high.

> I think I will have some time to test BQL in virtio_net.
> 
> Thanks,
> Jason
Jiri Pirko June 6, 2024, 1:41 p.m. UTC | #34
Thu, Jun 06, 2024 at 06:25:15AM CEST, jasowang@redhat.com wrote:
>On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>>
>> Hello Jason,
>>
>> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote:
>> >
>> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
>> > >
>> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
>> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
>> > > > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>> > > > >>> From: Jiri Pirko <jiri@nvidia.com>
>> > > > >>>
>> > > > >>> Add support for Byte Queue Limits (BQL).
>> > > > >>
>> > > > >>Historically both Jason and Michael have attempted to support BQL
>> > > > >>for virtio-net, for example:
>> > > > >>
>> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> > > > >>
>> > > > >>These discussions focus primarily on:
>> > > > >>
>> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
>> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
>> > > > >>orphan, some statistical information will be lost, resulting in temporary
>> > > > >>inaccuracy in BQL.
>> > > > >>
>> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
>> > > > >>reasonable. This provides good support for BQL.
>> > > > >
>> > > > >But when the device does not support dim, the orphan mode is still
>> > > > >needed, isn't it?
>> > > >
>> > > > Heng, is my assuption correct here? Thanks!
>> > > >
>> > >
>> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
>> > > most scenarios. Although orphan mode performs better in specific benckmark,
>> >
>> > For example pktgen (I meant even if the orphan mode can break pktgen,
>> > it can finish when there's a new packet that needs to be sent after
>> > pktgen is completed).
>> >
>> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
>> > > support dim for devices that want the best performance.
>> >
>> > Ideally, if we can drop orphan mode, everything would be simplified.
>>
>> Please please don't do this. Orphan mode still has its merits. In some
>> cases which can hardly be reproduced in production, we still choose to
>> turn off the napi_tx mode because the delay of freeing a skb could
>> cause lower performance in the tx path,
>
>Well, it's probably just a side effect and it depends on how to define
>performance here.
>
>> which is, I know, surely
>> designed on purpose.
>
>I don't think so and no modern NIC uses that. It breaks a lot of things.
>
>>
>> If the codes of orphan mode don't have an impact when you enable
>> napi_tx mode, please keep it if you can.
>
>For example, it complicates BQL implementation.

Well, bql could be disabled when napi is not used. It is just a matter
of one "if" in the xmit path.
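
Roughly something like this (only a sketch against the structure of this
patch, not tested; use_napi would have to be passed down to free_old_xmit()
or derived from sq->napi.weight):

	/* start_xmit(): only let BQL account the skb when tx napi is used,
	 * otherwise keep the old kick condition
	 */
	if (use_napi)
		kick = __netdev_tx_sent_queue(txq, skb->len, xmit_more);
	else
		kick = !xmit_more || netif_xmit_stopped(txq);

	/* free_old_xmit(): mirror the same condition on the completion side
	 * so bytes are never completed that were not accounted
	 */
	if (use_napi)
		netdev_tx_completed_queue(txq, stats.packets, stats.bytes);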


>
>Thanks
>
>>
>> Thank you.
>>
>
Jiri Pirko June 6, 2024, 1:45 p.m. UTC | #35
Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote:
>On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>>
>> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
>> > > If the codes of orphan mode don't have an impact when you enable
>> > > napi_tx mode, please keep it if you can.
>> >
>> > For example, it complicates BQL implementation.
>> >
>> > Thanks
>>
>> I very much doubt sending interrupts to a VM can
>> *on all benchmarks* compete with not sending interrupts.
>
>It should not differ too much from the physical NIC. We can have one
>more round of benchmarks to see the difference.
>
>But if NAPI mode needs to win all of the benchmarks in order to get
>rid of orphan, that would be very difficult. Considering various bugs
>will be fixed by dropping skb_orphan(), it would be sufficient if most
>of the benchmark doesn't show obvious differences.
>
>Looking at git history, there're commits that removes skb_orphan(), for example:
>
>commit 8112ec3b8722680251aecdcc23dfd81aa7af6340
>Author: Eric Dumazet <edumazet@google.com>
>Date:   Fri Sep 28 07:53:26 2012 +0000
>
>    mlx4: dont orphan skbs in mlx4_en_xmit()
>
>    After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX
>    completions) we no longer need to orphan skbs in mlx4_en_xmit()
>    since skb wont stay a long time in TX ring before their release.
>
>    Orphaning skbs in ndo_start_xmit() should be avoided as much as
>    possible, since it breaks TCP Small Queue or other flow control
>    mechanisms (per socket limits)
>
>    Signed-off-by: Eric Dumazet <edumazet@google.com>
>    Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com>
>    Cc: Or Gerlitz <ogerlitz@mellanox.com>
>    Signed-off-by: David S. Miller <davem@davemloft.net>
>
>>
>> So yea, it's great if napi and hardware are advanced enough
>> that the default can be changed, since this way virtio
>> is closer to a regular nic and more or standard
>> infrastructure can be used.
>>
>> But dropping it will go against *no breaking userspace* rule.
>> Complicated? Tough.
>
>I don't know what kind of userspace is broken by this. Or why it is
>not broken since the day we enable NAPI mode by default.

There is a module option that explicitly allows user to set
napi_tx=false
or
napi_weight=0

So if you remove this option or ignore it, either breaks the user's
expectation. I personally would vote for this breakage. To carry ancient
things like this one forever does not make sense to me. While at it,
let's remove all virtio net module params. Thoughts?



>
>Thanks
>
>>
>> --
>> MST
>>
>
Jason Wang June 7, 2024, 6:22 a.m. UTC | #36
On Thu, Jun 6, 2024 at 9:41 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> Thu, Jun 06, 2024 at 06:25:15AM CEST, jasowang@redhat.com wrote:
> >On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >>
> >> Hello Jason,
> >>
> >> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote:
> >> >
> >> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> >> > >
> >> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> >> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
> >> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
> >> > > > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> >> > > > >>> From: Jiri Pirko <jiri@nvidia.com>
> >> > > > >>>
> >> > > > >>> Add support for Byte Queue Limits (BQL).
> >> > > > >>
> >> > > > >>Historically both Jason and Michael have attempted to support BQL
> >> > > > >>for virtio-net, for example:
> >> > > > >>
> >> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >> > > > >>
> >> > > > >>These discussions focus primarily on:
> >> > > > >>
> >> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
> >> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
> >> > > > >>orphan, some statistical information will be lost, resulting in temporary
> >> > > > >>inaccuracy in BQL.
> >> > > > >>
> >> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
> >> > > > >>reasonable. This provides good support for BQL.
> >> > > > >
> >> > > > >But when the device does not support dim, the orphan mode is still
> >> > > > >needed, isn't it?
> >> > > >
> >> > > > Heng, is my assuption correct here? Thanks!
> >> > > >
> >> > >
> >> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
> >> > > most scenarios. Although orphan mode performs better in specific benckmark,
> >> >
> >> > For example pktgen (I meant even if the orphan mode can break pktgen,
> >> > it can finish when there's a new packet that needs to be sent after
> >> > pktgen is completed).
> >> >
> >> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
> >> > > support dim for devices that want the best performance.
> >> >
> >> > Ideally, if we can drop orphan mode, everything would be simplified.
> >>
> >> Please please don't do this. Orphan mode still has its merits. In some
> >> cases which can hardly be reproduced in production, we still choose to
> >> turn off the napi_tx mode because the delay of freeing a skb could
> >> cause lower performance in the tx path,
> >
> >Well, it's probably just a side effect and it depends on how to define
> >performance here.
> >
> >> which is, I know, surely
> >> designed on purpose.
> >
> >I don't think so and no modern NIC uses that. It breaks a lot of things.
> >
> >>
> >> If the codes of orphan mode don't have an impact when you enable
> >> napi_tx mode, please keep it if you can.
> >
> >For example, it complicates BQL implementation.
>
> Well, bql could be disabled when napi is not used. It is just a matter
> of one "if" in the xmit path.

Maybe, care to post a patch?

The tricky part is that an skb may be queued while BQL is enabled but
sent while BQL is disabled, as discussed here:

https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
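
One possible (untested) way to cope with that would be to flush the BQL
state under the tx lock whenever tx napi is toggled, for example:

	/* sketch only: when switching tx napi on/off, drop any stale BQL
	 * accounting so bytes queued in one mode are not expected to be
	 * completed in the other
	 */
	__netif_tx_lock_bh(txq);
	netdev_tx_reset_queue(txq);	/* resets the dql counters and clears the BQL stop bit */
	__netif_tx_unlock_bh(txq);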

Thanks

>
>
> >
> >Thanks
> >
> >>
> >> Thank you.
> >>
> >
>
Jason Wang June 7, 2024, 6:25 a.m. UTC | #37
On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote:
> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>
> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
> >> > > If the codes of orphan mode don't have an impact when you enable
> >> > > napi_tx mode, please keep it if you can.
> >> >
> >> > For example, it complicates BQL implementation.
> >> >
> >> > Thanks
> >>
> >> I very much doubt sending interrupts to a VM can
> >> *on all benchmarks* compete with not sending interrupts.
> >
> >It should not differ too much from the physical NIC. We can have one
> >more round of benchmarks to see the difference.
> >
> >But if NAPI mode needs to win all of the benchmarks in order to get
> >rid of orphan, that would be very difficult. Considering various bugs
> >will be fixed by dropping skb_orphan(), it would be sufficient if most
> >of the benchmark doesn't show obvious differences.
> >
> >Looking at git history, there're commits that removes skb_orphan(), for example:
> >
> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340
> >Author: Eric Dumazet <edumazet@google.com>
> >Date:   Fri Sep 28 07:53:26 2012 +0000
> >
> >    mlx4: dont orphan skbs in mlx4_en_xmit()
> >
> >    After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX
> >    completions) we no longer need to orphan skbs in mlx4_en_xmit()
> >    since skb wont stay a long time in TX ring before their release.
> >
> >    Orphaning skbs in ndo_start_xmit() should be avoided as much as
> >    possible, since it breaks TCP Small Queue or other flow control
> >    mechanisms (per socket limits)
> >
> >    Signed-off-by: Eric Dumazet <edumazet@google.com>
> >    Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com>
> >    Cc: Or Gerlitz <ogerlitz@mellanox.com>
> >    Signed-off-by: David S. Miller <davem@davemloft.net>
> >
> >>
> >> So yea, it's great if napi and hardware are advanced enough
> >> that the default can be changed, since this way virtio
> >> is closer to a regular nic and more or standard
> >> infrastructure can be used.
> >>
> >> But dropping it will go against *no breaking userspace* rule.
> >> Complicated? Tough.
> >
> >I don't know what kind of userspace is broken by this. Or why it is
> >not broken since the day we enable NAPI mode by default.
>
> There is a module option that explicitly allows user to set
> napi_tx=false
> or
> napi_weight=0
>
> So if you remove this option or ignore it, both breaks the user
> expectation.

We can keep them, but I wonder what's the expectation of the user
here? The only thing so far I can imagine is the performance
difference.

> I personally would vote for this breakage. To carry ancient
> things like this one forever does not make sense to me.

Exactly.

> While at it,
> let's remove all virtio net module params. Thoughts?

I tend to

1) drop the orphan mode, but we can have some benchmarks first
2) keep the module parameters

Thanks

>
>
>
> >
> >Thanks
> >
> >>
> >> --
> >> MST
> >>
> >
>
Jiri Pirko June 7, 2024, 6:39 a.m. UTC | #38
Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote:
>On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote:
>>
>> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote:
>> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>> >>
>> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
>> >> > > If the codes of orphan mode don't have an impact when you enable
>> >> > > napi_tx mode, please keep it if you can.
>> >> >
>> >> > For example, it complicates BQL implementation.
>> >> >
>> >> > Thanks
>> >>
>> >> I very much doubt sending interrupts to a VM can
>> >> *on all benchmarks* compete with not sending interrupts.
>> >
>> >It should not differ too much from the physical NIC. We can have one
>> >more round of benchmarks to see the difference.
>> >
>> >But if NAPI mode needs to win all of the benchmarks in order to get
>> >rid of orphan, that would be very difficult. Considering various bugs
>> >will be fixed by dropping skb_orphan(), it would be sufficient if most
>> >of the benchmark doesn't show obvious differences.
>> >
>> >Looking at git history, there're commits that removes skb_orphan(), for example:
>> >
>> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340
>> >Author: Eric Dumazet <edumazet@google.com>
>> >Date:   Fri Sep 28 07:53:26 2012 +0000
>> >
>> >    mlx4: dont orphan skbs in mlx4_en_xmit()
>> >
>> >    After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX
>> >    completions) we no longer need to orphan skbs in mlx4_en_xmit()
>> >    since skb wont stay a long time in TX ring before their release.
>> >
>> >    Orphaning skbs in ndo_start_xmit() should be avoided as much as
>> >    possible, since it breaks TCP Small Queue or other flow control
>> >    mechanisms (per socket limits)
>> >
>> >    Signed-off-by: Eric Dumazet <edumazet@google.com>
>> >    Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com>
>> >    Cc: Or Gerlitz <ogerlitz@mellanox.com>
>> >    Signed-off-by: David S. Miller <davem@davemloft.net>
>> >
>> >>
>> >> So yea, it's great if napi and hardware are advanced enough
>> >> that the default can be changed, since this way virtio
>> >> is closer to a regular nic and more or standard
>> >> infrastructure can be used.
>> >>
>> >> But dropping it will go against *no breaking userspace* rule.
>> >> Complicated? Tough.
>> >
>> >I don't know what kind of userspace is broken by this. Or why it is
>> >not broken since the day we enable NAPI mode by default.
>>
>> There is a module option that explicitly allows user to set
>> napi_tx=false
>> or
>> napi_weight=0
>>
>> So if you remove this option or ignore it, both breaks the user
>> expectation.
>
>We can keep them, but I wonder what's the expectation of the user
>here? The only thing so far I can imagine is the performance
>difference.

True.

>
>> I personally would vote for this breakage. To carry ancient
>> things like this one forever does not make sense to me.
>
>Exactly.
>
>> While at it,
>> let's remove all virtio net module params. Thoughts?
>
>I tend to
>
>1) drop the orphan mode, but we can have some benchmarks first

Any idea which? I assume it would be really tricky to find the ones
where orphan mode makes a difference.


>2) keep the module parameters

and ignore them, correct? Perhaps a warning would be good.


>
>Thanks
>
>>
>>
>>
>> >
>> >Thanks
>> >
>> >>
>> >> --
>> >> MST
>> >>
>> >
>>
>
Michael S. Tsirkin June 7, 2024, 6:39 a.m. UTC | #39
On Fri, Jun 07, 2024 at 02:22:31PM +0800, Jason Wang wrote:
> On Thu, Jun 6, 2024 at 9:41 PM Jiri Pirko <jiri@resnulli.us> wrote:
> >
> > Thu, Jun 06, 2024 at 06:25:15AM CEST, jasowang@redhat.com wrote:
> > >On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> > >>
> > >> Hello Jason,
> > >>
> > >> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote:
> > >> >
> > >> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > >> > >
> > >> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > >> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
> > >> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
> > >> > > > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> > >> > > > >>> From: Jiri Pirko <jiri@nvidia.com>
> > >> > > > >>>
> > >> > > > >>> Add support for Byte Queue Limits (BQL).
> > >> > > > >>
> > >> > > > >>Historically both Jason and Michael have attempted to support BQL
> > >> > > > >>for virtio-net, for example:
> > >> > > > >>
> > >> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> > >> > > > >>
> > >> > > > >>These discussions focus primarily on:
> > >> > > > >>
> > >> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
> > >> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
> > >> > > > >>orphan, some statistical information will be lost, resulting in temporary
> > >> > > > >>inaccuracy in BQL.
> > >> > > > >>
> > >> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
> > >> > > > >>reasonable. This provides good support for BQL.
> > >> > > > >
> > >> > > > >But when the device does not support dim, the orphan mode is still
> > >> > > > >needed, isn't it?
> > >> > > >
> > >> > > > Heng, is my assuption correct here? Thanks!
> > >> > > >
> > >> > >
> > >> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
> > >> > > most scenarios. Although orphan mode performs better in specific benckmark,
> > >> >
> > >> > For example pktgen (I meant even if the orphan mode can break pktgen,
> > >> > it can finish when there's a new packet that needs to be sent after
> > >> > pktgen is completed).
> > >> >
> > >> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
> > >> > > support dim for devices that want the best performance.
> > >> >
> > >> > Ideally, if we can drop orphan mode, everything would be simplified.
> > >>
> > >> Please please don't do this. Orphan mode still has its merits. In some
> > >> cases which can hardly be reproduced in production, we still choose to
> > >> turn off the napi_tx mode because the delay of freeing a skb could
> > >> cause lower performance in the tx path,
> > >
> > >Well, it's probably just a side effect and it depends on how to define
> > >performance here.
> > >
> > >> which is, I know, surely
> > >> designed on purpose.
> > >
> > >I don't think so and no modern NIC uses that. It breaks a lot of things.
> > >
> > >>
> > >> If the codes of orphan mode don't have an impact when you enable
> > >> napi_tx mode, please keep it if you can.
> > >
> > >For example, it complicates BQL implementation.
> >
> > Well, bql could be disabled when napi is not used. It is just a matter
> > of one "if" in the xmit path.
> 
> Maybe, care to post a patch?
> 
> The trick part is, a skb is queued when BQL is enabled but sent when
> BQL is disabled as discussed here:
> 
> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> 
> Thanks

Yes of course. Or we can stick a dummy value in skb->destructor after we
orphan, maybe that's easier.


> >
> >
> > >
> > >Thanks
> > >
> > >>
> > >> Thank you.
> > >>
> > >
> >
Jiri Pirko June 7, 2024, 6:40 a.m. UTC | #40
Fri, Jun 07, 2024 at 08:22:31AM CEST, jasowang@redhat.com wrote:
>On Thu, Jun 6, 2024 at 9:41 PM Jiri Pirko <jiri@resnulli.us> wrote:
>>
>> Thu, Jun 06, 2024 at 06:25:15AM CEST, jasowang@redhat.com wrote:
>> >On Thu, Jun 6, 2024 at 10:59 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>> >>
>> >> Hello Jason,
>> >>
>> >> On Thu, Jun 6, 2024 at 8:21 AM Jason Wang <jasowang@redhat.com> wrote:
>> >> >
>> >> > On Wed, Jun 5, 2024 at 7:51 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
>> >> > >
>> >> > > On Wed, 5 Jun 2024 13:30:51 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>> >> > > > Mon, May 20, 2024 at 02:48:15PM CEST, jiri@resnulli.us wrote:
>> >> > > > >Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
>> >> > > > >>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>> >> > > > >>> From: Jiri Pirko <jiri@nvidia.com>
>> >> > > > >>>
>> >> > > > >>> Add support for Byte Queue Limits (BQL).
>> >> > > > >>
>> >> > > > >>Historically both Jason and Michael have attempted to support BQL
>> >> > > > >>for virtio-net, for example:
>> >> > > > >>
>> >> > > > >>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >> > > > >>
>> >> > > > >>These discussions focus primarily on:
>> >> > > > >>
>> >> > > > >>1. BQL is based on napi tx. Therefore, the transfer of statistical information
>> >> > > > >>needs to rely on the judgment of use_napi. When the napi mode is switched to
>> >> > > > >>orphan, some statistical information will be lost, resulting in temporary
>> >> > > > >>inaccuracy in BQL.
>> >> > > > >>
>> >> > > > >>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
>> >> > > > >>reasonable. This provides good support for BQL.
>> >> > > > >
>> >> > > > >But when the device does not support dim, the orphan mode is still
>> >> > > > >needed, isn't it?
>> >> > > >
>> >> > > > Heng, is my assuption correct here? Thanks!
>> >> > > >
>> >> > >
>> >> > > Maybe, according to our cloud data, napi_tx=on works better than orphan mode in
>> >> > > most scenarios. Although orphan mode performs better in specific benckmark,
>> >> >
>> >> > For example pktgen (I meant even if the orphan mode can break pktgen,
>> >> > it can finish when there's a new packet that needs to be sent after
>> >> > pktgen is completed).
>> >> >
>> >> > > perf of napi_tx can be enhanced through tx dim. Then, there is no reason not to
>> >> > > support dim for devices that want the best performance.
>> >> >
>> >> > Ideally, if we can drop orphan mode, everything would be simplified.
>> >>
>> >> Please please don't do this. Orphan mode still has its merits. In some
>> >> cases which can hardly be reproduced in production, we still choose to
>> >> turn off the napi_tx mode because the delay of freeing a skb could
>> >> cause lower performance in the tx path,
>> >
>> >Well, it's probably just a side effect and it depends on how to define
>> >performance here.
>> >
>> >> which is, I know, surely
>> >> designed on purpose.
>> >
>> >I don't think so and no modern NIC uses that. It breaks a lot of things.
>> >
>> >>
>> >> If the codes of orphan mode don't have an impact when you enable
>> >> napi_tx mode, please keep it if you can.
>> >
>> >For example, it complicates BQL implementation.
>>
>> Well, bql could be disabled when napi is not used. It is just a matter
>> of one "if" in the xmit path.
>
>Maybe, care to post a patch?
>
>The trick part is, a skb is queued when BQL is enabled but sent when
>BQL is disabled as discussed here:
>
>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>
>Thanks

Will try to go in the orphan removal direction first.

>
>>
>>
>> >
>> >Thanks
>> >
>> >>
>> >> Thank you.
>> >>
>> >
>>
>
Michael S. Tsirkin June 7, 2024, 6:43 a.m. UTC | #41
On Fri, Jun 07, 2024 at 08:39:20AM +0200, Jiri Pirko wrote:
> Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote:
> >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote:
> >>
> >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote:
> >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >> >>
> >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
> >> >> > > If the codes of orphan mode don't have an impact when you enable
> >> >> > > napi_tx mode, please keep it if you can.
> >> >> >
> >> >> > For example, it complicates BQL implementation.
> >> >> >
> >> >> > Thanks
> >> >>
> >> >> I very much doubt sending interrupts to a VM can
> >> >> *on all benchmarks* compete with not sending interrupts.
> >> >
> >> >It should not differ too much from the physical NIC. We can have one
> >> >more round of benchmarks to see the difference.
> >> >
> >> >But if NAPI mode needs to win all of the benchmarks in order to get
> >> >rid of orphan, that would be very difficult. Considering various bugs
> >> >will be fixed by dropping skb_orphan(), it would be sufficient if most
> >> >of the benchmark doesn't show obvious differences.
> >> >
> >> >Looking at git history, there're commits that removes skb_orphan(), for example:
> >> >
> >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340
> >> >Author: Eric Dumazet <edumazet@google.com>
> >> >Date:   Fri Sep 28 07:53:26 2012 +0000
> >> >
> >> >    mlx4: dont orphan skbs in mlx4_en_xmit()
> >> >
> >> >    After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX
> >> >    completions) we no longer need to orphan skbs in mlx4_en_xmit()
> >> >    since skb wont stay a long time in TX ring before their release.
> >> >
> >> >    Orphaning skbs in ndo_start_xmit() should be avoided as much as
> >> >    possible, since it breaks TCP Small Queue or other flow control
> >> >    mechanisms (per socket limits)
> >> >
> >> >    Signed-off-by: Eric Dumazet <edumazet@google.com>
> >> >    Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com>
> >> >    Cc: Or Gerlitz <ogerlitz@mellanox.com>
> >> >    Signed-off-by: David S. Miller <davem@davemloft.net>
> >> >
> >> >>
> >> >> So yea, it's great if napi and hardware are advanced enough
> >> >> that the default can be changed, since this way virtio
> >> >> is closer to a regular nic and more or standard
> >> >> infrastructure can be used.
> >> >>
> >> >> But dropping it will go against *no breaking userspace* rule.
> >> >> Complicated? Tough.
> >> >
> >> >I don't know what kind of userspace is broken by this. Or why it is
> >> >not broken since the day we enable NAPI mode by default.
> >>
> >> There is a module option that explicitly allows user to set
> >> napi_tx=false
> >> or
> >> napi_weight=0
> >>
> >> So if you remove this option or ignore it, both breaks the user
> >> expectation.
> >
> >We can keep them, but I wonder what's the expectation of the user
> >here? The only thing so far I can imagine is the performance
> >difference.
> 
> True.
> 
> >
> >> I personally would vote for this breakage. To carry ancient
> >> things like this one forever does not make sense to me.
> >
> >Exactly.
> >
> >> While at it,
> >> let's remove all virtio net module params. Thoughts?
> >
> >I tend to
> >
> >1) drop the orphan mode, but we can have some benchmarks first
> 
> Any idea which? That would be really tricky to find the ones where
> orphan mode makes difference I assume.

Exactly. We are kind of stuck with it I think.
I would just do this:

void orphan_destructor(struct sk_buff *skb)
{
}

	skb_orphan(skb);
	skb->destructor = orphan_destructor;
	/* skip BQL */
	return;


and then later
	/* skip BQL accounting if we orphaned on xmit path */
	if (skb->destructor == orphan_destructor)
		return;



Hmm?
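
Fleshing that out a little, a sketch of the idea could look as follows,
assuming a virtio_net-style split between xmit and completion. The
sketch_* names are invented; skb_orphan() and the BQL helpers are the
real APIs:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Sentinel destructor: marks skbs orphaned on the xmit path so BQL
 * accounting can be skipped for them at completion time.
 */
static void sketch_orphan_destructor(struct sk_buff *skb)
{
}

static void sketch_xmit_orphan(struct sk_buff *skb)
{
	skb_orphan(skb);
	skb->destructor = sketch_orphan_destructor;
	/* no netdev_tx_sent_queue() for this skb: BQL is skipped */
}

static void sketch_xmit_napi(struct sk_buff *skb, struct netdev_queue *txq)
{
	netdev_tx_sent_queue(txq, skb->len);
}

/* Called from the TX completion path for each freed skb. */
static void sketch_tx_complete(struct sk_buff *skb, struct netdev_queue *txq)
{
	/* skip BQL accounting if we orphaned on the xmit path */
	if (skb->destructor == sketch_orphan_destructor)
		return;
	netdev_tx_completed_queue(txq, 1, skb->len);
}

The completion-path check is cheap and keeps the BQL counters
consistent no matter when the mode was toggled.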


> 
> >2) keep the module parameters
> 
> and ignore them, correct? Perhaps a warning would be good.
> 
> 
> >
> >Thanks
> >
> >>
> >>
> >>
> >> >
> >> >Thanks
> >> >
> >> >>
> >> >> --
> >> >> MST
> >> >>
> >> >
> >>
> >
Jason Wang June 7, 2024, 6:47 a.m. UTC | #42
On Fri, Jun 7, 2024 at 2:39 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote:
> >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote:
> >>
> >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote:
> >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >> >>
> >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
> >> >> > > If the codes of orphan mode don't have an impact when you enable
> >> >> > > napi_tx mode, please keep it if you can.
> >> >> >
> >> >> > For example, it complicates BQL implementation.
> >> >> >
> >> >> > Thanks
> >> >>
> >> >> I very much doubt sending interrupts to a VM can
> >> >> *on all benchmarks* compete with not sending interrupts.
> >> >
> >> >It should not differ too much from the physical NIC. We can have one
> >> >more round of benchmarks to see the difference.
> >> >
> >> >But if NAPI mode needs to win all of the benchmarks in order to get
> >> >rid of orphan, that would be very difficult. Considering various bugs
> >> >will be fixed by dropping skb_orphan(), it would be sufficient if most
> >> >of the benchmark doesn't show obvious differences.
> >> >
> >> >Looking at git history, there're commits that removes skb_orphan(), for example:
> >> >
> >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340
> >> >Author: Eric Dumazet <edumazet@google.com>
> >> >Date:   Fri Sep 28 07:53:26 2012 +0000
> >> >
> >> >    mlx4: dont orphan skbs in mlx4_en_xmit()
> >> >
> >> >    After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX
> >> >    completions) we no longer need to orphan skbs in mlx4_en_xmit()
> >> >    since skb wont stay a long time in TX ring before their release.
> >> >
> >> >    Orphaning skbs in ndo_start_xmit() should be avoided as much as
> >> >    possible, since it breaks TCP Small Queue or other flow control
> >> >    mechanisms (per socket limits)
> >> >
> >> >    Signed-off-by: Eric Dumazet <edumazet@google.com>
> >> >    Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com>
> >> >    Cc: Or Gerlitz <ogerlitz@mellanox.com>
> >> >    Signed-off-by: David S. Miller <davem@davemloft.net>
> >> >
> >> >>
> >> >> So yea, it's great if napi and hardware are advanced enough
> >> >> that the default can be changed, since this way virtio
> >> >> is closer to a regular nic and more or standard
> >> >> infrastructure can be used.
> >> >>
> >> >> But dropping it will go against *no breaking userspace* rule.
> >> >> Complicated? Tough.
> >> >
> >> >I don't know what kind of userspace is broken by this. Or why it is
> >> >not broken since the day we enable NAPI mode by default.
> >>
> >> There is a module option that explicitly allows user to set
> >> napi_tx=false
> >> or
> >> napi_weight=0
> >>
> >> So if you remove this option or ignore it, both breaks the user
> >> expectation.
> >
> >We can keep them, but I wonder what's the expectation of the user
> >here? The only thing so far I can imagine is the performance
> >difference.
>
> True.
>
> >
> >> I personally would vote for this breakage. To carry ancient
> >> things like this one forever does not make sense to me.
> >
> >Exactly.
> >
> >> While at it,
> >> let's remove all virtio net module params. Thoughts?
> >
> >I tend to
> >
> >1) drop the orphan mode, but we can have some benchmarks first
>
> Any idea which? That would be really tricky to find the ones where
> orphan mode makes difference I assume.

True. Personally, I would like to just drop orphan mode. But I'm not
sure others are happy with this.

Thanks

>
>
> >2) keep the module parameters
>
> and ignore them, correct? Perhaps a warning would be good.
>
>
> >
> >Thanks
> >
> >>
> >>
> >>
> >> >
> >> >Thanks
> >> >
> >> >>
> >> >> --
> >> >> MST
> >> >>
> >> >
> >>
> >
>
Jiri Pirko June 7, 2024, 9:57 a.m. UTC | #43
Fri, Jun 07, 2024 at 08:47:43AM CEST, jasowang@redhat.com wrote:
>On Fri, Jun 7, 2024 at 2:39 PM Jiri Pirko <jiri@resnulli.us> wrote:
>>
>> Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote:
>> >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote:
>> >>
>> >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote:
>> >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>> >> >>
>> >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
>> >> >> > > If the codes of orphan mode don't have an impact when you enable
>> >> >> > > napi_tx mode, please keep it if you can.
>> >> >> >
>> >> >> > For example, it complicates BQL implementation.
>> >> >> >
>> >> >> > Thanks
>> >> >>
>> >> >> I very much doubt sending interrupts to a VM can
>> >> >> *on all benchmarks* compete with not sending interrupts.
>> >> >
>> >> >It should not differ too much from the physical NIC. We can have one
>> >> >more round of benchmarks to see the difference.
>> >> >
>> >> >But if NAPI mode needs to win all of the benchmarks in order to get
>> >> >rid of orphan, that would be very difficult. Considering various bugs
>> >> >will be fixed by dropping skb_orphan(), it would be sufficient if most
>> >> >of the benchmark doesn't show obvious differences.
>> >> >
>> >> >Looking at git history, there're commits that removes skb_orphan(), for example:
>> >> >
>> >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340
>> >> >Author: Eric Dumazet <edumazet@google.com>
>> >> >Date:   Fri Sep 28 07:53:26 2012 +0000
>> >> >
>> >> >    mlx4: dont orphan skbs in mlx4_en_xmit()
>> >> >
>> >> >    After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX
>> >> >    completions) we no longer need to orphan skbs in mlx4_en_xmit()
>> >> >    since skb wont stay a long time in TX ring before their release.
>> >> >
>> >> >    Orphaning skbs in ndo_start_xmit() should be avoided as much as
>> >> >    possible, since it breaks TCP Small Queue or other flow control
>> >> >    mechanisms (per socket limits)
>> >> >
>> >> >    Signed-off-by: Eric Dumazet <edumazet@google.com>
>> >> >    Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com>
>> >> >    Cc: Or Gerlitz <ogerlitz@mellanox.com>
>> >> >    Signed-off-by: David S. Miller <davem@davemloft.net>
>> >> >
>> >> >>
>> >> >> So yea, it's great if napi and hardware are advanced enough
>> >> >> that the default can be changed, since this way virtio
>> >> >> is closer to a regular nic and more or standard
>> >> >> infrastructure can be used.
>> >> >>
>> >> >> But dropping it will go against *no breaking userspace* rule.
>> >> >> Complicated? Tough.
>> >> >
>> >> >I don't know what kind of userspace is broken by this. Or why it is
>> >> >not broken since the day we enable NAPI mode by default.
>> >>
>> >> There is a module option that explicitly allows user to set
>> >> napi_tx=false
>> >> or
>> >> napi_weight=0
>> >>
>> >> So if you remove this option or ignore it, both breaks the user
>> >> expectation.
>> >
>> >We can keep them, but I wonder what's the expectation of the user
>> >here? The only thing so far I can imagine is the performance
>> >difference.
>>
>> True.
>>
>> >
>> >> I personally would vote for this breakage. To carry ancient
>> >> things like this one forever does not make sense to me.
>> >
>> >Exactly.
>> >
>> >> While at it,
>> >> let's remove all virtio net module params. Thoughts?
>> >
>> >I tend to
>> >
>> >1) drop the orphan mode, but we can have some benchmarks first
>>
>> Any idea which? That would be really tricky to find the ones where
>> orphan mode makes difference I assume.
>
>True. Personally, I would like to just drop orphan mode. But I'm not
>sure others are happy with this.

How about doing it the other way around? I will take a stab at sending
a patch removing it. If anyone is against it and has solid data to
prove orphan mode is needed, let them provide it.


>
>Thanks
>
>>
>>
>> >2) keep the module parameters
>>
>> and ignore them, correct? Perhaps a warning would be good.
>>
>>
>> >
>> >Thanks
>> >
>> >>
>> >>
>> >>
>> >> >
>> >> >Thanks
>> >> >
>> >> >>
>> >> >> --
>> >> >> MST
>> >> >>
>> >> >
>> >>
>> >
>>
>
Michael S. Tsirkin June 7, 2024, 10:23 a.m. UTC | #44
On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
> >True. Personally, I would like to just drop orphan mode. But I'm not
> >sure others are happy with this.
> 
> How about to do it other way around. I will take a stab at sending patch
> removing it. If anyone is against and has solid data to prove orphan
> mode is needed, let them provide those.

Break it with no warning and see if anyone complains?
No, this is not how we handle userspace compatibility, normally.
Jason Xing June 7, 2024, 11:22 a.m. UTC | #45
On Fri, Jun 7, 2024 at 5:57 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> Fri, Jun 07, 2024 at 08:47:43AM CEST, jasowang@redhat.com wrote:
> >On Fri, Jun 7, 2024 at 2:39 PM Jiri Pirko <jiri@resnulli.us> wrote:
> >>
> >> Fri, Jun 07, 2024 at 08:25:19AM CEST, jasowang@redhat.com wrote:
> >> >On Thu, Jun 6, 2024 at 9:45 PM Jiri Pirko <jiri@resnulli.us> wrote:
> >> >>
> >> >> Thu, Jun 06, 2024 at 09:56:50AM CEST, jasowang@redhat.com wrote:
> >> >> >On Thu, Jun 6, 2024 at 2:05 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >> >> >>
> >> >> >> On Thu, Jun 06, 2024 at 12:25:15PM +0800, Jason Wang wrote:
> >> >> >> > > If the codes of orphan mode don't have an impact when you enable
> >> >> >> > > napi_tx mode, please keep it if you can.
> >> >> >> >
> >> >> >> > For example, it complicates BQL implementation.
> >> >> >> >
> >> >> >> > Thanks
> >> >> >>
> >> >> >> I very much doubt sending interrupts to a VM can
> >> >> >> *on all benchmarks* compete with not sending interrupts.
> >> >> >
> >> >> >It should not differ too much from the physical NIC. We can have one
> >> >> >more round of benchmarks to see the difference.
> >> >> >
> >> >> >But if NAPI mode needs to win all of the benchmarks in order to get
> >> >> >rid of orphan, that would be very difficult. Considering various bugs
> >> >> >will be fixed by dropping skb_orphan(), it would be sufficient if most
> >> >> >of the benchmark doesn't show obvious differences.
> >> >> >
> >> >> >Looking at git history, there're commits that removes skb_orphan(), for example:
> >> >> >
> >> >> >commit 8112ec3b8722680251aecdcc23dfd81aa7af6340
> >> >> >Author: Eric Dumazet <edumazet@google.com>
> >> >> >Date:   Fri Sep 28 07:53:26 2012 +0000
> >> >> >
> >> >> >    mlx4: dont orphan skbs in mlx4_en_xmit()
> >> >> >
> >> >> >    After commit e22979d96a55d (mlx4_en: Moving to Interrupts for TX
> >> >> >    completions) we no longer need to orphan skbs in mlx4_en_xmit()
> >> >> >    since skb wont stay a long time in TX ring before their release.
> >> >> >
> >> >> >    Orphaning skbs in ndo_start_xmit() should be avoided as much as
> >> >> >    possible, since it breaks TCP Small Queue or other flow control
> >> >> >    mechanisms (per socket limits)
> >> >> >
> >> >> >    Signed-off-by: Eric Dumazet <edumazet@google.com>
> >> >> >    Acked-by: Yevgeny Petrilin <yevgenyp@mellanox.com>
> >> >> >    Cc: Or Gerlitz <ogerlitz@mellanox.com>
> >> >> >    Signed-off-by: David S. Miller <davem@davemloft.net>
> >> >> >
> >> >> >>
> >> >> >> So yea, it's great if napi and hardware are advanced enough
> >> >> >> that the default can be changed, since this way virtio
> >> >> >> is closer to a regular nic and more or standard
> >> >> >> infrastructure can be used.
> >> >> >>
> >> >> >> But dropping it will go against *no breaking userspace* rule.
> >> >> >> Complicated? Tough.
> >> >> >
> >> >> >I don't know what kind of userspace is broken by this. Or why it is
> >> >> >not broken since the day we enable NAPI mode by default.
> >> >>
> >> >> There is a module option that explicitly allows user to set
> >> >> napi_tx=false
> >> >> or
> >> >> napi_weight=0
> >> >>
> >> >> So if you remove this option or ignore it, both breaks the user
> >> >> expectation.
> >> >
> >> >We can keep them, but I wonder what's the expectation of the user
> >> >here? The only thing so far I can imagine is the performance
> >> >difference.
> >>
> >> True.
> >>
> >> >
> >> >> I personally would vote for this breakage. To carry ancient
> >> >> things like this one forever does not make sense to me.
> >> >
> >> >Exactly.
> >> >
> >> >> While at it,
> >> >> let's remove all virtio net module params. Thoughts?
> >> >
> >> >I tend to
> >> >
> >> >1) drop the orphan mode, but we can have some benchmarks first
> >>
> >> Any idea which? That would be really tricky to find the ones where
> >> orphan mode makes difference I assume.
> >
> >True. Personally, I would like to just drop orphan mode. But I'm not
> >sure others are happy with this.
>
> How about to do it other way around. I will take a stab at sending patch
> removing it. If anyone is against and has solid data to prove orphan
> mode is needed, let them provide those.

Honestly, in production we have to use skb orphan mode. I cannot
provide strong enough evidence for why the default mode causes
performance degradation in some cases; I don't know the root cause. The
only thing I can tell is that if I enable skb orphan mode, then 1) more
skbs get to pass the TCP layer, 2) some key members like snd_cwnd in
TCP behave normally.

I know that without orphan mode the skb is controlled/limited by the
combination of TSO and napi_tx mode, thanks to sk_wmem_alloc. So I
_guess_ the root cause is: a delay of the interrupts generated by the
host machine delays the freeing of skbs, which slows down the tx speed.
If the interval between two interrupts were very short, like on a real
NIC, I think the issue would disappear.
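
For readers following along, a simplified illustration of the
sk_wmem_alloc based limiting being described; the function below is
invented, the real limit is computed dynamically by TCP, and this is
not the actual TCP code:

#include <net/sock.h>

/* Simplified illustration: TCP throttles a socket while too many bytes
 * are still held below the TCP layer, tracked through sk_wmem_alloc.
 */
static bool sketch_sender_throttled(const struct sock *sk, unsigned int limit)
{
	return refcount_read(&sk->sk_wmem_alloc) > limit;
}

/*
 * - With napi_tx, an skb keeps its socket reference until the TX
 *   completion frees it, so a delayed completion interrupt keeps
 *   sk_wmem_alloc high and the sender throttled.
 * - With orphan mode, skb_orphan() in ndo_start_xmit() drops that
 *   reference right away, so the sender never sees the device-side
 *   completion delay.
 */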

That's all I can tell. Just for the record. I hope this information is
also useful to other readers.

Thanks,
Jason

>
>
> >
> >Thanks
> >
> >>
> >>
> >> >2) keep the module parameters
> >>
> >> and ignore them, correct? Perhaps a warning would be good.
> >>
> >>
> >> >
> >> >Thanks
> >> >
> >> >>
> >> >>
> >> >>
> >> >> >
> >> >> >Thanks
> >> >> >
> >> >> >>
> >> >> >> --
> >> >> >> MST
> >> >> >>
> >> >> >
> >> >>
> >> >
> >>
> >
Jiri Pirko June 7, 2024, 11:30 a.m. UTC | #46
Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote:
>On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
>> >True. Personally, I would like to just drop orphan mode. But I'm not
>> >sure others are happy with this.
>> 
>> How about to do it other way around. I will take a stab at sending patch
>> removing it. If anyone is against and has solid data to prove orphan
>> mode is needed, let them provide those.
>
>Break it with no warning and see if anyone complains?

This is not what I suggested at all.

>No, this is not how we handle userspace compatibility, normally.

Sure.

Again:

I would send an orphan removal patch containing:
1) no module options removal. Warn if someone sets it up
2) module option to disable napi is ignored
3) orphan mode is removed from code

There is no breakage. Only, hypothetically, a performance downgrade in
some hypothetical use case nobody knows of. My point was, if someone
presents solid data to prove orphan is needed during the patch review,
let's toss out the patch.

Makes sense?
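
As a rough sketch of points 1) and 2), keeping the existing virtio_net
parameters but only warning when they are set to the value that used to
select orphan mode (hypothetical, not a submitted patch):

#include <linux/module.h>
#include <linux/netdevice.h>

/* The existing parameters are kept so loading the module with them set
 * keeps working, but the value that used to select orphan mode only
 * triggers a warning and is otherwise ignored.
 */
static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool napi_tx = true;
module_param(napi_tx, bool, 0644);

static void sketch_warn_ignored_params(void)
{
	if (!napi_tx || !napi_weight)
		pr_warn("virtio_net: napi_tx/napi_weight are ignored, TX NAPI is always used\n");
}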


>
>-- 
>MST
>
Michael S. Tsirkin June 10, 2024, 2:18 p.m. UTC | #47
On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote:
> Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote:
> >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
> >> >True. Personally, I would like to just drop orphan mode. But I'm not
> >> >sure others are happy with this.
> >> 
> >> How about to do it other way around. I will take a stab at sending patch
> >> removing it. If anyone is against and has solid data to prove orphan
> >> mode is needed, let them provide those.
> >
> >Break it with no warning and see if anyone complains?
> 
> This is now what I suggested at all.
> 
> >No, this is not how we handle userspace compatibility, normally.
> 
> Sure.
> 
> Again:
> 
> I would send orphan removal patch containing:
> 1) no module options removal. Warn if someone sets it up
> 2) module option to disable napi is ignored
> 3) orphan mode is removed from code
> 
> There is no breakage. Only, hypotetically performance downgrade in some
> hypotetical usecase nobody knows of.

Performance is why people use virtio. It's as much a breakage as any
other bug. The main difference is, with other types of breakage, they
are typically binary and we cannot tolerate them at all.  A tiny,
negligible performance regression might be tolerable if it brings
other benefits. I very much doubt avoiding interrupts is
negligible though. And making code simpler isn't a big benefit,
users do not care.

> My point was, if someone presents
> solid data to prove orphan is needed during the patch review, let's toss
> out the patch.
> 
> Makes sense?

It's not hypothetical - if anything, it's hypothetical that performance
does not regress.  And we just got a report from users that see a
regression without it.  So, not really.

> 
> >
> >-- 
> >MST
> >
Jason Wang June 17, 2024, 1:44 a.m. UTC | #48
On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote:
> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote:
> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
> > >> >True. Personally, I would like to just drop orphan mode. But I'm not
> > >> >sure others are happy with this.
> > >>
> > >> How about to do it other way around. I will take a stab at sending patch
> > >> removing it. If anyone is against and has solid data to prove orphan
> > >> mode is needed, let them provide those.
> > >
> > >Break it with no warning and see if anyone complains?
> >
> > This is now what I suggested at all.
> >
> > >No, this is not how we handle userspace compatibility, normally.
> >
> > Sure.
> >
> > Again:
> >
> > I would send orphan removal patch containing:
> > 1) no module options removal. Warn if someone sets it up
> > 2) module option to disable napi is ignored
> > 3) orphan mode is removed from code
> >
> > There is no breakage. Only, hypotetically performance downgrade in some
> > hypotetical usecase nobody knows of.
>
> Performance is why people use virtio. It's as much a breakage as any
> other bug. The main difference is, with other types of breakage, they
> are typically binary and we can not tolerate them at all.  A tiny,
> negligeable performance regression might be tolarable if it brings
> other benefits. I very much doubt avoiding interrupts is
> negligeable though. And making code simpler isn't a big benefit,
> users do not care.

It's not just making code simpler. As discussed in the past, it also
fixes real bugs.

>
> > My point was, if someone presents
> > solid data to prove orphan is needed during the patch review, let's toss
> > out the patch.
> >
> > Makes sense?
>
> It's not hypothetical - if anything, it's hypothetical that performance
> does not regress.  And we just got a report from users that see a
> regression without.  So, not really.

Probably, but do we need to define a bar here? Looking at git history,
we didn't ask for a full benchmark for a lot of commits that may touch
performance.

Thanks

>
> >
> > >
> > >--
> > >MST
> > >
>
Jiri Pirko June 17, 2024, 9:30 a.m. UTC | #49
Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote:
>On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>>
>> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote:
>> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote:
>> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
>> > >> >True. Personally, I would like to just drop orphan mode. But I'm not
>> > >> >sure others are happy with this.
>> > >>
>> > >> How about to do it other way around. I will take a stab at sending patch
>> > >> removing it. If anyone is against and has solid data to prove orphan
>> > >> mode is needed, let them provide those.
>> > >
>> > >Break it with no warning and see if anyone complains?
>> >
>> > This is now what I suggested at all.
>> >
>> > >No, this is not how we handle userspace compatibility, normally.
>> >
>> > Sure.
>> >
>> > Again:
>> >
>> > I would send orphan removal patch containing:
>> > 1) no module options removal. Warn if someone sets it up
>> > 2) module option to disable napi is ignored
>> > 3) orphan mode is removed from code
>> >
>> > There is no breakage. Only, hypotetically performance downgrade in some
>> > hypotetical usecase nobody knows of.
>>
>> Performance is why people use virtio. It's as much a breakage as any
>> other bug. The main difference is, with other types of breakage, they
>> are typically binary and we can not tolerate them at all.  A tiny,
>> negligeable performance regression might be tolarable if it brings
>> other benefits. I very much doubt avoiding interrupts is
>> negligeable though. And making code simpler isn't a big benefit,
>> users do not care.
>
>It's not just making code simpler. As discussed in the past, it also
>fixes real bugs.
>
>>
>> > My point was, if someone presents
>> > solid data to prove orphan is needed during the patch review, let's toss
>> > out the patch.
>> >
>> > Makes sense?
>>
>> It's not hypothetical - if anything, it's hypothetical that performance
>> does not regress.  And we just got a report from users that see a
>> regression without.  So, not really.
>
>Probably, but do we need to define a bar here? Looking at git history,
>we didn't ask a full benchmark for a lot of commits that may touch

Moreover, there is no "benchmark" to run anyway, is it?


>performance.
>
>Thanks
>
>>
>> >
>> > >
>> > >--
>> > >MST
>> > >
>>
>
Michael S. Tsirkin June 17, 2024, 4:16 p.m. UTC | #50
On Mon, Jun 17, 2024 at 11:30:36AM +0200, Jiri Pirko wrote:
> Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote:
> >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>
> >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote:
> >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote:
> >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
> >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not
> >> > >> >sure others are happy with this.
> >> > >>
> >> > >> How about to do it other way around. I will take a stab at sending patch
> >> > >> removing it. If anyone is against and has solid data to prove orphan
> >> > >> mode is needed, let them provide those.
> >> > >
> >> > >Break it with no warning and see if anyone complains?
> >> >
> >> > This is now what I suggested at all.
> >> >
> >> > >No, this is not how we handle userspace compatibility, normally.
> >> >
> >> > Sure.
> >> >
> >> > Again:
> >> >
> >> > I would send orphan removal patch containing:
> >> > 1) no module options removal. Warn if someone sets it up
> >> > 2) module option to disable napi is ignored
> >> > 3) orphan mode is removed from code
> >> >
> >> > There is no breakage. Only, hypotetically performance downgrade in some
> >> > hypotetical usecase nobody knows of.
> >>
> >> Performance is why people use virtio. It's as much a breakage as any
> >> other bug. The main difference is, with other types of breakage, they
> >> are typically binary and we can not tolerate them at all.  A tiny,
> >> negligeable performance regression might be tolarable if it brings
> >> other benefits. I very much doubt avoiding interrupts is
> >> negligeable though. And making code simpler isn't a big benefit,
> >> users do not care.
> >
> >It's not just making code simpler. As discussed in the past, it also
> >fixes real bugs.
> >
> >>
> >> > My point was, if someone presents
> >> > solid data to prove orphan is needed during the patch review, let's toss
> >> > out the patch.
> >> >
> >> > Makes sense?
> >>
> >> It's not hypothetical - if anything, it's hypothetical that performance
> >> does not regress.  And we just got a report from users that see a
> >> regression without.  So, not really.
> >
> >Probably, but do we need to define a bar here? Looking at git history,
> >we didn't ask a full benchmark for a lot of commits that may touch

It's patently obvious that not getting interrupts is better than
getting interrupts. The onus of proof would be on people who claim
otherwise.


> Moreover, there is no "benchmark" to run anyway, is it?
> 

Tough.  Talk to users that report regressions.



> >performance.
> >
> >Thanks
> >
> >>
> >> >
> >> > >
> >> > >--
> >> > >MST
> >> > >
> >>
> >
Michael S. Tsirkin June 17, 2024, 4:18 p.m. UTC | #51
On Mon, Jun 17, 2024 at 09:44:55AM +0800, Jason Wang wrote:
> Probably, but do we need to define a bar here? Looking at git history,
> we didn't ask a full benchmark for a lot of commits that may touch
> performance.

There's no "may" here, and we even got a report from a real user.
Jason Wang June 18, 2024, 12:52 a.m. UTC | #52
On Mon, Jun 17, 2024 at 5:30 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote:
> >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>
> >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote:
> >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote:
> >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
> >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not
> >> > >> >sure others are happy with this.
> >> > >>
> >> > >> How about to do it other way around. I will take a stab at sending patch
> >> > >> removing it. If anyone is against and has solid data to prove orphan
> >> > >> mode is needed, let them provide those.
> >> > >
> >> > >Break it with no warning and see if anyone complains?
> >> >
> >> > This is now what I suggested at all.
> >> >
> >> > >No, this is not how we handle userspace compatibility, normally.
> >> >
> >> > Sure.
> >> >
> >> > Again:
> >> >
> >> > I would send orphan removal patch containing:
> >> > 1) no module options removal. Warn if someone sets it up
> >> > 2) module option to disable napi is ignored
> >> > 3) orphan mode is removed from code
> >> >
> >> > There is no breakage. Only, hypotetically performance downgrade in some
> >> > hypotetical usecase nobody knows of.
> >>
> >> Performance is why people use virtio. It's as much a breakage as any
> >> other bug. The main difference is, with other types of breakage, they
> >> are typically binary and we can not tolerate them at all.  A tiny,
> >> negligeable performance regression might be tolarable if it brings
> >> other benefits. I very much doubt avoiding interrupts is
> >> negligeable though. And making code simpler isn't a big benefit,
> >> users do not care.
> >
> >It's not just making code simpler. As discussed in the past, it also
> >fixes real bugs.
> >
> >>
> >> > My point was, if someone presents
> >> > solid data to prove orphan is needed during the patch review, let's toss
> >> > out the patch.
> >> >
> >> > Makes sense?
> >>
> >> It's not hypothetical - if anything, it's hypothetical that performance
> >> does not regress.  And we just got a report from users that see a
> >> regression without.  So, not really.
> >
> >Probably, but do we need to define a bar here? Looking at git history,
> >we didn't ask a full benchmark for a lot of commits that may touch
>
> Moreover, there is no "benchmark" to run anyway, is it?

Yes, so my point is to have some agreement on

1) what kind of test needs to be run for a patch like this.
2) what numbers are ok or not

Thanks

>
>
> >performance.
> >
> >Thanks
> >
> >>
> >> >
> >> > >
> >> > >--
> >> > >MST
> >> > >
> >>
> >
>
Jason Wang June 18, 2024, 1:19 a.m. UTC | #53
On Tue, Jun 18, 2024 at 12:16 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Jun 17, 2024 at 11:30:36AM +0200, Jiri Pirko wrote:
> > Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote:
> > >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >>
> > >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote:
> > >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote:
> > >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
> > >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not
> > >> > >> >sure others are happy with this.
> > >> > >>
> > >> > >> How about to do it other way around. I will take a stab at sending patch
> > >> > >> removing it. If anyone is against and has solid data to prove orphan
> > >> > >> mode is needed, let them provide those.
> > >> > >
> > >> > >Break it with no warning and see if anyone complains?
> > >> >
> > >> > This is now what I suggested at all.
> > >> >
> > >> > >No, this is not how we handle userspace compatibility, normally.
> > >> >
> > >> > Sure.
> > >> >
> > >> > Again:
> > >> >
> > >> > I would send orphan removal patch containing:
> > >> > 1) no module options removal. Warn if someone sets it up
> > >> > 2) module option to disable napi is ignored
> > >> > 3) orphan mode is removed from code
> > >> >
> > >> > There is no breakage. Only, hypotetically performance downgrade in some
> > >> > hypotetical usecase nobody knows of.
> > >>
> > >> Performance is why people use virtio. It's as much a breakage as any
> > >> other bug. The main difference is, with other types of breakage, they
> > >> are typically binary and we can not tolerate them at all.  A tiny,
> > >> negligeable performance regression might be tolarable if it brings
> > >> other benefits. I very much doubt avoiding interrupts is
> > >> negligeable though. And making code simpler isn't a big benefit,
> > >> users do not care.
> > >
> > >It's not just making code simpler. As discussed in the past, it also
> > >fixes real bugs.
> > >
> > >>
> > >> > My point was, if someone presents
> > >> > solid data to prove orphan is needed during the patch review, let's toss
> > >> > out the patch.
> > >> >
> > >> > Makes sense?
> > >>
> > >> It's not hypothetical - if anything, it's hypothetical that performance
> > >> does not regress.  And we just got a report from users that see a
> > >> regression without.  So, not really.
> > >
> > >Probably, but do we need to define a bar here? Looking at git history,
> > >we didn't ask a full benchmark for a lot of commits that may touch
>
> It's patently obvious that not getting interrupts is better than
> getting interrupts.

Exactly. But dropping orphan mode seems less intrusive than the commit
that made NAPI the default.

Thanks

> The onus of proof would be on people who claim
> otherwise.
>
>
> > Moreover, there is no "benchmark" to run anyway, is it?
> >
>
> Tought.  Talk to users that report regressions.
>
>
>
> > >performance.
> > >
> > >Thanks
> > >
> > >>
> > >> >
> > >> > >
> > >> > >--
> > >> > >MST
> > >> > >
> > >>
> > >
>
Michael S. Tsirkin June 18, 2024, 6:23 p.m. UTC | #54
On Tue, Jun 18, 2024 at 08:52:38AM +0800, Jason Wang wrote:
> On Mon, Jun 17, 2024 at 5:30 PM Jiri Pirko <jiri@resnulli.us> wrote:
> >
> > Mon, Jun 17, 2024 at 03:44:55AM CEST, jasowang@redhat.com wrote:
> > >On Mon, Jun 10, 2024 at 10:19 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >>
> > >> On Fri, Jun 07, 2024 at 01:30:34PM +0200, Jiri Pirko wrote:
> > >> > Fri, Jun 07, 2024 at 12:23:37PM CEST, mst@redhat.com wrote:
> > >> > >On Fri, Jun 07, 2024 at 11:57:37AM +0200, Jiri Pirko wrote:
> > >> > >> >True. Personally, I would like to just drop orphan mode. But I'm not
> > >> > >> >sure others are happy with this.
> > >> > >>
> > >> > >> How about to do it other way around. I will take a stab at sending patch
> > >> > >> removing it. If anyone is against and has solid data to prove orphan
> > >> > >> mode is needed, let them provide those.
> > >> > >
> > >> > >Break it with no warning and see if anyone complains?
> > >> >
> > >> > This is now what I suggested at all.
> > >> >
> > >> > >No, this is not how we handle userspace compatibility, normally.
> > >> >
> > >> > Sure.
> > >> >
> > >> > Again:
> > >> >
> > >> > I would send orphan removal patch containing:
> > >> > 1) no module options removal. Warn if someone sets it up
> > >> > 2) module option to disable napi is ignored
> > >> > 3) orphan mode is removed from code
> > >> >
> > >> > There is no breakage. Only, hypotetically performance downgrade in some
> > >> > hypotetical usecase nobody knows of.
> > >>
> > >> Performance is why people use virtio. It's as much a breakage as any
> > >> other bug. The main difference is, with other types of breakage, they
> > >> are typically binary and we can not tolerate them at all.  A tiny,
> > >> negligeable performance regression might be tolarable if it brings
> > >> other benefits. I very much doubt avoiding interrupts is
> > >> negligeable though. And making code simpler isn't a big benefit,
> > >> users do not care.
> > >
> > >It's not just making code simpler. As discussed in the past, it also
> > >fixes real bugs.
> > >
> > >>
> > >> > My point was, if someone presents
> > >> > solid data to prove orphan is needed during the patch review, let's toss
> > >> > out the patch.
> > >> >
> > >> > Makes sense?
> > >>
> > >> It's not hypothetical - if anything, it's hypothetical that performance
> > >> does not regress.  And we just got a report from users that see a
> > >> regression without.  So, not really.
> > >
> > >Probably, but do we need to define a bar here? Looking at git history,
> > >we didn't ask a full benchmark for a lot of commits that may touch
> >
> > Moreover, there is no "benchmark" to run anyway, is it?
> 
> Yes, so my point is to have some agreement on
> 
> 1) what kind of test needs to be run for a patch like this.
> 2) what numbers are ok or not
> 
> Thanks

That's a million-dollar question, and that difficulty is why we don't
drastically change behaviour for users without a fallback, even if we
think we did a bunch of testing.



> >
> >
> > >performance.
> > >
> > >Thanks
> > >
> > >>
> > >> >
> > >> > >
> > >> > >--
> > >> > >MST
> > >> > >
> > >>
> > >
> >
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 218a446c4c27..c53d6dc6d332 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -84,7 +84,9 @@  struct virtnet_stat_desc {
 
 struct virtnet_sq_free_stats {
 	u64 packets;
+	u64 xdp_packets;
 	u64 bytes;
+	u64 xdp_bytes;
 };
 
 struct virtnet_sq_stats {
@@ -512,19 +514,19 @@  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
 	void *ptr;
 
 	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		++stats->packets;
-
 		if (!is_xdp_frame(ptr)) {
 			struct sk_buff *skb = ptr;
 
 			pr_debug("Sent skb %p\n", skb);
 
+			stats->packets++;
 			stats->bytes += skb->len;
 			napi_consume_skb(skb, in_napi);
 		} else {
 			struct xdp_frame *frame = ptr_to_xdp(ptr);
 
-			stats->bytes += xdp_get_frame_len(frame);
+			stats->xdp_packets++;
+			stats->xdp_bytes += xdp_get_frame_len(frame);
 			xdp_return_frame(frame);
 		}
 	}
@@ -965,7 +967,8 @@  static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
 	virtnet_rq_free_buf(vi, rq, buf);
 }
 
-static void free_old_xmit(struct send_queue *sq, bool in_napi)
+static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
+			  bool in_napi)
 {
 	struct virtnet_sq_free_stats stats = {0};
 
@@ -974,9 +977,11 @@  static void free_old_xmit(struct send_queue *sq, bool in_napi)
 	/* Avoid overhead when no packets have been processed
 	 * happens when called speculatively from start_xmit.
 	 */
-	if (!stats.packets)
+	if (!stats.packets && !stats.xdp_packets)
 		return;
 
+	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
+
 	u64_stats_update_begin(&sq->stats.syncp);
 	u64_stats_add(&sq->stats.bytes, stats.bytes);
 	u64_stats_add(&sq->stats.packets, stats.packets);
@@ -1013,13 +1018,15 @@  static void check_sq_full_and_disable(struct virtnet_info *vi,
 	 * early means 16 slots are typically wasted.
 	 */
 	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
-		netif_stop_subqueue(dev, qnum);
+		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
+
+		netif_tx_stop_queue(txq);
 		if (use_napi) {
 			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
 				virtqueue_napi_schedule(&sq->napi, sq->vq);
 		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
-			free_old_xmit(sq, false);
+			free_old_xmit(sq, txq, false);
 			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
 				netif_start_subqueue(dev, qnum);
 				virtqueue_disable_cb(sq->vq);
@@ -2319,7 +2326,7 @@  static void virtnet_poll_cleantx(struct receive_queue *rq)
 
 		do {
 			virtqueue_disable_cb(sq->vq);
-			free_old_xmit(sq, true);
+			free_old_xmit(sq, txq, true);
 		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
 
 		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
@@ -2471,7 +2478,7 @@  static int virtnet_poll_tx(struct napi_struct *napi, int budget)
 	txq = netdev_get_tx_queue(vi->dev, index);
 	__netif_tx_lock(txq, raw_smp_processor_id());
 	virtqueue_disable_cb(sq->vq);
-	free_old_xmit(sq, true);
+	free_old_xmit(sq, txq, true);
 
 	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
 		netif_tx_wake_queue(txq);
@@ -2553,7 +2560,7 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct send_queue *sq = &vi->sq[qnum];
 	int err;
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
-	bool kick = !netdev_xmit_more();
+	bool xmit_more = netdev_xmit_more();
 	bool use_napi = sq->napi.weight;
 
 	/* Free up any pending old buffers before queueing new ones. */
@@ -2561,9 +2568,9 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (use_napi)
 			virtqueue_disable_cb(sq->vq);
 
-		free_old_xmit(sq, false);
+		free_old_xmit(sq, txq, false);
 
-	} while (use_napi && kick &&
+	} while (use_napi && !xmit_more &&
 	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
 
 	/* timestamp packet in software */
@@ -2592,7 +2599,7 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	check_sq_full_and_disable(vi, dev, sq);
 
-	if (kick || netif_xmit_stopped(txq)) {
+	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
 		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
 			u64_stats_update_begin(&sq->stats.syncp);
 			u64_stats_inc(&sq->stats.kicks);