diff mbox series

virtio_net: drain unconsumed tx completions if any before dql_reset

Message ID 20241126024200.2371546-1-koichiro.den@canonical.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series virtio_net: drain unconsumed tx completions if any before dql_reset | expand

Checks

Context Check Description
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers fail 1 blamed authors not CCed: jiri@resnulli.us; 1 maintainers not CCed: jiri@resnulli.us
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-11-29--03-00 (tests: 792)

Commit Message

Koichiro Den Nov. 26, 2024, 2:42 a.m. UTC
When virtnet_close is followed by virtnet_open, there is a slight chance
that some TX completions remain unconsumed. Those are handled during the
first NAPI poll, but since dql_reset occurs just beforehand, it can lead
to a crash [1].

This issue can be reproduced by running: `while :; do ip l set DEV down;
ip l set DEV up; done` under heavy network TX load from inside of the
machine.

To fix this, drain unconsumed TX completions if any before dql_reset,
allowing BQL to start cleanly.

------------[ cut here ]------------
kernel BUG at lib/dynamic_queue_limits.c:99!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
Tainted: [N]=TEST
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
RIP: 0010:dql_completed+0x26b/0x290
Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
PKRU: 55555554
Call Trace:
 <IRQ>
 ? die+0x32/0x80
 ? do_trap+0xd9/0x100
 ? dql_completed+0x26b/0x290
 ? dql_completed+0x26b/0x290
 ? do_error_trap+0x6d/0xb0
 ? dql_completed+0x26b/0x290
 ? exc_invalid_op+0x4c/0x60
 ? dql_completed+0x26b/0x290
 ? asm_exc_invalid_op+0x16/0x20
 ? dql_completed+0x26b/0x290
 __free_old_xmit+0xff/0x170 [virtio_net]
 free_old_xmit+0x54/0xc0 [virtio_net]
 virtnet_poll+0xf4/0xe30 [virtio_net]
 ? __update_load_avg_cfs_rq+0x264/0x2d0
 ? update_curr+0x35/0x260
 ? reweight_entity+0x1be/0x260
 __napi_poll.constprop.0+0x28/0x1c0
 net_rx_action+0x329/0x420
 ? enqueue_hrtimer+0x35/0x90
 ? trace_hardirqs_on+0x1d/0x80
 ? kvm_sched_clock_read+0xd/0x20
 ? sched_clock+0xc/0x30
 ? kvm_sched_clock_read+0xd/0x20
 ? sched_clock+0xc/0x30
 ? sched_clock_cpu+0xd/0x1a0
 handle_softirqs+0x138/0x3e0
 do_softirq.part.0+0x89/0xc0
 </IRQ>
 <TASK>
 __local_bh_enable_ip+0xa7/0xb0
 virtnet_open+0xc8/0x310 [virtio_net]
 __dev_open+0xfa/0x1b0
 __dev_change_flags+0x1de/0x250
 dev_change_flags+0x22/0x60
 do_setlink.isra.0+0x2df/0x10b0
 ? rtnetlink_rcv_msg+0x34f/0x3f0
 ? netlink_rcv_skb+0x54/0x100
 ? netlink_unicast+0x23e/0x390
 ? netlink_sendmsg+0x21e/0x490
 ? ____sys_sendmsg+0x31b/0x350
 ? avc_has_perm_noaudit+0x67/0xf0
 ? cred_has_capability.isra.0+0x75/0x110
 ? __nla_validate_parse+0x5f/0xee0
 ? __pfx___probestub_irq_enable+0x3/0x10
 ? __create_object+0x5e/0x90
 ? security_capable+0x3b/0x70
 rtnl_newlink+0x784/0xaf0
 ? avc_has_perm_noaudit+0x67/0xf0
 ? cred_has_capability.isra.0+0x75/0x110
 ? stack_depot_save_flags+0x24/0x6d0
 ? __pfx_rtnl_newlink+0x10/0x10
 rtnetlink_rcv_msg+0x34f/0x3f0
 ? do_syscall_64+0x6c/0x180
 ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
 ? __pfx_rtnetlink_rcv_msg+0x10/0x10
 netlink_rcv_skb+0x54/0x100
 netlink_unicast+0x23e/0x390
 netlink_sendmsg+0x21e/0x490
 ____sys_sendmsg+0x31b/0x350
 ? copy_msghdr_from_user+0x6d/0xa0
 ___sys_sendmsg+0x86/0xd0
 ? __pte_offset_map+0x17/0x160
 ? preempt_count_add+0x69/0xa0
 ? __call_rcu_common.constprop.0+0x147/0x610
 ? preempt_count_add+0x69/0xa0
 ? preempt_count_add+0x69/0xa0
 ? _raw_spin_trylock+0x13/0x60
 ? trace_hardirqs_on+0x1d/0x80
 __sys_sendmsg+0x66/0xc0
 do_syscall_64+0x6c/0x180
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f41defe5b34
Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
 </TASK>
[...]
---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
Cc: <stable@vger.kernel.org> # v6.11+
Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
---
 drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

Comments

Jason Wang Nov. 26, 2024, 3:50 a.m. UTC | #1
On Tue, Nov 26, 2024 at 10:42 AM Koichiro Den
<koichiro.den@canonical.com> wrote:
>
> When virtnet_close is followed by virtnet_open, there is a slight chance
> that some TX completions remain unconsumed. Those are handled during the
> first NAPI poll, but since dql_reset occurs just beforehand, it can lead
> to a crash [1].
>
> This issue can be reproduced by running: `while :; do ip l set DEV down;
> ip l set DEV up; done` under heavy network TX load from inside of the
> machine.
>
> To fix this, drain unconsumed TX completions if any before dql_reset,
> allowing BQL to start cleanly.
>
> ------------[ cut here ]------------
> kernel BUG at lib/dynamic_queue_limits.c:99!
> Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
> Tainted: [N]=TEST
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
> BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> RIP: 0010:dql_completed+0x26b/0x290
> Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
> 4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
> d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
> RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
> RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
> RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
> RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
> R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
> R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
> FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
> knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
> PKRU: 55555554
> Call Trace:
>  <IRQ>
>  ? die+0x32/0x80
>  ? do_trap+0xd9/0x100
>  ? dql_completed+0x26b/0x290
>  ? dql_completed+0x26b/0x290
>  ? do_error_trap+0x6d/0xb0
>  ? dql_completed+0x26b/0x290
>  ? exc_invalid_op+0x4c/0x60
>  ? dql_completed+0x26b/0x290
>  ? asm_exc_invalid_op+0x16/0x20
>  ? dql_completed+0x26b/0x290
>  __free_old_xmit+0xff/0x170 [virtio_net]
>  free_old_xmit+0x54/0xc0 [virtio_net]
>  virtnet_poll+0xf4/0xe30 [virtio_net]
>  ? __update_load_avg_cfs_rq+0x264/0x2d0
>  ? update_curr+0x35/0x260
>  ? reweight_entity+0x1be/0x260
>  __napi_poll.constprop.0+0x28/0x1c0
>  net_rx_action+0x329/0x420
>  ? enqueue_hrtimer+0x35/0x90
>  ? trace_hardirqs_on+0x1d/0x80
>  ? kvm_sched_clock_read+0xd/0x20
>  ? sched_clock+0xc/0x30
>  ? kvm_sched_clock_read+0xd/0x20
>  ? sched_clock+0xc/0x30
>  ? sched_clock_cpu+0xd/0x1a0
>  handle_softirqs+0x138/0x3e0
>  do_softirq.part.0+0x89/0xc0
>  </IRQ>
>  <TASK>
>  __local_bh_enable_ip+0xa7/0xb0
>  virtnet_open+0xc8/0x310 [virtio_net]
>  __dev_open+0xfa/0x1b0
>  __dev_change_flags+0x1de/0x250
>  dev_change_flags+0x22/0x60
>  do_setlink.isra.0+0x2df/0x10b0
>  ? rtnetlink_rcv_msg+0x34f/0x3f0
>  ? netlink_rcv_skb+0x54/0x100
>  ? netlink_unicast+0x23e/0x390
>  ? netlink_sendmsg+0x21e/0x490
>  ? ____sys_sendmsg+0x31b/0x350
>  ? avc_has_perm_noaudit+0x67/0xf0
>  ? cred_has_capability.isra.0+0x75/0x110
>  ? __nla_validate_parse+0x5f/0xee0
>  ? __pfx___probestub_irq_enable+0x3/0x10
>  ? __create_object+0x5e/0x90
>  ? security_capable+0x3b/0x70
>  rtnl_newlink+0x784/0xaf0
>  ? avc_has_perm_noaudit+0x67/0xf0
>  ? cred_has_capability.isra.0+0x75/0x110
>  ? stack_depot_save_flags+0x24/0x6d0
>  ? __pfx_rtnl_newlink+0x10/0x10
>  rtnetlink_rcv_msg+0x34f/0x3f0
>  ? do_syscall_64+0x6c/0x180
>  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
>  ? __pfx_rtnetlink_rcv_msg+0x10/0x10
>  netlink_rcv_skb+0x54/0x100
>  netlink_unicast+0x23e/0x390
>  netlink_sendmsg+0x21e/0x490
>  ____sys_sendmsg+0x31b/0x350
>  ? copy_msghdr_from_user+0x6d/0xa0
>  ___sys_sendmsg+0x86/0xd0
>  ? __pte_offset_map+0x17/0x160
>  ? preempt_count_add+0x69/0xa0
>  ? __call_rcu_common.constprop.0+0x147/0x610
>  ? preempt_count_add+0x69/0xa0
>  ? preempt_count_add+0x69/0xa0
>  ? _raw_spin_trylock+0x13/0x60
>  ? trace_hardirqs_on+0x1d/0x80
>  __sys_sendmsg+0x66/0xc0
>  do_syscall_64+0x6c/0x180
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> RIP: 0033:0x7f41defe5b34
> Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
> f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
> f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
> RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
> RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
> RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
> RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
> R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
> R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
>  </TASK>
> [...]
> ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
>
> Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
> Cc: <stable@vger.kernel.org> # v6.11+
> Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
> ---
>  drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
>  1 file changed, 29 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 64c87bb48a41..3e36c0470600 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -513,7 +513,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
>                                                struct sk_buff *curr_skb,
>                                                struct page *page, void *buf,
>                                                int len, int truesize);
> -static void virtnet_xsk_completed(struct send_queue *sq, int num);
> +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
>
>  enum virtnet_xmit_type {
>         VIRTNET_XMIT_TYPE_SKB,
> @@ -580,7 +580,8 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
>  }
>
>  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> -                           bool in_napi, struct virtnet_sq_free_stats *stats)
> +                           bool in_napi, struct virtnet_sq_free_stats *stats,
> +                           bool drain)
>  {
>         struct xdp_frame *frame;
>         struct sk_buff *skb;
> @@ -620,7 +621,8 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
>                         break;
>                 }
>         }
> -       netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> +       if (!drain)
> +               netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
>  }
>
>  static void virtnet_free_old_xmit(struct send_queue *sq,
> @@ -628,10 +630,21 @@ static void virtnet_free_old_xmit(struct send_queue *sq,
>                                   bool in_napi,
>                                   struct virtnet_sq_free_stats *stats)
>  {
> -       __free_old_xmit(sq, txq, in_napi, stats);
> +       __free_old_xmit(sq, txq, in_napi, stats, false);
>
>         if (stats->xsk)
> -               virtnet_xsk_completed(sq, stats->xsk);
> +               virtnet_xsk_completed(sq, stats->xsk, false);
> +}
> +
> +static void virtnet_drain_old_xmit(struct send_queue *sq,
> +                                  struct netdev_queue *txq)
> +{
> +       struct virtnet_sq_free_stats stats = {0};
> +
> +       __free_old_xmit(sq, txq, false, &stats, true);
> +
> +       if (stats.xsk)
> +               virtnet_xsk_completed(sq, stats.xsk, true);
>  }

Are we sure this can drain the queue? Note that the device is not stopped.

>
>  /* Converting between virtqueue no. and kernel tx/rx queue no.
> @@ -1499,7 +1512,8 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
>         /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
>          * free_old_xmit().
>          */
> -       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> +       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
> +                       &stats, false);
>
>         if (stats.xsk)
>                 xsk_tx_completed(sq->xsk_pool, stats.xsk);
> @@ -1556,10 +1570,13 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
>         return 0;
>  }
>
> -static void virtnet_xsk_completed(struct send_queue *sq, int num)
> +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
>  {
>         xsk_tx_completed(sq->xsk_pool, num);
>
> +       if (drain)
> +               return;
> +
>         /* If this is called by rx poll, start_xmit and xdp xmit we should
>          * wakeup the tx napi to consume the xsk tx queue, because the tx
>          * interrupt may not be triggered.
> @@ -3041,6 +3058,7 @@ static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
>
>  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
>  {
> +       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
>         struct net_device *dev = vi->dev;
>         int err;
>
> @@ -3054,7 +3072,10 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
>         if (err < 0)
>                 goto err_xdp_reg_mem_model;
>
> -       netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
> +       /* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
> +       virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
> +
> +       netdev_tx_reset_queue(txq);
>         virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
>         virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
>
> --
> 2.43.0
>
>

Thanks
Koichiro Den Nov. 26, 2024, 4:44 a.m. UTC | #2
On Tue, Nov 26, 2024 at 11:50:17AM +0800, Jason Wang wrote:
> On Tue, Nov 26, 2024 at 10:42 AM Koichiro Den
> <koichiro.den@canonical.com> wrote:
> >
> > When virtnet_close is followed by virtnet_open, there is a slight chance
> > that some TX completions remain unconsumed. Those are handled during the
> > first NAPI poll, but since dql_reset occurs just beforehand, it can lead
> > to a crash [1].
> >
> > This issue can be reproduced by running: `while :; do ip l set DEV down;
> > ip l set DEV up; done` under heavy network TX load from inside of the
> > machine.
> >
> > To fix this, drain unconsumed TX completions if any before dql_reset,
> > allowing BQL to start cleanly.
> >
> > ------------[ cut here ]------------
> > kernel BUG at lib/dynamic_queue_limits.c:99!
> > Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
> > Tainted: [N]=TEST
> > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
> > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> > RIP: 0010:dql_completed+0x26b/0x290
> > Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
> > 4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
> > d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
> > RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
> > RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
> > RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
> > RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
> > R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
> > R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
> > FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
> > knlGS:0000000000000000
> > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
> > PKRU: 55555554
> > Call Trace:
> >  <IRQ>
> >  ? die+0x32/0x80
> >  ? do_trap+0xd9/0x100
> >  ? dql_completed+0x26b/0x290
> >  ? dql_completed+0x26b/0x290
> >  ? do_error_trap+0x6d/0xb0
> >  ? dql_completed+0x26b/0x290
> >  ? exc_invalid_op+0x4c/0x60
> >  ? dql_completed+0x26b/0x290
> >  ? asm_exc_invalid_op+0x16/0x20
> >  ? dql_completed+0x26b/0x290
> >  __free_old_xmit+0xff/0x170 [virtio_net]
> >  free_old_xmit+0x54/0xc0 [virtio_net]
> >  virtnet_poll+0xf4/0xe30 [virtio_net]
> >  ? __update_load_avg_cfs_rq+0x264/0x2d0
> >  ? update_curr+0x35/0x260
> >  ? reweight_entity+0x1be/0x260
> >  __napi_poll.constprop.0+0x28/0x1c0
> >  net_rx_action+0x329/0x420
> >  ? enqueue_hrtimer+0x35/0x90
> >  ? trace_hardirqs_on+0x1d/0x80
> >  ? kvm_sched_clock_read+0xd/0x20
> >  ? sched_clock+0xc/0x30
> >  ? kvm_sched_clock_read+0xd/0x20
> >  ? sched_clock+0xc/0x30
> >  ? sched_clock_cpu+0xd/0x1a0
> >  handle_softirqs+0x138/0x3e0
> >  do_softirq.part.0+0x89/0xc0
> >  </IRQ>
> >  <TASK>
> >  __local_bh_enable_ip+0xa7/0xb0
> >  virtnet_open+0xc8/0x310 [virtio_net]
> >  __dev_open+0xfa/0x1b0
> >  __dev_change_flags+0x1de/0x250
> >  dev_change_flags+0x22/0x60
> >  do_setlink.isra.0+0x2df/0x10b0
> >  ? rtnetlink_rcv_msg+0x34f/0x3f0
> >  ? netlink_rcv_skb+0x54/0x100
> >  ? netlink_unicast+0x23e/0x390
> >  ? netlink_sendmsg+0x21e/0x490
> >  ? ____sys_sendmsg+0x31b/0x350
> >  ? avc_has_perm_noaudit+0x67/0xf0
> >  ? cred_has_capability.isra.0+0x75/0x110
> >  ? __nla_validate_parse+0x5f/0xee0
> >  ? __pfx___probestub_irq_enable+0x3/0x10
> >  ? __create_object+0x5e/0x90
> >  ? security_capable+0x3b/0x70
> >  rtnl_newlink+0x784/0xaf0
> >  ? avc_has_perm_noaudit+0x67/0xf0
> >  ? cred_has_capability.isra.0+0x75/0x110
> >  ? stack_depot_save_flags+0x24/0x6d0
> >  ? __pfx_rtnl_newlink+0x10/0x10
> >  rtnetlink_rcv_msg+0x34f/0x3f0
> >  ? do_syscall_64+0x6c/0x180
> >  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
> >  ? __pfx_rtnetlink_rcv_msg+0x10/0x10
> >  netlink_rcv_skb+0x54/0x100
> >  netlink_unicast+0x23e/0x390
> >  netlink_sendmsg+0x21e/0x490
> >  ____sys_sendmsg+0x31b/0x350
> >  ? copy_msghdr_from_user+0x6d/0xa0
> >  ___sys_sendmsg+0x86/0xd0
> >  ? __pte_offset_map+0x17/0x160
> >  ? preempt_count_add+0x69/0xa0
> >  ? __call_rcu_common.constprop.0+0x147/0x610
> >  ? preempt_count_add+0x69/0xa0
> >  ? preempt_count_add+0x69/0xa0
> >  ? _raw_spin_trylock+0x13/0x60
> >  ? trace_hardirqs_on+0x1d/0x80
> >  __sys_sendmsg+0x66/0xc0
> >  do_syscall_64+0x6c/0x180
> >  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > RIP: 0033:0x7f41defe5b34
> > Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
> > f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
> > f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
> > RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
> > RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
> > RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
> > RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
> > R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
> > R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
> >  </TASK>
> > [...]
> > ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
> >
> > Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
> > Cc: <stable@vger.kernel.org> # v6.11+
> > Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
> > ---
> >  drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
> >  1 file changed, 29 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 64c87bb48a41..3e36c0470600 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -513,7 +513,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> >                                                struct sk_buff *curr_skb,
> >                                                struct page *page, void *buf,
> >                                                int len, int truesize);
> > -static void virtnet_xsk_completed(struct send_queue *sq, int num);
> > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
> >
> >  enum virtnet_xmit_type {
> >         VIRTNET_XMIT_TYPE_SKB,
> > @@ -580,7 +580,8 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> >  }
> >
> >  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > -                           bool in_napi, struct virtnet_sq_free_stats *stats)
> > +                           bool in_napi, struct virtnet_sq_free_stats *stats,
> > +                           bool drain)
> >  {
> >         struct xdp_frame *frame;
> >         struct sk_buff *skb;
> > @@ -620,7 +621,8 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> >                         break;
> >                 }
> >         }
> > -       netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > +       if (!drain)
> > +               netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> >  }
> >
> >  static void virtnet_free_old_xmit(struct send_queue *sq,
> > @@ -628,10 +630,21 @@ static void virtnet_free_old_xmit(struct send_queue *sq,
> >                                   bool in_napi,
> >                                   struct virtnet_sq_free_stats *stats)
> >  {
> > -       __free_old_xmit(sq, txq, in_napi, stats);
> > +       __free_old_xmit(sq, txq, in_napi, stats, false);
> >
> >         if (stats->xsk)
> > -               virtnet_xsk_completed(sq, stats->xsk);
> > +               virtnet_xsk_completed(sq, stats->xsk, false);
> > +}
> > +
> > +static void virtnet_drain_old_xmit(struct send_queue *sq,
> > +                                  struct netdev_queue *txq)
> > +{
> > +       struct virtnet_sq_free_stats stats = {0};
> > +
> > +       __free_old_xmit(sq, txq, false, &stats, true);
> > +
> > +       if (stats.xsk)
> > +               virtnet_xsk_completed(sq, stats.xsk, true);
> >  }
> 
> Are we sure this can drain the queue? Note that the device is not stopped.

Thanks for reviewing. netif_tx_wake_queue can be invoked before the "drain"
point I added e.g. via virtnet_config_changed_work, so it seems that I need
to ensure it's stopped (DRV_XOFF) before the "drain" and wake it afterwards.
Please let me know if I’m mistaken.

> 
> >
> >  /* Converting between virtqueue no. and kernel tx/rx queue no.
> > @@ -1499,7 +1512,8 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> >         /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
> >          * free_old_xmit().
> >          */
> > -       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> > +       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
> > +                       &stats, false);
> >
> >         if (stats.xsk)
> >                 xsk_tx_completed(sq->xsk_pool, stats.xsk);
> > @@ -1556,10 +1570,13 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> >         return 0;
> >  }
> >
> > -static void virtnet_xsk_completed(struct send_queue *sq, int num)
> > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
> >  {
> >         xsk_tx_completed(sq->xsk_pool, num);
> >
> > +       if (drain)
> > +               return;
> > +
> >         /* If this is called by rx poll, start_xmit and xdp xmit we should
> >          * wakeup the tx napi to consume the xsk tx queue, because the tx
> >          * interrupt may not be triggered.
> > @@ -3041,6 +3058,7 @@ static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
> >
> >  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> >  {
> > +       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
> >         struct net_device *dev = vi->dev;
> >         int err;
> >
> > @@ -3054,7 +3072,10 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> >         if (err < 0)
> >                 goto err_xdp_reg_mem_model;
> >
> > -       netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
> > +       /* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
> > +       virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
> > +
> > +       netdev_tx_reset_queue(txq);
> >         virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
> >         virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
> >
> > --
> > 2.43.0
> >
> >
> 
> Thanks
>
Jason Wang Nov. 27, 2024, 3:24 a.m. UTC | #3
On Tue, Nov 26, 2024 at 12:44 PM Koichiro Den
<koichiro.den@canonical.com> wrote:
>
> On Tue, Nov 26, 2024 at 11:50:17AM +0800, Jason Wang wrote:
> > On Tue, Nov 26, 2024 at 10:42 AM Koichiro Den
> > <koichiro.den@canonical.com> wrote:
> > >
> > > When virtnet_close is followed by virtnet_open, there is a slight chance
> > > that some TX completions remain unconsumed. Those are handled during the
> > > first NAPI poll, but since dql_reset occurs just beforehand, it can lead
> > > to a crash [1].
> > >
> > > This issue can be reproduced by running: `while :; do ip l set DEV down;
> > > ip l set DEV up; done` under heavy network TX load from inside of the
> > > machine.
> > >
> > > To fix this, drain unconsumed TX completions if any before dql_reset,
> > > allowing BQL to start cleanly.
> > >
> > > ------------[ cut here ]------------
> > > kernel BUG at lib/dynamic_queue_limits.c:99!
> > > Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > > CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
> > > Tainted: [N]=TEST
> > > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
> > > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> > > RIP: 0010:dql_completed+0x26b/0x290
> > > Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
> > > 4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
> > > d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
> > > RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
> > > RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
> > > RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
> > > RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
> > > R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
> > > R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
> > > FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
> > > knlGS:0000000000000000
> > > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
> > > PKRU: 55555554
> > > Call Trace:
> > >  <IRQ>
> > >  ? die+0x32/0x80
> > >  ? do_trap+0xd9/0x100
> > >  ? dql_completed+0x26b/0x290
> > >  ? dql_completed+0x26b/0x290
> > >  ? do_error_trap+0x6d/0xb0
> > >  ? dql_completed+0x26b/0x290
> > >  ? exc_invalid_op+0x4c/0x60
> > >  ? dql_completed+0x26b/0x290
> > >  ? asm_exc_invalid_op+0x16/0x20
> > >  ? dql_completed+0x26b/0x290
> > >  __free_old_xmit+0xff/0x170 [virtio_net]
> > >  free_old_xmit+0x54/0xc0 [virtio_net]
> > >  virtnet_poll+0xf4/0xe30 [virtio_net]
> > >  ? __update_load_avg_cfs_rq+0x264/0x2d0
> > >  ? update_curr+0x35/0x260
> > >  ? reweight_entity+0x1be/0x260
> > >  __napi_poll.constprop.0+0x28/0x1c0
> > >  net_rx_action+0x329/0x420
> > >  ? enqueue_hrtimer+0x35/0x90
> > >  ? trace_hardirqs_on+0x1d/0x80
> > >  ? kvm_sched_clock_read+0xd/0x20
> > >  ? sched_clock+0xc/0x30
> > >  ? kvm_sched_clock_read+0xd/0x20
> > >  ? sched_clock+0xc/0x30
> > >  ? sched_clock_cpu+0xd/0x1a0
> > >  handle_softirqs+0x138/0x3e0
> > >  do_softirq.part.0+0x89/0xc0
> > >  </IRQ>
> > >  <TASK>
> > >  __local_bh_enable_ip+0xa7/0xb0
> > >  virtnet_open+0xc8/0x310 [virtio_net]
> > >  __dev_open+0xfa/0x1b0
> > >  __dev_change_flags+0x1de/0x250
> > >  dev_change_flags+0x22/0x60
> > >  do_setlink.isra.0+0x2df/0x10b0
> > >  ? rtnetlink_rcv_msg+0x34f/0x3f0
> > >  ? netlink_rcv_skb+0x54/0x100
> > >  ? netlink_unicast+0x23e/0x390
> > >  ? netlink_sendmsg+0x21e/0x490
> > >  ? ____sys_sendmsg+0x31b/0x350
> > >  ? avc_has_perm_noaudit+0x67/0xf0
> > >  ? cred_has_capability.isra.0+0x75/0x110
> > >  ? __nla_validate_parse+0x5f/0xee0
> > >  ? __pfx___probestub_irq_enable+0x3/0x10
> > >  ? __create_object+0x5e/0x90
> > >  ? security_capable+0x3b/0x70
> > >  rtnl_newlink+0x784/0xaf0
> > >  ? avc_has_perm_noaudit+0x67/0xf0
> > >  ? cred_has_capability.isra.0+0x75/0x110
> > >  ? stack_depot_save_flags+0x24/0x6d0
> > >  ? __pfx_rtnl_newlink+0x10/0x10
> > >  rtnetlink_rcv_msg+0x34f/0x3f0
> > >  ? do_syscall_64+0x6c/0x180
> > >  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > >  ? __pfx_rtnetlink_rcv_msg+0x10/0x10
> > >  netlink_rcv_skb+0x54/0x100
> > >  netlink_unicast+0x23e/0x390
> > >  netlink_sendmsg+0x21e/0x490
> > >  ____sys_sendmsg+0x31b/0x350
> > >  ? copy_msghdr_from_user+0x6d/0xa0
> > >  ___sys_sendmsg+0x86/0xd0
> > >  ? __pte_offset_map+0x17/0x160
> > >  ? preempt_count_add+0x69/0xa0
> > >  ? __call_rcu_common.constprop.0+0x147/0x610
> > >  ? preempt_count_add+0x69/0xa0
> > >  ? preempt_count_add+0x69/0xa0
> > >  ? _raw_spin_trylock+0x13/0x60
> > >  ? trace_hardirqs_on+0x1d/0x80
> > >  __sys_sendmsg+0x66/0xc0
> > >  do_syscall_64+0x6c/0x180
> > >  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > RIP: 0033:0x7f41defe5b34
> > > Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
> > > f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
> > > f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
> > > RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
> > > RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
> > > RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
> > > RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
> > > R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
> > > R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
> > >  </TASK>
> > > [...]
> > > ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
> > >
> > > Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
> > > Cc: <stable@vger.kernel.org> # v6.11+
> > > Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
> > > ---
> > >  drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
> > >  1 file changed, 29 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 64c87bb48a41..3e36c0470600 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -513,7 +513,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> > >                                                struct sk_buff *curr_skb,
> > >                                                struct page *page, void *buf,
> > >                                                int len, int truesize);
> > > -static void virtnet_xsk_completed(struct send_queue *sq, int num);
> > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
> > >
> > >  enum virtnet_xmit_type {
> > >         VIRTNET_XMIT_TYPE_SKB,
> > > @@ -580,7 +580,8 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > >  }
> > >
> > >  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > -                           bool in_napi, struct virtnet_sq_free_stats *stats)
> > > +                           bool in_napi, struct virtnet_sq_free_stats *stats,
> > > +                           bool drain)
> > >  {
> > >         struct xdp_frame *frame;
> > >         struct sk_buff *skb;
> > > @@ -620,7 +621,8 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > >                         break;
> > >                 }
> > >         }
> > > -       netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > +       if (!drain)
> > > +               netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > >  }
> > >
> > >  static void virtnet_free_old_xmit(struct send_queue *sq,
> > > @@ -628,10 +630,21 @@ static void virtnet_free_old_xmit(struct send_queue *sq,
> > >                                   bool in_napi,
> > >                                   struct virtnet_sq_free_stats *stats)
> > >  {
> > > -       __free_old_xmit(sq, txq, in_napi, stats);
> > > +       __free_old_xmit(sq, txq, in_napi, stats, false);
> > >
> > >         if (stats->xsk)
> > > -               virtnet_xsk_completed(sq, stats->xsk);
> > > +               virtnet_xsk_completed(sq, stats->xsk, false);
> > > +}
> > > +
> > > +static void virtnet_drain_old_xmit(struct send_queue *sq,
> > > +                                  struct netdev_queue *txq)
> > > +{
> > > +       struct virtnet_sq_free_stats stats = {0};
> > > +
> > > +       __free_old_xmit(sq, txq, false, &stats, true);
> > > +
> > > +       if (stats.xsk)
> > > +               virtnet_xsk_completed(sq, stats.xsk, true);
> > >  }
> >
> > Are we sure this can drain the queue? Note that the device is not stopped.
>
> Thanks for reviewing. netif_tx_wake_queue can be invoked before the "drain"
> point I added e.g. via virtnet_config_changed_work, so it seems that I need
> to ensure it's stopped (DRV_XOFF) before the "drain" and wake it afterwards.
> Please let me know if I’m mistaken.

Not sure I get you, but I meant we don't reset the device so it can
keep raising tx interrupts:

virtnet_drain_old_xmit()
netdev_tx_reset_queue()
skb_xmit_done()
napi_enable()
netdev_tx_completed_queue() // here we might still surprise the bql?

Thanks

>
> >
> > >
> > >  /* Converting between virtqueue no. and kernel tx/rx queue no.
> > > @@ -1499,7 +1512,8 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> > >         /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
> > >          * free_old_xmit().
> > >          */
> > > -       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> > > +       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
> > > +                       &stats, false);
> > >
> > >         if (stats.xsk)
> > >                 xsk_tx_completed(sq->xsk_pool, stats.xsk);
> > > @@ -1556,10 +1570,13 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> > >         return 0;
> > >  }
> > >
> > > -static void virtnet_xsk_completed(struct send_queue *sq, int num)
> > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
> > >  {
> > >         xsk_tx_completed(sq->xsk_pool, num);
> > >
> > > +       if (drain)
> > > +               return;
> > > +
> > >         /* If this is called by rx poll, start_xmit and xdp xmit we should
> > >          * wakeup the tx napi to consume the xsk tx queue, because the tx
> > >          * interrupt may not be triggered.
> > > @@ -3041,6 +3058,7 @@ static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
> > >
> > >  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > >  {
> > > +       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
> > >         struct net_device *dev = vi->dev;
> > >         int err;
> > >
> > > @@ -3054,7 +3072,10 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > >         if (err < 0)
> > >                 goto err_xdp_reg_mem_model;
> > >
> > > -       netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
> > > +       /* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
> > > +       virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
> > > +
> > > +       netdev_tx_reset_queue(txq);
> > >         virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
> > >         virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
> > >
> > > --
> > > 2.43.0
> > >
> > >
> >
> > Thanks
> >
>
Koichiro Den Nov. 27, 2024, 4:07 a.m. UTC | #4
On Wed, Nov 27, 2024 at 11:24:15AM +0800, Jason Wang wrote:
> On Tue, Nov 26, 2024 at 12:44 PM Koichiro Den
> <koichiro.den@canonical.com> wrote:
> >
> > On Tue, Nov 26, 2024 at 11:50:17AM +0800, Jason Wang wrote:
> > > On Tue, Nov 26, 2024 at 10:42 AM Koichiro Den
> > > <koichiro.den@canonical.com> wrote:
> > > >
> > > > When virtnet_close is followed by virtnet_open, there is a slight chance
> > > > that some TX completions remain unconsumed. Those are handled during the
> > > > first NAPI poll, but since dql_reset occurs just beforehand, it can lead
> > > > to a crash [1].
> > > >
> > > > This issue can be reproduced by running: `while :; do ip l set DEV down;
> > > > ip l set DEV up; done` under heavy network TX load from inside of the
> > > > machine.
> > > >
> > > > To fix this, drain unconsumed TX completions if any before dql_reset,
> > > > allowing BQL to start cleanly.
> > > >
> > > > ------------[ cut here ]------------
> > > > kernel BUG at lib/dynamic_queue_limits.c:99!
> > > > Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > > > CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
> > > > Tainted: [N]=TEST
> > > > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
> > > > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> > > > RIP: 0010:dql_completed+0x26b/0x290
> > > > Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
> > > > 4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
> > > > d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
> > > > RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
> > > > RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
> > > > RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
> > > > RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
> > > > R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
> > > > R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
> > > > FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
> > > > knlGS:0000000000000000
> > > > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
> > > > PKRU: 55555554
> > > > Call Trace:
> > > >  <IRQ>
> > > >  ? die+0x32/0x80
> > > >  ? do_trap+0xd9/0x100
> > > >  ? dql_completed+0x26b/0x290
> > > >  ? dql_completed+0x26b/0x290
> > > >  ? do_error_trap+0x6d/0xb0
> > > >  ? dql_completed+0x26b/0x290
> > > >  ? exc_invalid_op+0x4c/0x60
> > > >  ? dql_completed+0x26b/0x290
> > > >  ? asm_exc_invalid_op+0x16/0x20
> > > >  ? dql_completed+0x26b/0x290
> > > >  __free_old_xmit+0xff/0x170 [virtio_net]
> > > >  free_old_xmit+0x54/0xc0 [virtio_net]
> > > >  virtnet_poll+0xf4/0xe30 [virtio_net]
> > > >  ? __update_load_avg_cfs_rq+0x264/0x2d0
> > > >  ? update_curr+0x35/0x260
> > > >  ? reweight_entity+0x1be/0x260
> > > >  __napi_poll.constprop.0+0x28/0x1c0
> > > >  net_rx_action+0x329/0x420
> > > >  ? enqueue_hrtimer+0x35/0x90
> > > >  ? trace_hardirqs_on+0x1d/0x80
> > > >  ? kvm_sched_clock_read+0xd/0x20
> > > >  ? sched_clock+0xc/0x30
> > > >  ? kvm_sched_clock_read+0xd/0x20
> > > >  ? sched_clock+0xc/0x30
> > > >  ? sched_clock_cpu+0xd/0x1a0
> > > >  handle_softirqs+0x138/0x3e0
> > > >  do_softirq.part.0+0x89/0xc0
> > > >  </IRQ>
> > > >  <TASK>
> > > >  __local_bh_enable_ip+0xa7/0xb0
> > > >  virtnet_open+0xc8/0x310 [virtio_net]
> > > >  __dev_open+0xfa/0x1b0
> > > >  __dev_change_flags+0x1de/0x250
> > > >  dev_change_flags+0x22/0x60
> > > >  do_setlink.isra.0+0x2df/0x10b0
> > > >  ? rtnetlink_rcv_msg+0x34f/0x3f0
> > > >  ? netlink_rcv_skb+0x54/0x100
> > > >  ? netlink_unicast+0x23e/0x390
> > > >  ? netlink_sendmsg+0x21e/0x490
> > > >  ? ____sys_sendmsg+0x31b/0x350
> > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > >  ? __nla_validate_parse+0x5f/0xee0
> > > >  ? __pfx___probestub_irq_enable+0x3/0x10
> > > >  ? __create_object+0x5e/0x90
> > > >  ? security_capable+0x3b/0x70
> > > >  rtnl_newlink+0x784/0xaf0
> > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > >  ? stack_depot_save_flags+0x24/0x6d0
> > > >  ? __pfx_rtnl_newlink+0x10/0x10
> > > >  rtnetlink_rcv_msg+0x34f/0x3f0
> > > >  ? do_syscall_64+0x6c/0x180
> > > >  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > >  ? __pfx_rtnetlink_rcv_msg+0x10/0x10
> > > >  netlink_rcv_skb+0x54/0x100
> > > >  netlink_unicast+0x23e/0x390
> > > >  netlink_sendmsg+0x21e/0x490
> > > >  ____sys_sendmsg+0x31b/0x350
> > > >  ? copy_msghdr_from_user+0x6d/0xa0
> > > >  ___sys_sendmsg+0x86/0xd0
> > > >  ? __pte_offset_map+0x17/0x160
> > > >  ? preempt_count_add+0x69/0xa0
> > > >  ? __call_rcu_common.constprop.0+0x147/0x610
> > > >  ? preempt_count_add+0x69/0xa0
> > > >  ? preempt_count_add+0x69/0xa0
> > > >  ? _raw_spin_trylock+0x13/0x60
> > > >  ? trace_hardirqs_on+0x1d/0x80
> > > >  __sys_sendmsg+0x66/0xc0
> > > >  do_syscall_64+0x6c/0x180
> > > >  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > RIP: 0033:0x7f41defe5b34
> > > > Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
> > > > f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
> > > > f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
> > > > RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
> > > > RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
> > > > RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
> > > > RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
> > > > R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
> > > > R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
> > > >  </TASK>
> > > > [...]
> > > > ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
> > > >
> > > > Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
> > > > Cc: <stable@vger.kernel.org> # v6.11+
> > > > Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
> > > > ---
> > > >  drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
> > > >  1 file changed, 29 insertions(+), 8 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 64c87bb48a41..3e36c0470600 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -513,7 +513,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> > > >                                                struct sk_buff *curr_skb,
> > > >                                                struct page *page, void *buf,
> > > >                                                int len, int truesize);
> > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num);
> > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
> > > >
> > > >  enum virtnet_xmit_type {
> > > >         VIRTNET_XMIT_TYPE_SKB,
> > > > @@ -580,7 +580,8 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > > >  }
> > > >
> > > >  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > -                           bool in_napi, struct virtnet_sq_free_stats *stats)
> > > > +                           bool in_napi, struct virtnet_sq_free_stats *stats,
> > > > +                           bool drain)
> > > >  {
> > > >         struct xdp_frame *frame;
> > > >         struct sk_buff *skb;
> > > > @@ -620,7 +621,8 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > >                         break;
> > > >                 }
> > > >         }
> > > > -       netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > +       if (!drain)
> > > > +               netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > >  }
> > > >
> > > >  static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > @@ -628,10 +630,21 @@ static void virtnet_free_old_xmit(struct send_queue *sq,
> > > >                                   bool in_napi,
> > > >                                   struct virtnet_sq_free_stats *stats)
> > > >  {
> > > > -       __free_old_xmit(sq, txq, in_napi, stats);
> > > > +       __free_old_xmit(sq, txq, in_napi, stats, false);
> > > >
> > > >         if (stats->xsk)
> > > > -               virtnet_xsk_completed(sq, stats->xsk);
> > > > +               virtnet_xsk_completed(sq, stats->xsk, false);
> > > > +}
> > > > +
> > > > +static void virtnet_drain_old_xmit(struct send_queue *sq,
> > > > +                                  struct netdev_queue *txq)
> > > > +{
> > > > +       struct virtnet_sq_free_stats stats = {0};
> > > > +
> > > > +       __free_old_xmit(sq, txq, false, &stats, true);
> > > > +
> > > > +       if (stats.xsk)
> > > > +               virtnet_xsk_completed(sq, stats.xsk, true);
> > > >  }
> > >
> > > Are we sure this can drain the queue? Note that the device is not stopped.
> >
> > Thanks for reviewing. netif_tx_wake_queue can be invoked before the "drain"
> > point I added e.g. via virtnet_config_changed_work, so it seems that I need
> > to ensure it's stopped (DRV_XOFF) before the "drain" and wake it afterwards.
> > Please let me know if I’m mistaken.
> 
> Not sure I get you, but I meant we don't reset the device so it can

I was wondering whether there would be a scenario where the tx queue is
woken up and some new packets from the upper layer reach dql_queued()
before the drain point, which also could cause the crash.

> keep raising tx interrupts:
> 
> virtnet_drain_old_xmit()
> netdev_tx_reset_queue()
> skb_xmit_done()
> napi_enable()
> netdev_tx_completed_queue() // here we might still surprise the bql?

Indeed, virtqueue_disable_cb() is needed before the drain point.

Thanks

> 
> Thanks
> 
> >
> > >
> > > >
> > > >  /* Converting between virtqueue no. and kernel tx/rx queue no.
> > > > @@ -1499,7 +1512,8 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> > > >         /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
> > > >          * free_old_xmit().
> > > >          */
> > > > -       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> > > > +       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
> > > > +                       &stats, false);
> > > >
> > > >         if (stats.xsk)
> > > >                 xsk_tx_completed(sq->xsk_pool, stats.xsk);
> > > > @@ -1556,10 +1570,13 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> > > >         return 0;
> > > >  }
> > > >
> > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num)
> > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
> > > >  {
> > > >         xsk_tx_completed(sq->xsk_pool, num);
> > > >
> > > > +       if (drain)
> > > > +               return;
> > > > +
> > > >         /* If this is called by rx poll, start_xmit and xdp xmit we should
> > > >          * wakeup the tx napi to consume the xsk tx queue, because the tx
> > > >          * interrupt may not be triggered.
> > > > @@ -3041,6 +3058,7 @@ static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > >
> > > >  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > >  {
> > > > +       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
> > > >         struct net_device *dev = vi->dev;
> > > >         int err;
> > > >
> > > > @@ -3054,7 +3072,10 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > >         if (err < 0)
> > > >                 goto err_xdp_reg_mem_model;
> > > >
> > > > -       netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
> > > > +       /* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
> > > > +       virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
> > > > +
> > > > +       netdev_tx_reset_queue(txq);
> > > >         virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
> > > >         virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
> > > >
> > > > --
> > > > 2.43.0
> > > >
> > > >
> > >
> > > Thanks
> > >
> >
>
Jason Wang Nov. 28, 2024, 2:57 a.m. UTC | #5
On Wed, Nov 27, 2024 at 12:08 PM Koichiro Den
<koichiro.den@canonical.com> wrote:
>
> On Wed, Nov 27, 2024 at 11:24:15AM +0800, Jason Wang wrote:
> > On Tue, Nov 26, 2024 at 12:44 PM Koichiro Den
> > <koichiro.den@canonical.com> wrote:
> > >
> > > On Tue, Nov 26, 2024 at 11:50:17AM +0800, Jason Wang wrote:
> > > > On Tue, Nov 26, 2024 at 10:42 AM Koichiro Den
> > > > <koichiro.den@canonical.com> wrote:
> > > > >
> > > > > When virtnet_close is followed by virtnet_open, there is a slight chance
> > > > > that some TX completions remain unconsumed. Those are handled during the
> > > > > first NAPI poll, but since dql_reset occurs just beforehand, it can lead
> > > > > to a crash [1].
> > > > >
> > > > > This issue can be reproduced by running: `while :; do ip l set DEV down;
> > > > > ip l set DEV up; done` under heavy network TX load from inside of the
> > > > > machine.
> > > > >
> > > > > To fix this, drain unconsumed TX completions if any before dql_reset,
> > > > > allowing BQL to start cleanly.
> > > > >
> > > > > ------------[ cut here ]------------
> > > > > kernel BUG at lib/dynamic_queue_limits.c:99!
> > > > > Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > > > > CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
> > > > > Tainted: [N]=TEST
> > > > > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
> > > > > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> > > > > RIP: 0010:dql_completed+0x26b/0x290
> > > > > Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
> > > > > 4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
> > > > > d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
> > > > > RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
> > > > > RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
> > > > > RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
> > > > > RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
> > > > > R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
> > > > > R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
> > > > > FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
> > > > > knlGS:0000000000000000
> > > > > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > > CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
> > > > > PKRU: 55555554
> > > > > Call Trace:
> > > > >  <IRQ>
> > > > >  ? die+0x32/0x80
> > > > >  ? do_trap+0xd9/0x100
> > > > >  ? dql_completed+0x26b/0x290
> > > > >  ? dql_completed+0x26b/0x290
> > > > >  ? do_error_trap+0x6d/0xb0
> > > > >  ? dql_completed+0x26b/0x290
> > > > >  ? exc_invalid_op+0x4c/0x60
> > > > >  ? dql_completed+0x26b/0x290
> > > > >  ? asm_exc_invalid_op+0x16/0x20
> > > > >  ? dql_completed+0x26b/0x290
> > > > >  __free_old_xmit+0xff/0x170 [virtio_net]
> > > > >  free_old_xmit+0x54/0xc0 [virtio_net]
> > > > >  virtnet_poll+0xf4/0xe30 [virtio_net]
> > > > >  ? __update_load_avg_cfs_rq+0x264/0x2d0
> > > > >  ? update_curr+0x35/0x260
> > > > >  ? reweight_entity+0x1be/0x260
> > > > >  __napi_poll.constprop.0+0x28/0x1c0
> > > > >  net_rx_action+0x329/0x420
> > > > >  ? enqueue_hrtimer+0x35/0x90
> > > > >  ? trace_hardirqs_on+0x1d/0x80
> > > > >  ? kvm_sched_clock_read+0xd/0x20
> > > > >  ? sched_clock+0xc/0x30
> > > > >  ? kvm_sched_clock_read+0xd/0x20
> > > > >  ? sched_clock+0xc/0x30
> > > > >  ? sched_clock_cpu+0xd/0x1a0
> > > > >  handle_softirqs+0x138/0x3e0
> > > > >  do_softirq.part.0+0x89/0xc0
> > > > >  </IRQ>
> > > > >  <TASK>
> > > > >  __local_bh_enable_ip+0xa7/0xb0
> > > > >  virtnet_open+0xc8/0x310 [virtio_net]
> > > > >  __dev_open+0xfa/0x1b0
> > > > >  __dev_change_flags+0x1de/0x250
> > > > >  dev_change_flags+0x22/0x60
> > > > >  do_setlink.isra.0+0x2df/0x10b0
> > > > >  ? rtnetlink_rcv_msg+0x34f/0x3f0
> > > > >  ? netlink_rcv_skb+0x54/0x100
> > > > >  ? netlink_unicast+0x23e/0x390
> > > > >  ? netlink_sendmsg+0x21e/0x490
> > > > >  ? ____sys_sendmsg+0x31b/0x350
> > > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > > >  ? __nla_validate_parse+0x5f/0xee0
> > > > >  ? __pfx___probestub_irq_enable+0x3/0x10
> > > > >  ? __create_object+0x5e/0x90
> > > > >  ? security_capable+0x3b/0x70
> > > > >  rtnl_newlink+0x784/0xaf0
> > > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > > >  ? stack_depot_save_flags+0x24/0x6d0
> > > > >  ? __pfx_rtnl_newlink+0x10/0x10
> > > > >  rtnetlink_rcv_msg+0x34f/0x3f0
> > > > >  ? do_syscall_64+0x6c/0x180
> > > > >  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > >  ? __pfx_rtnetlink_rcv_msg+0x10/0x10
> > > > >  netlink_rcv_skb+0x54/0x100
> > > > >  netlink_unicast+0x23e/0x390
> > > > >  netlink_sendmsg+0x21e/0x490
> > > > >  ____sys_sendmsg+0x31b/0x350
> > > > >  ? copy_msghdr_from_user+0x6d/0xa0
> > > > >  ___sys_sendmsg+0x86/0xd0
> > > > >  ? __pte_offset_map+0x17/0x160
> > > > >  ? preempt_count_add+0x69/0xa0
> > > > >  ? __call_rcu_common.constprop.0+0x147/0x610
> > > > >  ? preempt_count_add+0x69/0xa0
> > > > >  ? preempt_count_add+0x69/0xa0
> > > > >  ? _raw_spin_trylock+0x13/0x60
> > > > >  ? trace_hardirqs_on+0x1d/0x80
> > > > >  __sys_sendmsg+0x66/0xc0
> > > > >  do_syscall_64+0x6c/0x180
> > > > >  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > > RIP: 0033:0x7f41defe5b34
> > > > > Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
> > > > > f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
> > > > > f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
> > > > > RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
> > > > > RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
> > > > > RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
> > > > > RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
> > > > > R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
> > > > > R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
> > > > >  </TASK>
> > > > > [...]
> > > > > ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
> > > > >
> > > > > Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
> > > > > Cc: <stable@vger.kernel.org> # v6.11+
> > > > > Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
> > > > > ---
> > > > >  drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
> > > > >  1 file changed, 29 insertions(+), 8 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > index 64c87bb48a41..3e36c0470600 100644
> > > > > --- a/drivers/net/virtio_net.c
> > > > > +++ b/drivers/net/virtio_net.c
> > > > > @@ -513,7 +513,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> > > > >                                                struct sk_buff *curr_skb,
> > > > >                                                struct page *page, void *buf,
> > > > >                                                int len, int truesize);
> > > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num);
> > > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
> > > > >
> > > > >  enum virtnet_xmit_type {
> > > > >         VIRTNET_XMIT_TYPE_SKB,
> > > > > @@ -580,7 +580,8 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > > > >  }
> > > > >
> > > > >  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > > -                           bool in_napi, struct virtnet_sq_free_stats *stats)
> > > > > +                           bool in_napi, struct virtnet_sq_free_stats *stats,
> > > > > +                           bool drain)
> > > > >  {
> > > > >         struct xdp_frame *frame;
> > > > >         struct sk_buff *skb;
> > > > > @@ -620,7 +621,8 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > >                         break;
> > > > >                 }
> > > > >         }
> > > > > -       netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > > +       if (!drain)
> > > > > +               netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > >  }
> > > > >
> > > > >  static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > > @@ -628,10 +630,21 @@ static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > >                                   bool in_napi,
> > > > >                                   struct virtnet_sq_free_stats *stats)
> > > > >  {
> > > > > -       __free_old_xmit(sq, txq, in_napi, stats);
> > > > > +       __free_old_xmit(sq, txq, in_napi, stats, false);
> > > > >
> > > > >         if (stats->xsk)
> > > > > -               virtnet_xsk_completed(sq, stats->xsk);
> > > > > +               virtnet_xsk_completed(sq, stats->xsk, false);
> > > > > +}
> > > > > +
> > > > > +static void virtnet_drain_old_xmit(struct send_queue *sq,
> > > > > +                                  struct netdev_queue *txq)
> > > > > +{
> > > > > +       struct virtnet_sq_free_stats stats = {0};
> > > > > +
> > > > > +       __free_old_xmit(sq, txq, false, &stats, true);
> > > > > +
> > > > > +       if (stats.xsk)
> > > > > +               virtnet_xsk_completed(sq, stats.xsk, true);
> > > > >  }
> > > >
> > > > Are we sure this can drain the queue? Note that the device is not stopped.
> > >
> > > Thanks for reviewing. netif_tx_wake_queue can be invoked before the "drain"
> > > point I added e.g. via virtnet_config_changed_work, so it seems that I need
> > > to ensure it's stopped (DRV_XOFF) before the "drain" and wake it afterwards.
> > > Please let me know if I’m mistaken.
> >
> > Not sure I get you, but I meant we don't reset the device so it can
>
> I was wondering whether there would be a scenario where the tx queue is
> woken up and some new packets from the upper layer reach dql_queued()
> before the drain point, which also could cause the crash.

Ok.

>
> > keep raising tx interrupts:
> >
> > virtnet_drain_old_xmit()
> > netdev_tx_reset_queue()
> > skb_xmit_done()
> > napi_enable()
> > netdev_tx_completed_queue() // here we might still surprise the bql?
>
> Indeed, virtqueue_disable_cb() is needed before the drain point.

Two problems:

1) device/virtqueue is not reset, it can still process the packets
after virtnet_drain_old_xmit()
2) virtqueue_disable_cb() just does its best effort, it can't
guarantee no interrupt after that.

To drain TX, the only reliable seems to be:

1) reset a virtqueue (or a device)
2) drain by using free_old_xmit()
3) netif_reset_tx_queue() // btw this seems to be better done in close not open

Or I wonder if this can be easily fixed by just removing
netdev_tx_reset_queue()?

Thanks

>
> Thanks
>
> >
> > Thanks
> >
> > >
> > > >
> > > > >
> > > > >  /* Converting between virtqueue no. and kernel tx/rx queue no.
> > > > > @@ -1499,7 +1512,8 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> > > > >         /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
> > > > >          * free_old_xmit().
> > > > >          */
> > > > > -       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> > > > > +       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
> > > > > +                       &stats, false);
> > > > >
> > > > >         if (stats.xsk)
> > > > >                 xsk_tx_completed(sq->xsk_pool, stats.xsk);
> > > > > @@ -1556,10 +1570,13 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> > > > >         return 0;
> > > > >  }
> > > > >
> > > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num)
> > > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
> > > > >  {
> > > > >         xsk_tx_completed(sq->xsk_pool, num);
> > > > >
> > > > > +       if (drain)
> > > > > +               return;
> > > > > +
> > > > >         /* If this is called by rx poll, start_xmit and xdp xmit we should
> > > > >          * wakeup the tx napi to consume the xsk tx queue, because the tx
> > > > >          * interrupt may not be triggered.
> > > > > @@ -3041,6 +3058,7 @@ static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > >
> > > > >  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > >  {
> > > > > +       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
> > > > >         struct net_device *dev = vi->dev;
> > > > >         int err;
> > > > >
> > > > > @@ -3054,7 +3072,10 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > >         if (err < 0)
> > > > >                 goto err_xdp_reg_mem_model;
> > > > >
> > > > > -       netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
> > > > > +       /* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
> > > > > +       virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
> > > > > +
> > > > > +       netdev_tx_reset_queue(txq);
> > > > >         virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
> > > > >         virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
> > > > >
> > > > > --
> > > > > 2.43.0
> > > > >
> > > > >
> > > >
> > > > Thanks
> > > >
> > >
> >
>
Koichiro Den Nov. 28, 2024, 4:25 a.m. UTC | #6
On Thu, Nov 28, 2024 at 10:57:01AM +0800, Jason Wang wrote:
> On Wed, Nov 27, 2024 at 12:08 PM Koichiro Den
> <koichiro.den@canonical.com> wrote:
> >
> > On Wed, Nov 27, 2024 at 11:24:15AM +0800, Jason Wang wrote:
> > > On Tue, Nov 26, 2024 at 12:44 PM Koichiro Den
> > > <koichiro.den@canonical.com> wrote:
> > > >
> > > > On Tue, Nov 26, 2024 at 11:50:17AM +0800, Jason Wang wrote:
> > > > > On Tue, Nov 26, 2024 at 10:42 AM Koichiro Den
> > > > > <koichiro.den@canonical.com> wrote:
> > > > > >
> > > > > > When virtnet_close is followed by virtnet_open, there is a slight chance
> > > > > > that some TX completions remain unconsumed. Those are handled during the
> > > > > > first NAPI poll, but since dql_reset occurs just beforehand, it can lead
> > > > > > to a crash [1].
> > > > > >
> > > > > > This issue can be reproduced by running: `while :; do ip l set DEV down;
> > > > > > ip l set DEV up; done` under heavy network TX load from inside of the
> > > > > > machine.
> > > > > >
> > > > > > To fix this, drain unconsumed TX completions if any before dql_reset,
> > > > > > allowing BQL to start cleanly.
> > > > > >
> > > > > > ------------[ cut here ]------------
> > > > > > kernel BUG at lib/dynamic_queue_limits.c:99!
> > > > > > Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > > > > > CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
> > > > > > Tainted: [N]=TEST
> > > > > > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
> > > > > > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> > > > > > RIP: 0010:dql_completed+0x26b/0x290
> > > > > > Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
> > > > > > 4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
> > > > > > d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
> > > > > > RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
> > > > > > RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
> > > > > > RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
> > > > > > RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
> > > > > > R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
> > > > > > R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
> > > > > > FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
> > > > > > knlGS:0000000000000000
> > > > > > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > > > CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
> > > > > > PKRU: 55555554
> > > > > > Call Trace:
> > > > > >  <IRQ>
> > > > > >  ? die+0x32/0x80
> > > > > >  ? do_trap+0xd9/0x100
> > > > > >  ? dql_completed+0x26b/0x290
> > > > > >  ? dql_completed+0x26b/0x290
> > > > > >  ? do_error_trap+0x6d/0xb0
> > > > > >  ? dql_completed+0x26b/0x290
> > > > > >  ? exc_invalid_op+0x4c/0x60
> > > > > >  ? dql_completed+0x26b/0x290
> > > > > >  ? asm_exc_invalid_op+0x16/0x20
> > > > > >  ? dql_completed+0x26b/0x290
> > > > > >  __free_old_xmit+0xff/0x170 [virtio_net]
> > > > > >  free_old_xmit+0x54/0xc0 [virtio_net]
> > > > > >  virtnet_poll+0xf4/0xe30 [virtio_net]
> > > > > >  ? __update_load_avg_cfs_rq+0x264/0x2d0
> > > > > >  ? update_curr+0x35/0x260
> > > > > >  ? reweight_entity+0x1be/0x260
> > > > > >  __napi_poll.constprop.0+0x28/0x1c0
> > > > > >  net_rx_action+0x329/0x420
> > > > > >  ? enqueue_hrtimer+0x35/0x90
> > > > > >  ? trace_hardirqs_on+0x1d/0x80
> > > > > >  ? kvm_sched_clock_read+0xd/0x20
> > > > > >  ? sched_clock+0xc/0x30
> > > > > >  ? kvm_sched_clock_read+0xd/0x20
> > > > > >  ? sched_clock+0xc/0x30
> > > > > >  ? sched_clock_cpu+0xd/0x1a0
> > > > > >  handle_softirqs+0x138/0x3e0
> > > > > >  do_softirq.part.0+0x89/0xc0
> > > > > >  </IRQ>
> > > > > >  <TASK>
> > > > > >  __local_bh_enable_ip+0xa7/0xb0
> > > > > >  virtnet_open+0xc8/0x310 [virtio_net]
> > > > > >  __dev_open+0xfa/0x1b0
> > > > > >  __dev_change_flags+0x1de/0x250
> > > > > >  dev_change_flags+0x22/0x60
> > > > > >  do_setlink.isra.0+0x2df/0x10b0
> > > > > >  ? rtnetlink_rcv_msg+0x34f/0x3f0
> > > > > >  ? netlink_rcv_skb+0x54/0x100
> > > > > >  ? netlink_unicast+0x23e/0x390
> > > > > >  ? netlink_sendmsg+0x21e/0x490
> > > > > >  ? ____sys_sendmsg+0x31b/0x350
> > > > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > > > >  ? __nla_validate_parse+0x5f/0xee0
> > > > > >  ? __pfx___probestub_irq_enable+0x3/0x10
> > > > > >  ? __create_object+0x5e/0x90
> > > > > >  ? security_capable+0x3b/0x70
> > > > > >  rtnl_newlink+0x784/0xaf0
> > > > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > > > >  ? stack_depot_save_flags+0x24/0x6d0
> > > > > >  ? __pfx_rtnl_newlink+0x10/0x10
> > > > > >  rtnetlink_rcv_msg+0x34f/0x3f0
> > > > > >  ? do_syscall_64+0x6c/0x180
> > > > > >  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > > >  ? __pfx_rtnetlink_rcv_msg+0x10/0x10
> > > > > >  netlink_rcv_skb+0x54/0x100
> > > > > >  netlink_unicast+0x23e/0x390
> > > > > >  netlink_sendmsg+0x21e/0x490
> > > > > >  ____sys_sendmsg+0x31b/0x350
> > > > > >  ? copy_msghdr_from_user+0x6d/0xa0
> > > > > >  ___sys_sendmsg+0x86/0xd0
> > > > > >  ? __pte_offset_map+0x17/0x160
> > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > >  ? __call_rcu_common.constprop.0+0x147/0x610
> > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > >  ? _raw_spin_trylock+0x13/0x60
> > > > > >  ? trace_hardirqs_on+0x1d/0x80
> > > > > >  __sys_sendmsg+0x66/0xc0
> > > > > >  do_syscall_64+0x6c/0x180
> > > > > >  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > > > RIP: 0033:0x7f41defe5b34
> > > > > > Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
> > > > > > f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
> > > > > > f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
> > > > > > RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
> > > > > > RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
> > > > > > RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
> > > > > > RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
> > > > > > R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
> > > > > > R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
> > > > > >  </TASK>
> > > > > > [...]
> > > > > > ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
> > > > > >
> > > > > > Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
> > > > > > Cc: <stable@vger.kernel.org> # v6.11+
> > > > > > Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
> > > > > > ---
> > > > > >  drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
> > > > > >  1 file changed, 29 insertions(+), 8 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > index 64c87bb48a41..3e36c0470600 100644
> > > > > > --- a/drivers/net/virtio_net.c
> > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > @@ -513,7 +513,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> > > > > >                                                struct sk_buff *curr_skb,
> > > > > >                                                struct page *page, void *buf,
> > > > > >                                                int len, int truesize);
> > > > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num);
> > > > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
> > > > > >
> > > > > >  enum virtnet_xmit_type {
> > > > > >         VIRTNET_XMIT_TYPE_SKB,
> > > > > > @@ -580,7 +580,8 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > > > > >  }
> > > > > >
> > > > > >  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > > > -                           bool in_napi, struct virtnet_sq_free_stats *stats)
> > > > > > +                           bool in_napi, struct virtnet_sq_free_stats *stats,
> > > > > > +                           bool drain)
> > > > > >  {
> > > > > >         struct xdp_frame *frame;
> > > > > >         struct sk_buff *skb;
> > > > > > @@ -620,7 +621,8 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > > >                         break;
> > > > > >                 }
> > > > > >         }
> > > > > > -       netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > > > +       if (!drain)
> > > > > > +               netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > > >  }
> > > > > >
> > > > > >  static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > > > @@ -628,10 +630,21 @@ static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > > >                                   bool in_napi,
> > > > > >                                   struct virtnet_sq_free_stats *stats)
> > > > > >  {
> > > > > > -       __free_old_xmit(sq, txq, in_napi, stats);
> > > > > > +       __free_old_xmit(sq, txq, in_napi, stats, false);
> > > > > >
> > > > > >         if (stats->xsk)
> > > > > > -               virtnet_xsk_completed(sq, stats->xsk);
> > > > > > +               virtnet_xsk_completed(sq, stats->xsk, false);
> > > > > > +}
> > > > > > +
> > > > > > +static void virtnet_drain_old_xmit(struct send_queue *sq,
> > > > > > +                                  struct netdev_queue *txq)
> > > > > > +{
> > > > > > +       struct virtnet_sq_free_stats stats = {0};
> > > > > > +
> > > > > > +       __free_old_xmit(sq, txq, false, &stats, true);
> > > > > > +
> > > > > > +       if (stats.xsk)
> > > > > > +               virtnet_xsk_completed(sq, stats.xsk, true);
> > > > > >  }
> > > > >
> > > > > Are we sure this can drain the queue? Note that the device is not stopped.
> > > >
> > > > Thanks for reviewing. netif_tx_wake_queue can be invoked before the "drain"
> > > > point I added e.g. via virtnet_config_changed_work, so it seems that I need
> > > > to ensure it's stopped (DRV_XOFF) before the "drain" and wake it afterwards.
> > > > Please let me know if I’m mistaken.
> > >
> > > Not sure I get you, but I meant we don't reset the device so it can
> >
> > I was wondering whether there would be a scenario where the tx queue is
> > woken up and some new packets from the upper layer reach dql_queued()
> > before the drain point, which also could cause the crash.
> 
> Ok.
> 
> >
> > > keep raising tx interrupts:
> > >
> > > virtnet_drain_old_xmit()
> > > netdev_tx_reset_queue()
> > > skb_xmit_done()
> > > napi_enable()
> > > netdev_tx_completed_queue() // here we might still surprise the bql?
> >
> > Indeed, virtqueue_disable_cb() is needed before the drain point.
> 
> Two problems:
> 
> 1) device/virtqueue is not reset, it can still process the packets
> after virtnet_drain_old_xmit()
> 2) virtqueue_disable_cb() just does its best effort, it can't
> guarantee no interrupt after that.
> 
> To drain TX, the only reliable seems to be:
> 
> 1) reset a virtqueue (or a device)
> 2) drain by using free_old_xmit()
> 3) netif_reset_tx_queue() // btw this seems to be better done in close not open

Thank you for the clarification.
As for 1), I may be missing something but what if VIRTIO_F_RING_RESET is
not supported. A device reset in this context feels a bit excessive to me
(just my two cents though) if in virtnet_open, so as you said, it seems
better done in close in that regard as well.

> 
> Or I wonder if this can be easily fixed by just removing
> netdev_tx_reset_queue()?

Perhaps introducing a variant of dql_reset() that clears all stall history
would suffice? To avoid excessively long stall_max to be possibly recorded.

Thanks

> 
> Thanks
> 
> >
> > Thanks
> >
> > >
> > > Thanks
> > >
> > > >
> > > > >
> > > > > >
> > > > > >  /* Converting between virtqueue no. and kernel tx/rx queue no.
> > > > > > @@ -1499,7 +1512,8 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> > > > > >         /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
> > > > > >          * free_old_xmit().
> > > > > >          */
> > > > > > -       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> > > > > > +       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
> > > > > > +                       &stats, false);
> > > > > >
> > > > > >         if (stats.xsk)
> > > > > >                 xsk_tx_completed(sq->xsk_pool, stats.xsk);
> > > > > > @@ -1556,10 +1570,13 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> > > > > >         return 0;
> > > > > >  }
> > > > > >
> > > > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num)
> > > > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
> > > > > >  {
> > > > > >         xsk_tx_completed(sq->xsk_pool, num);
> > > > > >
> > > > > > +       if (drain)
> > > > > > +               return;
> > > > > > +
> > > > > >         /* If this is called by rx poll, start_xmit and xdp xmit we should
> > > > > >          * wakeup the tx napi to consume the xsk tx queue, because the tx
> > > > > >          * interrupt may not be triggered.
> > > > > > @@ -3041,6 +3058,7 @@ static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > >
> > > > > >  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > >  {
> > > > > > +       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
> > > > > >         struct net_device *dev = vi->dev;
> > > > > >         int err;
> > > > > >
> > > > > > @@ -3054,7 +3072,10 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > >         if (err < 0)
> > > > > >                 goto err_xdp_reg_mem_model;
> > > > > >
> > > > > > -       netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
> > > > > > +       /* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
> > > > > > +       virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
> > > > > > +
> > > > > > +       netdev_tx_reset_queue(txq);
> > > > > >         virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
> > > > > >         virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
> > > > > >
> > > > > > --
> > > > > > 2.43.0
> > > > > >
> > > > > >
> > > > >
> > > > > Thanks
> > > > >
> > > >
> > >
> >
>
Jason Wang Nov. 29, 2024, 2:18 a.m. UTC | #7
On Thu, Nov 28, 2024 at 12:25 PM Koichiro Den
<koichiro.den@canonical.com> wrote:
>
> On Thu, Nov 28, 2024 at 10:57:01AM +0800, Jason Wang wrote:
> > On Wed, Nov 27, 2024 at 12:08 PM Koichiro Den
> > <koichiro.den@canonical.com> wrote:
> > >
> > > On Wed, Nov 27, 2024 at 11:24:15AM +0800, Jason Wang wrote:
> > > > On Tue, Nov 26, 2024 at 12:44 PM Koichiro Den
> > > > <koichiro.den@canonical.com> wrote:
> > > > >
> > > > > On Tue, Nov 26, 2024 at 11:50:17AM +0800, Jason Wang wrote:
> > > > > > On Tue, Nov 26, 2024 at 10:42 AM Koichiro Den
> > > > > > <koichiro.den@canonical.com> wrote:
> > > > > > >
> > > > > > > When virtnet_close is followed by virtnet_open, there is a slight chance
> > > > > > > that some TX completions remain unconsumed. Those are handled during the
> > > > > > > first NAPI poll, but since dql_reset occurs just beforehand, it can lead
> > > > > > > to a crash [1].
> > > > > > >
> > > > > > > This issue can be reproduced by running: `while :; do ip l set DEV down;
> > > > > > > ip l set DEV up; done` under heavy network TX load from inside of the
> > > > > > > machine.
> > > > > > >
> > > > > > > To fix this, drain unconsumed TX completions if any before dql_reset,
> > > > > > > allowing BQL to start cleanly.
> > > > > > >
> > > > > > > ------------[ cut here ]------------
> > > > > > > kernel BUG at lib/dynamic_queue_limits.c:99!
> > > > > > > Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > > > > > > CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
> > > > > > > Tainted: [N]=TEST
> > > > > > > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
> > > > > > > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> > > > > > > RIP: 0010:dql_completed+0x26b/0x290
> > > > > > > Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
> > > > > > > 4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
> > > > > > > d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
> > > > > > > RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
> > > > > > > RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
> > > > > > > RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
> > > > > > > RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
> > > > > > > R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
> > > > > > > R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
> > > > > > > FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
> > > > > > > knlGS:0000000000000000
> > > > > > > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > > > > CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
> > > > > > > PKRU: 55555554
> > > > > > > Call Trace:
> > > > > > >  <IRQ>
> > > > > > >  ? die+0x32/0x80
> > > > > > >  ? do_trap+0xd9/0x100
> > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > >  ? do_error_trap+0x6d/0xb0
> > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > >  ? exc_invalid_op+0x4c/0x60
> > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > >  ? asm_exc_invalid_op+0x16/0x20
> > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > >  __free_old_xmit+0xff/0x170 [virtio_net]
> > > > > > >  free_old_xmit+0x54/0xc0 [virtio_net]
> > > > > > >  virtnet_poll+0xf4/0xe30 [virtio_net]
> > > > > > >  ? __update_load_avg_cfs_rq+0x264/0x2d0
> > > > > > >  ? update_curr+0x35/0x260
> > > > > > >  ? reweight_entity+0x1be/0x260
> > > > > > >  __napi_poll.constprop.0+0x28/0x1c0
> > > > > > >  net_rx_action+0x329/0x420
> > > > > > >  ? enqueue_hrtimer+0x35/0x90
> > > > > > >  ? trace_hardirqs_on+0x1d/0x80
> > > > > > >  ? kvm_sched_clock_read+0xd/0x20
> > > > > > >  ? sched_clock+0xc/0x30
> > > > > > >  ? kvm_sched_clock_read+0xd/0x20
> > > > > > >  ? sched_clock+0xc/0x30
> > > > > > >  ? sched_clock_cpu+0xd/0x1a0
> > > > > > >  handle_softirqs+0x138/0x3e0
> > > > > > >  do_softirq.part.0+0x89/0xc0
> > > > > > >  </IRQ>
> > > > > > >  <TASK>
> > > > > > >  __local_bh_enable_ip+0xa7/0xb0
> > > > > > >  virtnet_open+0xc8/0x310 [virtio_net]
> > > > > > >  __dev_open+0xfa/0x1b0
> > > > > > >  __dev_change_flags+0x1de/0x250
> > > > > > >  dev_change_flags+0x22/0x60
> > > > > > >  do_setlink.isra.0+0x2df/0x10b0
> > > > > > >  ? rtnetlink_rcv_msg+0x34f/0x3f0
> > > > > > >  ? netlink_rcv_skb+0x54/0x100
> > > > > > >  ? netlink_unicast+0x23e/0x390
> > > > > > >  ? netlink_sendmsg+0x21e/0x490
> > > > > > >  ? ____sys_sendmsg+0x31b/0x350
> > > > > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > > > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > > > > >  ? __nla_validate_parse+0x5f/0xee0
> > > > > > >  ? __pfx___probestub_irq_enable+0x3/0x10
> > > > > > >  ? __create_object+0x5e/0x90
> > > > > > >  ? security_capable+0x3b/0x70
> > > > > > >  rtnl_newlink+0x784/0xaf0
> > > > > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > > > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > > > > >  ? stack_depot_save_flags+0x24/0x6d0
> > > > > > >  ? __pfx_rtnl_newlink+0x10/0x10
> > > > > > >  rtnetlink_rcv_msg+0x34f/0x3f0
> > > > > > >  ? do_syscall_64+0x6c/0x180
> > > > > > >  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > > > >  ? __pfx_rtnetlink_rcv_msg+0x10/0x10
> > > > > > >  netlink_rcv_skb+0x54/0x100
> > > > > > >  netlink_unicast+0x23e/0x390
> > > > > > >  netlink_sendmsg+0x21e/0x490
> > > > > > >  ____sys_sendmsg+0x31b/0x350
> > > > > > >  ? copy_msghdr_from_user+0x6d/0xa0
> > > > > > >  ___sys_sendmsg+0x86/0xd0
> > > > > > >  ? __pte_offset_map+0x17/0x160
> > > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > > >  ? __call_rcu_common.constprop.0+0x147/0x610
> > > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > > >  ? _raw_spin_trylock+0x13/0x60
> > > > > > >  ? trace_hardirqs_on+0x1d/0x80
> > > > > > >  __sys_sendmsg+0x66/0xc0
> > > > > > >  do_syscall_64+0x6c/0x180
> > > > > > >  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > > > > RIP: 0033:0x7f41defe5b34
> > > > > > > Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
> > > > > > > f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
> > > > > > > f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
> > > > > > > RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
> > > > > > > RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
> > > > > > > RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
> > > > > > > RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
> > > > > > > R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
> > > > > > > R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
> > > > > > >  </TASK>
> > > > > > > [...]
> > > > > > > ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
> > > > > > >
> > > > > > > Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
> > > > > > > Cc: <stable@vger.kernel.org> # v6.11+
> > > > > > > Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
> > > > > > > ---
> > > > > > >  drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
> > > > > > >  1 file changed, 29 insertions(+), 8 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > index 64c87bb48a41..3e36c0470600 100644
> > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > @@ -513,7 +513,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> > > > > > >                                                struct sk_buff *curr_skb,
> > > > > > >                                                struct page *page, void *buf,
> > > > > > >                                                int len, int truesize);
> > > > > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num);
> > > > > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
> > > > > > >
> > > > > > >  enum virtnet_xmit_type {
> > > > > > >         VIRTNET_XMIT_TYPE_SKB,
> > > > > > > @@ -580,7 +580,8 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > > > > > >  }
> > > > > > >
> > > > > > >  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > > > > -                           bool in_napi, struct virtnet_sq_free_stats *stats)
> > > > > > > +                           bool in_napi, struct virtnet_sq_free_stats *stats,
> > > > > > > +                           bool drain)
> > > > > > >  {
> > > > > > >         struct xdp_frame *frame;
> > > > > > >         struct sk_buff *skb;
> > > > > > > @@ -620,7 +621,8 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > > > >                         break;
> > > > > > >                 }
> > > > > > >         }
> > > > > > > -       netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > > > > +       if (!drain)
> > > > > > > +               netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > > > >  }
> > > > > > >
> > > > > > >  static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > > > > @@ -628,10 +630,21 @@ static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > > > >                                   bool in_napi,
> > > > > > >                                   struct virtnet_sq_free_stats *stats)
> > > > > > >  {
> > > > > > > -       __free_old_xmit(sq, txq, in_napi, stats);
> > > > > > > +       __free_old_xmit(sq, txq, in_napi, stats, false);
> > > > > > >
> > > > > > >         if (stats->xsk)
> > > > > > > -               virtnet_xsk_completed(sq, stats->xsk);
> > > > > > > +               virtnet_xsk_completed(sq, stats->xsk, false);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void virtnet_drain_old_xmit(struct send_queue *sq,
> > > > > > > +                                  struct netdev_queue *txq)
> > > > > > > +{
> > > > > > > +       struct virtnet_sq_free_stats stats = {0};
> > > > > > > +
> > > > > > > +       __free_old_xmit(sq, txq, false, &stats, true);
> > > > > > > +
> > > > > > > +       if (stats.xsk)
> > > > > > > +               virtnet_xsk_completed(sq, stats.xsk, true);
> > > > > > >  }
> > > > > >
> > > > > > Are we sure this can drain the queue? Note that the device is not stopped.
> > > > >
> > > > > Thanks for reviewing. netif_tx_wake_queue can be invoked before the "drain"
> > > > > point I added e.g. via virtnet_config_changed_work, so it seems that I need
> > > > > to ensure it's stopped (DRV_XOFF) before the "drain" and wake it afterwards.
> > > > > Please let me know if I’m mistaken.
> > > >
> > > > Not sure I get you, but I meant we don't reset the device so it can
> > >
> > > I was wondering whether there would be a scenario where the tx queue is
> > > woken up and some new packets from the upper layer reach dql_queued()
> > > before the drain point, which also could cause the crash.
> >
> > Ok.
> >
> > >
> > > > keep raising tx interrupts:
> > > >
> > > > virtnet_drain_old_xmit()
> > > > netdev_tx_reset_queue()
> > > > skb_xmit_done()
> > > > napi_enable()
> > > > netdev_tx_completed_queue() // here we might still surprise the bql?
> > >
> > > Indeed, virtqueue_disable_cb() is needed before the drain point.
> >
> > Two problems:
> >
> > 1) device/virtqueue is not reset, it can still process the packets
> > after virtnet_drain_old_xmit()
> > 2) virtqueue_disable_cb() just does its best effort, it can't
> > guarantee no interrupt after that.
> >
> > To drain TX, the only reliable seems to be:
> >
> > 1) reset a virtqueue (or a device)
> > 2) drain by using free_old_xmit()
> > 3) netif_reset_tx_queue() // btw this seems to be better done in close not open
>
> Thank you for the clarification.
> As for 1), I may be missing something but what if VIRTIO_F_RING_RESET is
> not supported. A device reset in this context feels a bit excessive to me
> (just my two cents though) if in virtnet_open, so as you said, it seems
> better done in close in that regard as well.

Probably, most NIC has a way to stop its TX which is missed in
virtio-net. So we don't have more choices other than device reset when
there's no virtqueue reset.

Or we can limit the BQL to virtqueue reset, AF_XDP used to suffer from
the exact issue when there's no virtqueue reset. That's why virtqueue
reset is invented and AF_XDP is limited to virtqueue reset.

>
> >
> > Or I wonder if this can be easily fixed by just removing
> > netdev_tx_reset_queue()?
>
> Perhaps introducing a variant of dql_reset() that clears all stall history
> would suffice? To avoid excessively long stall_max to be possibly recorded.

Not sure, it looks like the problem still, bql might still be
surprised when it might get completed tx packets when it think it
hasn't sent any?

Thanks

>
> Thanks
>
> >
> > Thanks
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > >  /* Converting between virtqueue no. and kernel tx/rx queue no.
> > > > > > > @@ -1499,7 +1512,8 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> > > > > > >         /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
> > > > > > >          * free_old_xmit().
> > > > > > >          */
> > > > > > > -       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> > > > > > > +       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
> > > > > > > +                       &stats, false);
> > > > > > >
> > > > > > >         if (stats.xsk)
> > > > > > >                 xsk_tx_completed(sq->xsk_pool, stats.xsk);
> > > > > > > @@ -1556,10 +1570,13 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> > > > > > >         return 0;
> > > > > > >  }
> > > > > > >
> > > > > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num)
> > > > > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
> > > > > > >  {
> > > > > > >         xsk_tx_completed(sq->xsk_pool, num);
> > > > > > >
> > > > > > > +       if (drain)
> > > > > > > +               return;
> > > > > > > +
> > > > > > >         /* If this is called by rx poll, start_xmit and xdp xmit we should
> > > > > > >          * wakeup the tx napi to consume the xsk tx queue, because the tx
> > > > > > >          * interrupt may not be triggered.
> > > > > > > @@ -3041,6 +3058,7 @@ static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > >
> > > > > > >  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > >  {
> > > > > > > +       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
> > > > > > >         struct net_device *dev = vi->dev;
> > > > > > >         int err;
> > > > > > >
> > > > > > > @@ -3054,7 +3072,10 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > >         if (err < 0)
> > > > > > >                 goto err_xdp_reg_mem_model;
> > > > > > >
> > > > > > > -       netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
> > > > > > > +       /* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
> > > > > > > +       virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
> > > > > > > +
> > > > > > > +       netdev_tx_reset_queue(txq);
> > > > > > >         virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
> > > > > > >         virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
> > > > > > >
> > > > > > > --
> > > > > > > 2.43.0
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > >
> > > >
> > >
> >
>
Koichiro Den Nov. 30, 2024, 3:17 p.m. UTC | #8
On Fri, Nov 29, 2024 at 10:18:04AM +0800, Jason Wang wrote:
> On Thu, Nov 28, 2024 at 12:25 PM Koichiro Den
> <koichiro.den@canonical.com> wrote:
> >
> > On Thu, Nov 28, 2024 at 10:57:01AM +0800, Jason Wang wrote:
> > > On Wed, Nov 27, 2024 at 12:08 PM Koichiro Den
> > > <koichiro.den@canonical.com> wrote:
> > > >
> > > > On Wed, Nov 27, 2024 at 11:24:15AM +0800, Jason Wang wrote:
> > > > > On Tue, Nov 26, 2024 at 12:44 PM Koichiro Den
> > > > > <koichiro.den@canonical.com> wrote:
> > > > > >
> > > > > > On Tue, Nov 26, 2024 at 11:50:17AM +0800, Jason Wang wrote:
> > > > > > > On Tue, Nov 26, 2024 at 10:42 AM Koichiro Den
> > > > > > > <koichiro.den@canonical.com> wrote:
> > > > > > > >
> > > > > > > > When virtnet_close is followed by virtnet_open, there is a slight chance
> > > > > > > > that some TX completions remain unconsumed. Those are handled during the
> > > > > > > > first NAPI poll, but since dql_reset occurs just beforehand, it can lead
> > > > > > > > to a crash [1].
> > > > > > > >
> > > > > > > > This issue can be reproduced by running: `while :; do ip l set DEV down;
> > > > > > > > ip l set DEV up; done` under heavy network TX load from inside of the
> > > > > > > > machine.
> > > > > > > >
> > > > > > > > To fix this, drain unconsumed TX completions if any before dql_reset,
> > > > > > > > allowing BQL to start cleanly.
> > > > > > > >
> > > > > > > > ------------[ cut here ]------------
> > > > > > > > kernel BUG at lib/dynamic_queue_limits.c:99!
> > > > > > > > Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > > > > > > > CPU: 7 UID: 0 PID: 1598 Comm: ip Tainted: G    N 6.12.0net-next_main+ #2
> > > > > > > > Tainted: [N]=TEST
> > > > > > > > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), \
> > > > > > > > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> > > > > > > > RIP: 0010:dql_completed+0x26b/0x290
> > > > > > > > Code: b7 c2 49 89 e9 44 89 da 89 c6 4c 89 d7 e8 ed 17 47 00 58 65 ff 0d
> > > > > > > > 4d 27 90 7e 0f 85 fd fe ff ff e8 ea 53 8d ff e9 f3 fe ff ff <0f> 0b 01
> > > > > > > > d2 44 89 d1 29 d1 ba 00 00 00 00 0f 48 ca e9 28 ff ff ff
> > > > > > > > RSP: 0018:ffffc900002b0d08 EFLAGS: 00010297
> > > > > > > > RAX: 0000000000000000 RBX: ffff888102398c80 RCX: 0000000080190009
> > > > > > > > RDX: 0000000000000000 RSI: 000000000000006a RDI: 0000000000000000
> > > > > > > > RBP: ffff888102398c00 R08: 0000000000000000 R09: 0000000000000000
> > > > > > > > R10: 00000000000000ca R11: 0000000000015681 R12: 0000000000000001
> > > > > > > > R13: ffffc900002b0d68 R14: ffff88811115e000 R15: ffff8881107aca40
> > > > > > > > FS:  00007f41ded69500(0000) GS:ffff888667dc0000(0000)
> > > > > > > > knlGS:0000000000000000
> > > > > > > > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > > > > > CR2: 0000556ccc2dc1a0 CR3: 0000000104fd8003 CR4: 0000000000772ef0
> > > > > > > > PKRU: 55555554
> > > > > > > > Call Trace:
> > > > > > > >  <IRQ>
> > > > > > > >  ? die+0x32/0x80
> > > > > > > >  ? do_trap+0xd9/0x100
> > > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > > >  ? do_error_trap+0x6d/0xb0
> > > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > > >  ? exc_invalid_op+0x4c/0x60
> > > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > > >  ? asm_exc_invalid_op+0x16/0x20
> > > > > > > >  ? dql_completed+0x26b/0x290
> > > > > > > >  __free_old_xmit+0xff/0x170 [virtio_net]
> > > > > > > >  free_old_xmit+0x54/0xc0 [virtio_net]
> > > > > > > >  virtnet_poll+0xf4/0xe30 [virtio_net]
> > > > > > > >  ? __update_load_avg_cfs_rq+0x264/0x2d0
> > > > > > > >  ? update_curr+0x35/0x260
> > > > > > > >  ? reweight_entity+0x1be/0x260
> > > > > > > >  __napi_poll.constprop.0+0x28/0x1c0
> > > > > > > >  net_rx_action+0x329/0x420
> > > > > > > >  ? enqueue_hrtimer+0x35/0x90
> > > > > > > >  ? trace_hardirqs_on+0x1d/0x80
> > > > > > > >  ? kvm_sched_clock_read+0xd/0x20
> > > > > > > >  ? sched_clock+0xc/0x30
> > > > > > > >  ? kvm_sched_clock_read+0xd/0x20
> > > > > > > >  ? sched_clock+0xc/0x30
> > > > > > > >  ? sched_clock_cpu+0xd/0x1a0
> > > > > > > >  handle_softirqs+0x138/0x3e0
> > > > > > > >  do_softirq.part.0+0x89/0xc0
> > > > > > > >  </IRQ>
> > > > > > > >  <TASK>
> > > > > > > >  __local_bh_enable_ip+0xa7/0xb0
> > > > > > > >  virtnet_open+0xc8/0x310 [virtio_net]
> > > > > > > >  __dev_open+0xfa/0x1b0
> > > > > > > >  __dev_change_flags+0x1de/0x250
> > > > > > > >  dev_change_flags+0x22/0x60
> > > > > > > >  do_setlink.isra.0+0x2df/0x10b0
> > > > > > > >  ? rtnetlink_rcv_msg+0x34f/0x3f0
> > > > > > > >  ? netlink_rcv_skb+0x54/0x100
> > > > > > > >  ? netlink_unicast+0x23e/0x390
> > > > > > > >  ? netlink_sendmsg+0x21e/0x490
> > > > > > > >  ? ____sys_sendmsg+0x31b/0x350
> > > > > > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > > > > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > > > > > >  ? __nla_validate_parse+0x5f/0xee0
> > > > > > > >  ? __pfx___probestub_irq_enable+0x3/0x10
> > > > > > > >  ? __create_object+0x5e/0x90
> > > > > > > >  ? security_capable+0x3b/0x70
> > > > > > > >  rtnl_newlink+0x784/0xaf0
> > > > > > > >  ? avc_has_perm_noaudit+0x67/0xf0
> > > > > > > >  ? cred_has_capability.isra.0+0x75/0x110
> > > > > > > >  ? stack_depot_save_flags+0x24/0x6d0
> > > > > > > >  ? __pfx_rtnl_newlink+0x10/0x10
> > > > > > > >  rtnetlink_rcv_msg+0x34f/0x3f0
> > > > > > > >  ? do_syscall_64+0x6c/0x180
> > > > > > > >  ? entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > > > > >  ? __pfx_rtnetlink_rcv_msg+0x10/0x10
> > > > > > > >  netlink_rcv_skb+0x54/0x100
> > > > > > > >  netlink_unicast+0x23e/0x390
> > > > > > > >  netlink_sendmsg+0x21e/0x490
> > > > > > > >  ____sys_sendmsg+0x31b/0x350
> > > > > > > >  ? copy_msghdr_from_user+0x6d/0xa0
> > > > > > > >  ___sys_sendmsg+0x86/0xd0
> > > > > > > >  ? __pte_offset_map+0x17/0x160
> > > > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > > > >  ? __call_rcu_common.constprop.0+0x147/0x610
> > > > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > > > >  ? preempt_count_add+0x69/0xa0
> > > > > > > >  ? _raw_spin_trylock+0x13/0x60
> > > > > > > >  ? trace_hardirqs_on+0x1d/0x80
> > > > > > > >  __sys_sendmsg+0x66/0xc0
> > > > > > > >  do_syscall_64+0x6c/0x180
> > > > > > > >  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > > > > > RIP: 0033:0x7f41defe5b34
> > > > > > > > Code: 15 e1 12 0f 00 f7 d8 64 89 02 b8 ff ff ff ff eb bf 0f 1f 44 00 00
> > > > > > > > f3 0f 1e fa 80 3d 35 95 0f 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00
> > > > > > > > f0 ff ff 77 4c c3 0f 1f 00 55 48 89 e5 48 83 ec 20 89 55
> > > > > > > > RSP: 002b:00007ffe5336ecc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
> > > > > > > > RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f41defe5b34
> > > > > > > > RDX: 0000000000000000 RSI: 00007ffe5336ed30 RDI: 0000000000000003
> > > > > > > > RBP: 00007ffe5336eda0 R08: 0000000000000010 R09: 0000000000000001
> > > > > > > > R10: 00007ffe5336f6f9 R11: 0000000000000202 R12: 0000000000000003
> > > > > > > > R13: 0000000067452259 R14: 0000556ccc28b040 R15: 0000000000000000
> > > > > > > >  </TASK>
> > > > > > > > [...]
> > > > > > > > ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
> > > > > > > >
> > > > > > > > Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits")
> > > > > > > > Cc: <stable@vger.kernel.org> # v6.11+
> > > > > > > > Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
> > > > > > > > ---
> > > > > > > >  drivers/net/virtio_net.c | 37 +++++++++++++++++++++++++++++--------
> > > > > > > >  1 file changed, 29 insertions(+), 8 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > index 64c87bb48a41..3e36c0470600 100644
> > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > @@ -513,7 +513,7 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
> > > > > > > >                                                struct sk_buff *curr_skb,
> > > > > > > >                                                struct page *page, void *buf,
> > > > > > > >                                                int len, int truesize);
> > > > > > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num);
> > > > > > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
> > > > > > > >
> > > > > > > >  enum virtnet_xmit_type {
> > > > > > > >         VIRTNET_XMIT_TYPE_SKB,
> > > > > > > > @@ -580,7 +580,8 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > > > > > -                           bool in_napi, struct virtnet_sq_free_stats *stats)
> > > > > > > > +                           bool in_napi, struct virtnet_sq_free_stats *stats,
> > > > > > > > +                           bool drain)
> > > > > > > >  {
> > > > > > > >         struct xdp_frame *frame;
> > > > > > > >         struct sk_buff *skb;
> > > > > > > > @@ -620,7 +621,8 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> > > > > > > >                         break;
> > > > > > > >                 }
> > > > > > > >         }
> > > > > > > > -       netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > > > > > +       if (!drain)
> > > > > > > > +               netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > > > > > @@ -628,10 +630,21 @@ static void virtnet_free_old_xmit(struct send_queue *sq,
> > > > > > > >                                   bool in_napi,
> > > > > > > >                                   struct virtnet_sq_free_stats *stats)
> > > > > > > >  {
> > > > > > > > -       __free_old_xmit(sq, txq, in_napi, stats);
> > > > > > > > +       __free_old_xmit(sq, txq, in_napi, stats, false);
> > > > > > > >
> > > > > > > >         if (stats->xsk)
> > > > > > > > -               virtnet_xsk_completed(sq, stats->xsk);
> > > > > > > > +               virtnet_xsk_completed(sq, stats->xsk, false);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void virtnet_drain_old_xmit(struct send_queue *sq,
> > > > > > > > +                                  struct netdev_queue *txq)
> > > > > > > > +{
> > > > > > > > +       struct virtnet_sq_free_stats stats = {0};
> > > > > > > > +
> > > > > > > > +       __free_old_xmit(sq, txq, false, &stats, true);
> > > > > > > > +
> > > > > > > > +       if (stats.xsk)
> > > > > > > > +               virtnet_xsk_completed(sq, stats.xsk, true);
> > > > > > > >  }
> > > > > > >
> > > > > > > Are we sure this can drain the queue? Note that the device is not stopped.
> > > > > >
> > > > > > Thanks for reviewing. netif_tx_wake_queue can be invoked before the "drain"
> > > > > > point I added e.g. via virtnet_config_changed_work, so it seems that I need
> > > > > > to ensure it's stopped (DRV_XOFF) before the "drain" and wake it afterwards.
> > > > > > Please let me know if I’m mistaken.
> > > > >
> > > > > Not sure I get you, but I meant we don't reset the device so it can
> > > >
> > > > I was wondering whether there would be a scenario where the tx queue is
> > > > woken up and some new packets from the upper layer reach dql_queued()
> > > > before the drain point, which also could cause the crash.
> > >
> > > Ok.
> > >
> > > >
> > > > > keep raising tx interrupts:
> > > > >
> > > > > virtnet_drain_old_xmit()
> > > > > netdev_tx_reset_queue()
> > > > > skb_xmit_done()
> > > > > napi_enable()
> > > > > netdev_tx_completed_queue() // here we might still surprise the bql?
> > > >
> > > > Indeed, virtqueue_disable_cb() is needed before the drain point.
> > >
> > > Two problems:
> > >
> > > 1) device/virtqueue is not reset, it can still process the packets
> > > after virtnet_drain_old_xmit()
> > > 2) virtqueue_disable_cb() just does its best effort, it can't
> > > guarantee no interrupt after that.
> > >
> > > To drain TX, the only reliable seems to be:
> > >
> > > 1) reset a virtqueue (or a device)
> > > 2) drain by using free_old_xmit()
> > > 3) netif_reset_tx_queue() // btw this seems to be better done in close not open
> >
> > Thank you for the clarification.
> > As for 1), I may be missing something but what if VIRTIO_F_RING_RESET is
> > not supported. A device reset in this context feels a bit excessive to me
> > (just my two cents though) if in virtnet_open, so as you said, it seems
> > better done in close in that regard as well.
> 
> Probably, most NIC has a way to stop its TX which is missed in
> virtio-net. So we don't have more choices other than device reset when
> there's no virtqueue reset.
> 
> Or we can limit the BQL to virtqueue reset, AF_XDP used to suffer from
> the exact issue when there's no virtqueue reset. That's why virtqueue
> reset is invented and AF_XDP is limited to virtqueue reset.

Thank you for the response.

Hmm, if we do a device reset, to me it seems better to (re-)initialize the
device in a sane way (i.e. ACKNOWLEDGE->DRIVER->...->DRIVER_OK) here as
well, to avoid potentially surprising the hyperviour side, IIUC. That would
essentially make it nearly equivalent to the current freeze/restore
handling. However, performing such a reset in open or close path would
likely require a fundamental restructuring/redefinition of the open/close
semantics for virtio-net, right? Otherwise I guess it would be a dirty hack
solely for this BQL issue. Limitting the BQL to virtqueue reset sounds
nicer, but I'm now inclined to your suggestion to simply drop
netdev_tx_reset_queue(), as mentioned in my response below.

> 
> >
> > >
> > > Or I wonder if this can be easily fixed by just removing
> > > netdev_tx_reset_queue()?
> >
> > Perhaps introducing a variant of dql_reset() that clears all stall history
> > would suffice? To avoid excessively long stall_max to be possibly recorded.
> 
> Not sure, it looks like the problem still, bql might still be
> surprised when it might get completed tx packets when it think it
> hasn't sent any?

I meant an API that would clear only stall_max, without requiring users to
do so in such cases. That said, I just noticed that commit message of
6025b9135f7a ("net: dqs: add NIC stall detector based on BQL") mentions
that "this mechanism does not ignore stalls caused by Tx being disabled, or
loss of link", primarily for simplicity's sake. Given this, simply dropping
netdev_tx_reset_queue() seems like a streamlined approach. I'll resend a
patch that does so. If I'm missing something or mistaken, please let me
know.

Thanks

> 
> Thanks
> 
> >
> > Thanks
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > >  /* Converting between virtqueue no. and kernel tx/rx queue no.
> > > > > > > > @@ -1499,7 +1512,8 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> > > > > > > >         /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
> > > > > > > >          * free_old_xmit().
> > > > > > > >          */
> > > > > > > > -       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
> > > > > > > > +       __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
> > > > > > > > +                       &stats, false);
> > > > > > > >
> > > > > > > >         if (stats.xsk)
> > > > > > > >                 xsk_tx_completed(sq->xsk_pool, stats.xsk);
> > > > > > > > @@ -1556,10 +1570,13 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> > > > > > > >         return 0;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > -static void virtnet_xsk_completed(struct send_queue *sq, int num)
> > > > > > > > +static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
> > > > > > > >  {
> > > > > > > >         xsk_tx_completed(sq->xsk_pool, num);
> > > > > > > >
> > > > > > > > +       if (drain)
> > > > > > > > +               return;
> > > > > > > > +
> > > > > > > >         /* If this is called by rx poll, start_xmit and xdp xmit we should
> > > > > > > >          * wakeup the tx napi to consume the xsk tx queue, because the tx
> > > > > > > >          * interrupt may not be triggered.
> > > > > > > > @@ -3041,6 +3058,7 @@ static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > > >
> > > > > > > >  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > > >  {
> > > > > > > > +       struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
> > > > > > > >         struct net_device *dev = vi->dev;
> > > > > > > >         int err;
> > > > > > > >
> > > > > > > > @@ -3054,7 +3072,10 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > > >         if (err < 0)
> > > > > > > >                 goto err_xdp_reg_mem_model;
> > > > > > > >
> > > > > > > > -       netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
> > > > > > > > +       /* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
> > > > > > > > +       virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
> > > > > > > > +
> > > > > > > > +       netdev_tx_reset_queue(txq);
> > > > > > > >         virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
> > > > > > > >         virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
> > > > > > > >
> > > > > > > > --
> > > > > > > > 2.43.0
> > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 64c87bb48a41..3e36c0470600 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -513,7 +513,7 @@  static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
 					       struct sk_buff *curr_skb,
 					       struct page *page, void *buf,
 					       int len, int truesize);
-static void virtnet_xsk_completed(struct send_queue *sq, int num);
+static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain);
 
 enum virtnet_xmit_type {
 	VIRTNET_XMIT_TYPE_SKB,
@@ -580,7 +580,8 @@  static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
 }
 
 static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
-			    bool in_napi, struct virtnet_sq_free_stats *stats)
+			    bool in_napi, struct virtnet_sq_free_stats *stats,
+			    bool drain)
 {
 	struct xdp_frame *frame;
 	struct sk_buff *skb;
@@ -620,7 +621,8 @@  static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
 			break;
 		}
 	}
-	netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
+	if (!drain)
+		netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
 }
 
 static void virtnet_free_old_xmit(struct send_queue *sq,
@@ -628,10 +630,21 @@  static void virtnet_free_old_xmit(struct send_queue *sq,
 				  bool in_napi,
 				  struct virtnet_sq_free_stats *stats)
 {
-	__free_old_xmit(sq, txq, in_napi, stats);
+	__free_old_xmit(sq, txq, in_napi, stats, false);
 
 	if (stats->xsk)
-		virtnet_xsk_completed(sq, stats->xsk);
+		virtnet_xsk_completed(sq, stats->xsk, false);
+}
+
+static void virtnet_drain_old_xmit(struct send_queue *sq,
+				   struct netdev_queue *txq)
+{
+	struct virtnet_sq_free_stats stats = {0};
+
+	__free_old_xmit(sq, txq, false, &stats, true);
+
+	if (stats.xsk)
+		virtnet_xsk_completed(sq, stats.xsk, true);
 }
 
 /* Converting between virtqueue no. and kernel tx/rx queue no.
@@ -1499,7 +1512,8 @@  static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
 	/* Avoid to wakeup napi meanless, so call __free_old_xmit instead of
 	 * free_old_xmit().
 	 */
-	__free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
+	__free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true,
+			&stats, false);
 
 	if (stats.xsk)
 		xsk_tx_completed(sq->xsk_pool, stats.xsk);
@@ -1556,10 +1570,13 @@  static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
 	return 0;
 }
 
-static void virtnet_xsk_completed(struct send_queue *sq, int num)
+static void virtnet_xsk_completed(struct send_queue *sq, int num, bool drain)
 {
 	xsk_tx_completed(sq->xsk_pool, num);
 
+	if (drain)
+		return;
+
 	/* If this is called by rx poll, start_xmit and xdp xmit we should
 	 * wakeup the tx napi to consume the xsk tx queue, because the tx
 	 * interrupt may not be triggered.
@@ -3041,6 +3058,7 @@  static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
 
 static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
 {
+	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, qp_index);
 	struct net_device *dev = vi->dev;
 	int err;
 
@@ -3054,7 +3072,10 @@  static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
 	if (err < 0)
 		goto err_xdp_reg_mem_model;
 
-	netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
+	/* Drain any unconsumed TX skbs transmitted before the last virtnet_close */
+	virtnet_drain_old_xmit(&vi->sq[qp_index], txq);
+
+	netdev_tx_reset_queue(txq);
 	virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
 	virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);