diff mbox

ipoib: clean ib tx ring periodically

Message ID 589591340739f0ceeea9ca449b6de3df01caadc4.1487259121.git.pabeni@redhat.com (mailing list archive)
State Deferred
Headers show

Commit Message

Paolo Abeni Feb. 16, 2017, 3:35 p.m. UTC
The skbs transmitted via ipoib_send() are freed only if there are
16 or more outstanding work requests or if the send queue is full.

If there is very little networking activity, the transmitted skbs
can be held by the device driver for an unlimited amount of time,
starving other subsystems.

E.g. assuming the ipv6 is enabled, with the following sequence:

systemctl start firewalld
modprobe ib_ipoib
ip addr add dev ib0 fc00::1/64
systemctl stop firewalld

a cpu will hang: rmmod conntrack will keep a core busy
spinning for nf_conntrack_untracked going to 0, since some ICMP6
ND packets are generated and transmitted when the ipv6 address
is attached to the device, and such packets get a notrack ct
entry.

This change address the issue introducing a periodic timer performing
"garbage collection" on the send ring at low frequency (once every
second).

This new timer runs independently from the currently used poll_timer,
so that no additional delay is introduced to clean the ring after
errors or ring full event.

Reported-by: Thomas Cameron <tcameron@redhat.com>
Fixes: f56bcd801356 ("IPoIB: Use separate CQ for UD send completions")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h    |  2 ++
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 35 +++++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 6 deletions(-)

Comments

Leon Romanovsky Feb. 16, 2017, 5:09 p.m. UTC | #1
On Thu, Feb 16, 2017 at 04:35:31PM +0100, Paolo Abeni wrote:
> The skbs transmitted via ipoib_send() are freed only if there are
> 16 or more outstanding work requests or if the send queue is full.
>
> If there is very little networking activity, the transmitted skbs
> can be held by the device driver for an unlimited amount of time,
> starving other subsystems.
>
> E.g. assuming the ipv6 is enabled, with the following sequence:
>
> systemctl start firewalld
> modprobe ib_ipoib
> ip addr add dev ib0 fc00::1/64
> systemctl stop firewalld
>
> a cpu will hang: rmmod conntrack will keep a core busy
> spinning for nf_conntrack_untracked going to 0, since some ICMP6
> ND packets are generated and transmitted when the ipv6 address
> is attached to the device, and such packets get a notrack ct
> entry.
>
> This change address the issue introducing a periodic timer performing
> "garbage collection" on the send ring at low frequency (once every
> second).
>
> This new timer runs independently from the currently used poll_timer,
> so that no additional delay is introduced to clean the ring after
> errors or ring full event.
>
> Reported-by: Thomas Cameron <tcameron@redhat.com>
> Fixes: f56bcd801356 ("IPoIB: Use separate CQ for UD send completions")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib.h    |  2 ++
>  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 35 +++++++++++++++++++++++++++------
>  2 files changed, 31 insertions(+), 6 deletions(-)
>

Hi Paolo,

This patch crashes in our verification system on Mellanox CX3 HCA.

[  105.569758] console [netcon0] enabled
[  105.569801] netconsole: network logging started
[  105.570316] sysrq: SysRq : Changing Loglevel
[  105.570393] sysrq: Loglevel set to 8
[  127.303248] IPv6: ADDRCONF(NETDEV_UP): ib0: link is not ready
[  135.501051] BUG: unable to handle kernel NULL pointer dereference at           (null)
[  135.501182] IP:           (null)
[  135.501231] PGD 34d5e067
[  135.501231] PUD 34eae067
[  135.501311] PMD 0
[  135.501351]
[  135.501431] Oops: 0010 [#1] SMP
[  135.501491] Modules linked in: netconsole nfsv3 nfs fscache rdma_ucm ib_ucm
rdma_cm iw_cm ib_ipoib ib_cm ib_uverbs ib_umad mlx4_en ptp pps_core ib_core
dm_mirror dm_region_hash dm_log dm_mod ppdev pcspkr nfsd parport_pc parport
i2c_piix4 acpi_cpufreq auth_rpcgss nfs_acl lockd grace sunrpc ip_tables
ata_generic cirrus drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops
ttm serio_raw pata_acpi drm virtio_console mlx4_core e1000 devlink i2c_core
floppy ata_piix [last unloaded: mlx4_ib]
[  135.501812] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.10.0-rc8-2017-02-16_18-01-48_Paolo_Abeni__pabeni_redhat_com #1
[  135.501895] Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011
[  135.501985] task: ffff88013a429480 task.stack: ffffc90000694000
[  135.502057] RIP: 0010:          (null)
[  135.502106] RSP: 0018:ffff88013fcc3e28 EFLAGS: 00010286
[  135.502155] RAX: ffff880138dd0c00 RBX: ffff880138bac900 RCX: 00000000fffd7780
[  135.502221] RDX: ffff880138bace88 RSI: 0000000000000010 RDI: ffff880138dd1400
[  135.502285] RBP: ffff88013fcc3e78 R08: ffff88013fcd0630 R09: ffff88013fcc3ef8
[  135.502357] R10: 0000000000000200 R11: 0000000000000020 R12: ffff880138bace88
[  135.502423] R13: 0000000000000001 R14: 0000000000000003 R15: ffff880138bac000
[  135.502487] FS:  0000000000000000(0000) GS:ffff88013fcc0000(0000) knlGS:0000000000000000
[  135.502558] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  135.502615] CR2: 0000000000000000 CR3: 000000008d77b000 CR4: 00000000000006e0
[  135.502675] Call Trace:
[  135.502711]  <IRQ>
[  135.502766]  ? poll_tx+0x39/0x250 [ib_ipoib]
[  135.502821]  ? cpu_load_update+0xdc/0x150
[  135.502863]  ipoib_tx_gc_timer_func+0x115/0x130 [ib_ipoib]
[  135.502910]  ? poll_tx+0x250/0x250 [ib_ipoib]
[  135.502951]  call_timer_fn+0x35/0x140
[  135.502994]  run_timer_softirq+0x1d7/0x460
[  135.503034]  ? kvm_sched_clock_read+0x1e/0x30
[  135.504133]  ? sched_clock+0x9/0x10
[  135.505240]  ? sched_clock_cpu+0x72/0xa0
[  135.506293]  __do_softirq+0xd7/0x2a8
[  135.507328]  irq_exit+0xb5/0xc0
[  135.508339]  smp_apic_timer_interrupt+0x3d/0x50
[  135.509342]  apic_timer_interrupt+0x89/0x90
[  135.510346] RIP: 0010:native_safe_halt+0x6/0x10
[  135.511360] RSP: 0018:ffffc90000697ea0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10
[  135.512405] RAX: 0000000000000000 RBX: ffff88013a429480 RCX: 0000000000000000
[  135.513457] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[  135.514496] RBP: ffffc90000697ea0 R08: 00000000d5555b28 R09: 0000000000000000
[  135.515535] R10: 0000000000000200 R11: ffffc900022b7ef0 R12: 0000000000000003
[  135.516579] R13: ffff88013a429480 R14: 0000000000000000 R15: 0000000000000000
[  135.517620]  </IRQ>
[  135.518650]  default_idle+0x1e/0xd0
[  135.519672]  arch_cpu_idle+0xf/0x20
[  135.520673]  default_idle_call+0x35/0x40
[  135.521678]  do_idle+0x15b/0x200
[  135.522687]  cpu_startup_entry+0x1d/0x30
[  135.523706]  start_secondary+0x103/0x130
[  135.524734]  start_cpu+0x14/0x14
[  135.525793] Code:  Bad RIP value.
[  135.526798] RIP:           (null) RSP: ffff88013fcc3e28
[  135.527767] CR2: 0000000000000000
[  135.528727] ---[ end trace 8acb66738f095ba7 ]---
[  135.529639] Kernel panic - not syncing: Fatal exception in interrupt
[  135.531590] Kernel Offset: disabled
[  135.532496] ---[ end Kernel panic - not syncing: Fatal exception in interrupt

Thanks
Paolo Abeni Feb. 16, 2017, 5:47 p.m. UTC | #2
On Thu, 2017-02-16 at 19:09 +0200, Leon Romanovsky wrote:
> On Thu, Feb 16, 2017 at 04:35:31PM +0100, Paolo Abeni wrote:
> > The skbs transmitted via ipoib_send() are freed only if there are
> > 16 or more outstanding work requests or if the send queue is full.
> > 
> > If there is very little networking activity, the transmitted skbs
> > can be held by the device driver for an unlimited amount of time,
> > starving other subsystems.
> > 
> > E.g. assuming the ipv6 is enabled, with the following sequence:
> > 
> > systemctl start firewalld
> > modprobe ib_ipoib
> > ip addr add dev ib0 fc00::1/64
> > systemctl stop firewalld
> > 
> > a cpu will hang: rmmod conntrack will keep a core busy
> > spinning for nf_conntrack_untracked going to 0, since some ICMP6
> > ND packets are generated and transmitted when the ipv6 address
> > is attached to the device, and such packets get a notrack ct
> > entry.
> > 
> > This change address the issue introducing a periodic timer performing
> > "garbage collection" on the send ring at low frequency (once every
> > second).
> > 
> > This new timer runs independently from the currently used poll_timer,
> > so that no additional delay is introduced to clean the ring after
> > errors or ring full event.
> > 
> > Reported-by: Thomas Cameron <tcameron@redhat.com>
> > Fixes: f56bcd801356 ("IPoIB: Use separate CQ for UD send completions")
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> >  drivers/infiniband/ulp/ipoib/ipoib.h    |  2 ++
> >  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 35 +++++++++++++++++++++++++++------
> >  2 files changed, 31 insertions(+), 6 deletions(-)
> > 
> 
> Hi Paolo,
> 
> This patch crashes in our verification system on Mellanox CX3 HCA.
> 
> [  105.569758] console [netcon0] enabled
> [  105.569801] netconsole: network logging started
> [  105.570316] sysrq: SysRq : Changing Loglevel
> [  105.570393] sysrq: Loglevel set to 8
> [  127.303248] IPv6: ADDRCONF(NETDEV_UP): ib0: link is not ready
> [  135.501051] BUG: unable to handle kernel NULL pointer dereference at           (null)
> [  135.501182] IP:           (null)
> [  135.501231] PGD 34d5e067
> [  135.501231] PUD 34eae067
> [  135.501311] PMD 0
> [  135.501351]
> [  135.501431] Oops: 0010 [#1] SMP
> [  135.501491] Modules linked in: netconsole nfsv3 nfs fscache rdma_ucm ib_ucm
> rdma_cm iw_cm ib_ipoib ib_cm ib_uverbs ib_umad mlx4_en ptp pps_core ib_core
> dm_mirror dm_region_hash dm_log dm_mod ppdev pcspkr nfsd parport_pc parport
> i2c_piix4 acpi_cpufreq auth_rpcgss nfs_acl lockd grace sunrpc ip_tables
> ata_generic cirrus drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops
> ttm serio_raw pata_acpi drm virtio_console mlx4_core e1000 devlink i2c_core
> floppy ata_piix [last unloaded: mlx4_ib]
> [  135.501812] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.10.0-rc8-2017-02-16_18-01-48_Paolo_Abeni__pabeni_redhat_com #1
> [  135.501895] Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011
> [  135.501985] task: ffff88013a429480 task.stack: ffffc90000694000
> [  135.502057] RIP: 0010:          (null)
> [  135.502106] RSP: 0018:ffff88013fcc3e28 EFLAGS: 00010286
> [  135.502155] RAX: ffff880138dd0c00 RBX: ffff880138bac900 RCX: 00000000fffd7780
> [  135.502221] RDX: ffff880138bace88 RSI: 0000000000000010 RDI: ffff880138dd1400
> [  135.502285] RBP: ffff88013fcc3e78 R08: ffff88013fcd0630 R09: ffff88013fcc3ef8
> [  135.502357] R10: 0000000000000200 R11: 0000000000000020 R12: ffff880138bace88
> [  135.502423] R13: 0000000000000001 R14: 0000000000000003 R15: ffff880138bac000
> [  135.502487] FS:  0000000000000000(0000) GS:ffff88013fcc0000(0000) knlGS:0000000000000000
> [  135.502558] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  135.502615] CR2: 0000000000000000 CR3: 000000008d77b000 CR4: 00000000000006e0
> [  135.502675] Call Trace:
> [  135.502711]  <IRQ>
> [  135.502766]  ? poll_tx+0x39/0x250 [ib_ipoib]
> [  135.502821]  ? cpu_load_update+0xdc/0x150
> [  135.502863]  ipoib_tx_gc_timer_func+0x115/0x130 [ib_ipoib]
> [  135.502910]  ? poll_tx+0x250/0x250 [ib_ipoib]
> [  135.502951]  call_timer_fn+0x35/0x140
> [  135.502994]  run_timer_softirq+0x1d7/0x460
> [  135.503034]  ? kvm_sched_clock_read+0x1e/0x30
> [  135.504133]  ? sched_clock+0x9/0x10
> [  135.505240]  ? sched_clock_cpu+0x72/0xa0
> [  135.506293]  __do_softirq+0xd7/0x2a8
> [  135.507328]  irq_exit+0xb5/0xc0
> [  135.508339]  smp_apic_timer_interrupt+0x3d/0x50
> [  135.509342]  apic_timer_interrupt+0x89/0x90
> [  135.510346] RIP: 0010:native_safe_halt+0x6/0x10
> [  135.511360] RSP: 0018:ffffc90000697ea0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10
> [  135.512405] RAX: 0000000000000000 RBX: ffff88013a429480 RCX: 0000000000000000
> [  135.513457] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> [  135.514496] RBP: ffffc90000697ea0 R08: 00000000d5555b28 R09: 0000000000000000
> [  135.515535] R10: 0000000000000200 R11: ffffc900022b7ef0 R12: 0000000000000003
> [  135.516579] R13: ffff88013a429480 R14: 0000000000000000 R15: 0000000000000000
> [  135.517620]  </IRQ>
> [  135.518650]  default_idle+0x1e/0xd0
> [  135.519672]  arch_cpu_idle+0xf/0x20
> [  135.520673]  default_idle_call+0x35/0x40
> [  135.521678]  do_idle+0x15b/0x200
> [  135.522687]  cpu_startup_entry+0x1d/0x30
> [  135.523706]  start_secondary+0x103/0x130
> [  135.524734]  start_cpu+0x14/0x14
> [  135.525793] Code:  Bad RIP value.
> [  135.526798] RIP:           (null) RSP: ffff88013fcc3e28
> [  135.527767] CR2: 0000000000000000
> [  135.528727] ---[ end trace 8acb66738f095ba7 ]---
> [  135.529639] Kernel panic - not syncing: Fatal exception in interrupt
> [  135.531590] Kernel Offset: disabled
> [  135.532496] ---[ end Kernel panic - not syncing: Fatal exception in interrupt

Can you please add more info on the testing setup ? I can't reproduce
the above in my [pretty much trivial] test-bed

Thank you!

Paolo
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Erez Shitrit March 1, 2017, 7:28 a.m. UTC | #3
On Thu, Feb 16, 2017 at 5:35 PM, Paolo Abeni <pabeni@redhat.com> wrote:
> The skbs transmitted via ipoib_send() are freed only if there are
> 16 or more outstanding work requests or if the send queue is full.
>
> If there is very little networking activity, the transmitted skbs
> can be held by the device driver for an unlimited amount of time,
> starving other subsystems.
>
> E.g. assuming the ipv6 is enabled, with the following sequence:
>
> systemctl start firewalld
> modprobe ib_ipoib
> ip addr add dev ib0 fc00::1/64
> systemctl stop firewalld
>
> a cpu will hang: rmmod conntrack will keep a core busy
> spinning for nf_conntrack_untracked going to 0, since some ICMP6
> ND packets are generated and transmitted when the ipv6 address
> is attached to the device, and such packets get a notrack ct
> entry.
>
> This change address the issue introducing a periodic timer performing
> "garbage collection" on the send ring at low frequency (once every
> second).
>
> This new timer runs independently from the currently used poll_timer,
> so that no additional delay is introduced to clean the ring after
> errors or ring full event.

Hi,

Adding a new timer is not the required solution, it is a w/a over the
TX part in the ipoib driver.
The real solution, IMHO, is to use the napi mechanism for the TX in a
similar way as it done in the RX. (as it done in many network drivers)

We (Mellanox) are planning to send such solution in the next few days.

Thanks,

>
> Reported-by: Thomas Cameron <tcameron@redhat.com>
> Fixes: f56bcd801356 ("IPoIB: Use separate CQ for UD send completions")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib.h    |  2 ++
>  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 35 +++++++++++++++++++++++++++------
>  2 files changed, 31 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> index da12717..3b5039b 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> @@ -402,6 +402,8 @@ struct ipoib_dev_priv {
>         u64     hca_caps;
>         struct ipoib_ethtool_st ethtool;
>         struct timer_list poll_timer;
> +       struct timer_list tx_gc_timer;
> +       unsigned long drain_tx_cq_stamp;
>         unsigned max_send_sge;
>         bool sm_fullmember_sendonly_support;
>  };
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 5038f9d..e565848 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -490,13 +490,20 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
>         napi_schedule(&priv->napi);
>  }
>
> +static void __drain_tx_cq(struct ipoib_dev_priv *priv)
> +{
> +       while (poll_tx(priv))
> +               ; /* nothing */
> +
> +       priv->drain_tx_cq_stamp = jiffies;
> +}
> +
>  static void drain_tx_cq(struct net_device *dev)
>  {
>         struct ipoib_dev_priv *priv = netdev_priv(dev);
>
>         netif_tx_lock(dev);
> -       while (poll_tx(priv))
> -               ; /* nothing */
> +       __drain_tx_cq(priv);
>
>         if (netif_queue_stopped(dev))
>                 mod_timer(&priv->poll_timer, jiffies + 1);
> @@ -637,8 +644,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
>         }
>
>         if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
> -               while (poll_tx(priv))
> -                       ; /* nothing */
> +               __drain_tx_cq(priv);
>  }
>
>  static void __ipoib_reap_ah(struct net_device *dev)
> @@ -697,6 +703,19 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx)
>         drain_tx_cq((struct net_device *)ctx);
>  }
>
> +static void ipoib_tx_gc_timer_func(unsigned long ctx)
> +{
> +       struct net_device *dev = (struct net_device *)ctx;
> +       struct ipoib_dev_priv *priv = netdev_priv(dev);
> +
> +       netif_tx_lock(dev);
> +       if (time_after(jiffies, priv->drain_tx_cq_stamp + HZ))
> +               __drain_tx_cq(priv);
> +       netif_tx_unlock(dev);
> +
> +       mod_timer(&priv->tx_gc_timer, jiffies + HZ);
> +}
> +
>  int ipoib_ib_dev_open(struct net_device *dev)
>  {
>         struct ipoib_dev_priv *priv = netdev_priv(dev);
> @@ -834,8 +853,7 @@ void ipoib_drain_cq(struct net_device *dev)
>                 }
>         } while (n == IPOIB_NUM_WC);
>
> -       while (poll_tx(priv))
> -               ; /* nothing */
> +       __drain_tx_cq(priv);
>
>         local_bh_enable();
>  }
> @@ -906,6 +924,7 @@ int ipoib_ib_dev_stop(struct net_device *dev)
>
>  timeout:
>         del_timer_sync(&priv->poll_timer);
> +       del_timer_sync(&priv->tx_gc_timer);
>         qp_attr.qp_state = IB_QPS_RESET;
>         if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
>                 ipoib_warn(priv, "Failed to modify QP to RESET state\n");
> @@ -932,6 +951,10 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
>
>         setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
>                     (unsigned long) dev);
> +       setup_timer(&priv->tx_gc_timer, ipoib_tx_gc_timer_func,
> +                   (unsigned long) dev);
> +       mod_timer(&priv->tx_gc_timer, jiffies + HZ);
> +       priv->drain_tx_cq_stamp = jiffies;
>
>         if (dev->flags & IFF_UP) {
>                 if (ipoib_ib_dev_open(dev)) {
> --
> 2.9.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Abeni March 1, 2017, 9:07 a.m. UTC | #4
On Wed, 2017-03-01 at 09:28 +0200, Erez Shitrit wrote:
> On Thu, Feb 16, 2017 at 5:35 PM, Paolo Abeni <pabeni@redhat.com> wrote:
> > The skbs transmitted via ipoib_send() are freed only if there are
> > 16 or more outstanding work requests or if the send queue is full.
> > 
> > If there is very little networking activity, the transmitted skbs
> > can be held by the device driver for an unlimited amount of time,
> > starving other subsystems.
> > 
> > E.g. assuming the ipv6 is enabled, with the following sequence:
> > 
> > systemctl start firewalld
> > modprobe ib_ipoib
> > ip addr add dev ib0 fc00::1/64
> > systemctl stop firewalld
> > 
> > a cpu will hang: rmmod conntrack will keep a core busy
> > spinning for nf_conntrack_untracked going to 0, since some ICMP6
> > ND packets are generated and transmitted when the ipv6 address
> > is attached to the device, and such packets get a notrack ct
> > entry.
> > 
> > This change address the issue introducing a periodic timer performing
> > "garbage collection" on the send ring at low frequency (once every
> > second).
> > 
> > This new timer runs independently from the currently used poll_timer,
> > so that no additional delay is introduced to clean the ring after
> > errors or ring full event.
> 
> Hi,
> 
> Adding a new timer is not the required solution, it is a w/a over the
> TX part in the ipoib driver.
> The real solution, IMHO, is to use the napi mechanism for the TX in a
> similar way as it done in the RX. (as it done in many network drivers)
> 
> We (Mellanox) are planning to send such solution in the next few days.

Thank you for jumping-in on this.

I think that the tx napi polling implementation for the ipoib driver is
not so straight-forward because, afaics, the ib completion callback is
intentionally avoided for tx - unless in exceptional scenarios -
possibly for performance reason.

Anyway, if you can fix this in a cleaner way, I'll be more than happy. 

Thank you,

Paolo
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Erez Shitrit March 1, 2017, 9:39 a.m. UTC | #5
On Wed, Mar 1, 2017 at 11:07 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> On Wed, 2017-03-01 at 09:28 +0200, Erez Shitrit wrote:
>> On Thu, Feb 16, 2017 at 5:35 PM, Paolo Abeni <pabeni@redhat.com> wrote:
>> > The skbs transmitted via ipoib_send() are freed only if there are
>> > 16 or more outstanding work requests or if the send queue is full.
>> >
>> > If there is very little networking activity, the transmitted skbs
>> > can be held by the device driver for an unlimited amount of time,
>> > starving other subsystems.
>> >
>> > E.g. assuming the ipv6 is enabled, with the following sequence:
>> >
>> > systemctl start firewalld
>> > modprobe ib_ipoib
>> > ip addr add dev ib0 fc00::1/64
>> > systemctl stop firewalld
>> >
>> > a cpu will hang: rmmod conntrack will keep a core busy
>> > spinning for nf_conntrack_untracked going to 0, since some ICMP6
>> > ND packets are generated and transmitted when the ipv6 address
>> > is attached to the device, and such packets get a notrack ct
>> > entry.
>> >
>> > This change address the issue introducing a periodic timer performing
>> > "garbage collection" on the send ring at low frequency (once every
>> > second).
>> >
>> > This new timer runs independently from the currently used poll_timer,
>> > so that no additional delay is introduced to clean the ring after
>> > errors or ring full event.
>>
>> Hi,
>>
>> Adding a new timer is not the required solution, it is a w/a over the
>> TX part in the ipoib driver.
>> The real solution, IMHO, is to use the napi mechanism for the TX in a
>> similar way as it done in the RX. (as it done in many network drivers)
>>
>> We (Mellanox) are planning to send such solution in the next few days.
>
> Thank you for jumping-in on this.
>
> I think that the tx napi polling implementation for the ipoib driver is
> not so straight-forward because, afaics, the ib completion callback is
> intentionally avoided for tx - unless in exceptional scenarios -
> possibly for performance reason.

Not sure that was the reason, probably other historical reasons.

You can see that in the CM mode the IPoIB driver uses napi for the tx
completion.

(it works for us in the UD mode, and without napi we will not be able
to implement time sensitive features on top of ipoib (time stamping
for example)

>
> Anyway, if you can fix this in a cleaner way, I'll be more than happy.
>
> Thank you,
>
> Paolo
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index da12717..3b5039b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -402,6 +402,8 @@  struct ipoib_dev_priv {
 	u64	hca_caps;
 	struct ipoib_ethtool_st ethtool;
 	struct timer_list poll_timer;
+	struct timer_list tx_gc_timer;
+	unsigned long drain_tx_cq_stamp;
 	unsigned max_send_sge;
 	bool sm_fullmember_sendonly_support;
 };
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 5038f9d..e565848 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -490,13 +490,20 @@  void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
 	napi_schedule(&priv->napi);
 }
 
+static void __drain_tx_cq(struct ipoib_dev_priv *priv)
+{
+	while (poll_tx(priv))
+		; /* nothing */
+
+	priv->drain_tx_cq_stamp = jiffies;
+}
+
 static void drain_tx_cq(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	netif_tx_lock(dev);
-	while (poll_tx(priv))
-		; /* nothing */
+	__drain_tx_cq(priv);
 
 	if (netif_queue_stopped(dev))
 		mod_timer(&priv->poll_timer, jiffies + 1);
@@ -637,8 +644,7 @@  void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 	}
 
 	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
-		while (poll_tx(priv))
-			; /* nothing */
+		__drain_tx_cq(priv);
 }
 
 static void __ipoib_reap_ah(struct net_device *dev)
@@ -697,6 +703,19 @@  static void ipoib_ib_tx_timer_func(unsigned long ctx)
 	drain_tx_cq((struct net_device *)ctx);
 }
 
+static void ipoib_tx_gc_timer_func(unsigned long ctx)
+{
+	struct net_device *dev = (struct net_device *)ctx;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	netif_tx_lock(dev);
+	if (time_after(jiffies, priv->drain_tx_cq_stamp + HZ))
+		__drain_tx_cq(priv);
+	netif_tx_unlock(dev);
+
+	mod_timer(&priv->tx_gc_timer, jiffies + HZ);
+}
+
 int ipoib_ib_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -834,8 +853,7 @@  void ipoib_drain_cq(struct net_device *dev)
 		}
 	} while (n == IPOIB_NUM_WC);
 
-	while (poll_tx(priv))
-		; /* nothing */
+	__drain_tx_cq(priv);
 
 	local_bh_enable();
 }
@@ -906,6 +924,7 @@  int ipoib_ib_dev_stop(struct net_device *dev)
 
 timeout:
 	del_timer_sync(&priv->poll_timer);
+	del_timer_sync(&priv->tx_gc_timer);
 	qp_attr.qp_state = IB_QPS_RESET;
 	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
 		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
@@ -932,6 +951,10 @@  int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 
 	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
 		    (unsigned long) dev);
+	setup_timer(&priv->tx_gc_timer, ipoib_tx_gc_timer_func,
+		    (unsigned long) dev);
+	mod_timer(&priv->tx_gc_timer, jiffies + HZ);
+	priv->drain_tx_cq_stamp = jiffies;
 
 	if (dev->flags & IFF_UP) {
 		if (ipoib_ib_dev_open(dev)) {