Message ID | 20231120203501.321587-3-jinpu.wang@ionos.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | bugfix for ipoib | expand |
Hi Jack, kernel test robot noticed the following build errors: [auto build test ERROR on rdma/for-next] [also build test ERROR on linus/master v6.7-rc2 next-20231120] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Jack-Wang/ipoib-Fix-error-code-return-in-ipoib_mcast_join/20231121-044240 base: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git for-next patch link: https://lore.kernel.org/r/20231120203501.321587-3-jinpu.wang%40ionos.com patch subject: [PATCH 2/2] ipoib: Add tx timeout work to recover queue stop situation config: x86_64-buildonly-randconfig-001-20231121 (https://download.01.org/0day-ci/archive/20231121/202311211231.oyOBtdMM-lkp@intel.com/config) compiler: gcc-9 (Debian 9.3.0-22) 9.3.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231121/202311211231.oyOBtdMM-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202311211231.oyOBtdMM-lkp@intel.com/ All errors (new ones prefixed by >>): drivers/infiniband/ulp/ipoib/ipoib_ib.c: In function 'ipoib_napi_schedule_work': >> drivers/infiniband/ulp/ipoib/ipoib_ib.c:542:9: error: implicit declaration of function 'napi_reschedule'; did you mean 'napi_schedule'? [-Werror=implicit-function-declaration] 542 | ret = napi_reschedule(&priv->send_napi); | ^~~~~~~~~~~~~~~ | napi_schedule cc1: some warnings being treated as errors vim +542 drivers/infiniband/ulp/ipoib/ipoib_ib.c 533 534 /* The function will force napi_schedule */ 535 void ipoib_napi_schedule_work(struct work_struct *work) 536 { 537 struct ipoib_dev_priv *priv = 538 container_of(work, struct ipoib_dev_priv, reschedule_napi_work); 539 bool ret; 540 541 do { > 542 ret = napi_reschedule(&priv->send_napi); 543 if (!ret) 544 msleep(3); 545 } while (!ret && netif_queue_stopped(priv->dev) && 546 test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)); 547 } 548
Hi Jack, kernel test robot noticed the following build errors: [auto build test ERROR on rdma/for-next] [also build test ERROR on linus/master v6.7-rc2 next-20231121] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Jack-Wang/ipoib-Fix-error-code-return-in-ipoib_mcast_join/20231121-044240 base: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git for-next patch link: https://lore.kernel.org/r/20231120203501.321587-3-jinpu.wang%40ionos.com patch subject: [PATCH 2/2] ipoib: Add tx timeout work to recover queue stop situation config: i386-allmodconfig (https://download.01.org/0day-ci/archive/20231121/202311211607.9eJYoznW-lkp@intel.com/config) compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231121/202311211607.9eJYoznW-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202311211607.9eJYoznW-lkp@intel.com/ All errors (new ones prefixed by >>): >> drivers/infiniband/ulp/ipoib/ipoib_ib.c:542:9: error: call to undeclared function 'napi_reschedule'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] ret = napi_reschedule(&priv->send_napi); ^ drivers/infiniband/ulp/ipoib/ipoib_ib.c:542:9: note: did you mean 'napi_schedule'? include/linux/netdevice.h:520:20: note: 'napi_schedule' declared here static inline bool napi_schedule(struct napi_struct *n) ^ drivers/infiniband/ulp/ipoib/ipoib_ib.c:554:8: error: call to undeclared function 'napi_reschedule'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] ret = napi_reschedule(&priv->send_napi); ^ 2 errors generated. vim +/napi_reschedule +542 drivers/infiniband/ulp/ipoib/ipoib_ib.c 533 534 /* The function will force napi_schedule */ 535 void ipoib_napi_schedule_work(struct work_struct *work) 536 { 537 struct ipoib_dev_priv *priv = 538 container_of(work, struct ipoib_dev_priv, reschedule_napi_work); 539 bool ret; 540 541 do { > 542 ret = napi_reschedule(&priv->send_napi); 543 if (!ret) 544 msleep(3); 545 } while (!ret && netif_queue_stopped(priv->dev) && 546 test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)); 547 } 548
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 35e9c8a330e2..963e936da5e3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -351,10 +351,12 @@ struct ipoib_dev_priv { struct workqueue_struct *wq; struct delayed_work mcast_task; struct work_struct carrier_on_task; + struct work_struct reschedule_napi_work; struct work_struct flush_light; struct work_struct flush_normal; struct work_struct flush_heavy; struct work_struct restart_task; + struct work_struct tx_timeout_work; struct delayed_work ah_reap_task; struct delayed_work neigh_reap_task; struct ib_device *ca; @@ -499,6 +501,7 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ib_ah *address, u32 dqpn); void ipoib_reap_ah(struct work_struct *work); +void ipoib_napi_schedule_work(struct work_struct *work); struct ipoib_path *__path_find(struct net_device *dev, void *gid); void ipoib_mark_paths_invalid(struct net_device *dev); void ipoib_flush_paths(struct net_device *dev); @@ -510,6 +513,7 @@ void ipoib_ib_tx_timer_func(struct timer_list *t); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); +void ipoib_ib_tx_timeout_work(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 7f84d9866cef..29ba2e443738 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -531,11 +531,35 @@ void ipoib_ib_rx_completion(struct ib_cq *cq, void *ctx_ptr) napi_schedule(&priv->recv_napi); } +/* The function will force napi_schedule */ +void ipoib_napi_schedule_work(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, reschedule_napi_work); + bool ret; + + do { + ret = napi_reschedule(&priv->send_napi); + if (!ret) + msleep(3); + } while (!ret && netif_queue_stopped(priv->dev) && + test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)); +} + void ipoib_ib_tx_completion(struct ib_cq *cq, void *ctx_ptr) { struct ipoib_dev_priv *priv = ctx_ptr; + bool ret; - napi_schedule(&priv->send_napi); + ret = napi_reschedule(&priv->send_napi); + /* + * if the queue is closed the driver must be able to schedule napi, + * otherwise we can end with closed queue forever, because no new + * packets to send and napi callback might not get new event after + * its re-arm of the napi. + */ + if (!ret && netif_queue_stopped(priv->dev)) + schedule_work(&priv->reschedule_napi_work); } static inline int post_send(struct ipoib_dev_priv *priv, diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 967004ccad98..7a5be705d718 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1200,7 +1200,34 @@ static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) netif_queue_stopped(dev), priv->tx_head, priv->tx_tail, priv->global_tx_head, priv->global_tx_tail); - /* XXX reset QP, etc. */ + + schedule_work(&priv->tx_timeout_work); +} + +void ipoib_ib_tx_timeout_work(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, + struct ipoib_dev_priv, + tx_timeout_work); + int err; + + rtnl_lock(); + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + goto unlock; + + ipoib_stop(priv->dev); + err = ipoib_open(priv->dev); + if (err) { + ipoib_warn(priv, "ipoib_open failed recovering from a tx_timeout, err(%d).\n", + err); + goto unlock; + } + + netif_tx_wake_all_queues(priv->dev); +unlock: + rtnl_unlock(); + } static int ipoib_hard_header(struct sk_buff *skb, @@ -2112,7 +2139,7 @@ void ipoib_setup_common(struct net_device *dev) ipoib_set_ethtool_ops(dev); - dev->watchdog_timeo = HZ; + dev->watchdog_timeo = 10 * HZ; dev->flags |= IFF_BROADCAST | IFF_MULTICAST; @@ -2150,10 +2177,12 @@ static void ipoib_build_priv(struct net_device *dev) INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); + INIT_WORK(&priv->reschedule_napi_work, ipoib_napi_schedule_work); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); + INIT_WORK(&priv->tx_timeout_work, ipoib_ib_tx_timeout_work); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); }
As we sometime run into tx timeout from ipoib, queue seems stopped and can't recover. Diff with mellanox OFED show mellanox driver has timeout work to recover in such case. Add tx timeout work/napi work to recover such case. Also increase the watchdog_timeo to 10 seconds, so more tolerant to error. Signed-off-by: Jack Wang <jinpu.wang@ionos.com> --- drivers/infiniband/ulp/ipoib/ipoib.h | 4 +++ drivers/infiniband/ulp/ipoib/ipoib_ib.c | 26 +++++++++++++++++- drivers/infiniband/ulp/ipoib/ipoib_main.c | 33 +++++++++++++++++++++-- 3 files changed, 60 insertions(+), 3 deletions(-)