diff mbox series

[V2,for-next] RDMA/hns: Add support function clear when removing module

Message ID 1556028391-107055-1-git-send-email-oulijun@huawei.com (mailing list archive)
State Changes Requested
Headers show
Series [V2,for-next] RDMA/hns: Add support function clear when removing module | expand

Commit Message

Lijun Ou April 23, 2019, 2:06 p.m. UTC
If carried out unloading operation when application is running, the
hardware is too late to release some hardware resource that remain
in it. Under these circumstances, it maybe happen error if loaded hns
driver and run app again. In order to resolve it, the hardware adds
a function to stop this function and clear the residual hardware
resources in the function.

Signed-off-by: chenglang <chenglang@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
V1->V2:
1. rewrite the commit.
---
 drivers/infiniband/hw/hns/hns_roce_device.h |   1 +
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 164 ++++++++++++++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  27 ++++-
 3 files changed, 191 insertions(+), 1 deletion(-)

Comments

Leon Romanovsky April 23, 2019, 3:48 p.m. UTC | #1
On Tue, Apr 23, 2019 at 10:06:31PM +0800, Lijun Ou wrote:
> If carried out unloading operation when application is running, the
> hardware is too late to release some hardware resource that remain
> in it. Under these circumstances, it maybe happen error if loaded hns
> driver and run app again. In order to resolve it, the hardware adds
> a function to stop this function and clear the residual hardware
> resources in the function.
>
> Signed-off-by: chenglang <chenglang@huawei.com>
> Signed-off-by: Lijun Ou <oulijun@huawei.com>
> ---
> V1->V2:
> 1. rewrite the commit.
> ---
>  drivers/infiniband/hw/hns/hns_roce_device.h |   1 +
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 164 ++++++++++++++++++++++++++++
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  27 ++++-
>  3 files changed, 191 insertions(+), 1 deletion(-)
>

The same comment, as for v1, you should find the reason where resources
were leaked and fix it, instead of adding reset function with timeouts
and msleep.

Thanks
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 563cf39..2e35469 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -990,6 +990,7 @@  struct hns_roce_dev {
 	void			*priv;
 	struct workqueue_struct *irq_workq;
 	const struct hns_roce_dfx_hw *dfx;
+	u32			func_num;
 };
 
 static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index f155d2d..efaf4ee 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1129,6 +1129,165 @@  static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
 	return 0;
 }
 
+static bool hns_roce_func_clr_chk_rst(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	unsigned long reset_cnt;
+	bool sw_resetting;
+	bool hw_resetting;
+
+	reset_cnt = ops->ae_dev_reset_cnt(handle);
+	hw_resetting = ops->get_hw_reset_stat(handle);
+	sw_resetting = ops->ae_dev_resetting(handle);
+
+	if (reset_cnt != hr_dev->reset_cnt || hw_resetting || sw_resetting)
+		return true;
+
+	return false;
+}
+
+static void hns_roce_func_clr_rst_prc(struct hns_roce_dev *hr_dev, int retval,
+				      int flag)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	unsigned long instance_stage;
+	unsigned long reset_cnt;
+	unsigned long end;
+	bool sw_resetting;
+	bool hw_resetting;
+
+	instance_stage = handle->rinfo.instance_state;
+	reset_cnt = ops->ae_dev_reset_cnt(handle);
+	hw_resetting = ops->get_hw_reset_stat(handle);
+	sw_resetting = ops->ae_dev_resetting(handle);
+
+	if (reset_cnt != hr_dev->reset_cnt) {
+		hr_dev->dis_db = true;
+		hr_dev->is_reset = true;
+		dev_info(hr_dev->dev, "Func clear success after reset.\n");
+	} else if (hw_resetting) {
+		hr_dev->dis_db = true;
+
+		dev_warn(hr_dev->dev,
+			 "Func clear is pending, device in resetting state.\n");
+		end = msecs_to_jiffies(HNS_ROCE_V2_HW_RST_TIMEOUT) + jiffies;
+		while (time_before(jiffies, end)) {
+			if (!ops->get_hw_reset_stat(handle)) {
+				hr_dev->is_reset = true;
+				dev_info(hr_dev->dev,
+					 "Func clear success after reset.\n");
+				return;
+			}
+			msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
+		}
+
+		dev_warn(hr_dev->dev, "Func clear failed.\n");
+	} else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT) {
+		hr_dev->dis_db = true;
+
+		dev_warn(hr_dev->dev,
+			 "Func clear is pending, device in resetting state.\n");
+		end = msecs_to_jiffies(HNS_ROCE_V2_HW_RST_TIMEOUT) + jiffies;
+		while (time_before(jiffies, end)) {
+			if (ops->ae_dev_reset_cnt(handle) !=
+			    hr_dev->reset_cnt) {
+				hr_dev->is_reset = true;
+				dev_info(hr_dev->dev,
+					 "Func clear success after sw reset\n");
+				return;
+			}
+			msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
+		}
+
+		dev_warn(hr_dev->dev, "Func clear failed because of unfinished sw reset\n");
+	} else {
+		if (retval && !flag)
+			dev_warn(hr_dev->dev,
+				 "Func clear read failed, ret = %d.\n", retval);
+
+		dev_warn(hr_dev->dev, "Func clear failed.\n");
+	}
+}
+
+static void hns_roce_query_func_num(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_pf_func_num *resp;
+	struct hns_roce_cmq_desc desc;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_VF_NUM, true);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		dev_err(hr_dev->dev, "Query vf count fail, ret = %d.\n",
+			 ret);
+		return;
+	}
+
+	resp = (struct hns_roce_pf_func_num *)desc.data;
+	hr_dev->func_num = resp->pf_own_func_num;
+}
+
+static void hns_roce_clear_func(struct hns_roce_dev *hr_dev, int vf_id)
+{
+	bool fclr_write_fail_flag = false;
+	struct hns_roce_func_clear *resp;
+	struct hns_roce_cmq_desc desc;
+	unsigned long end;
+	int ret = 0;
+
+	if (hns_roce_func_clr_chk_rst(hr_dev))
+		goto out;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, false);
+	resp = (struct hns_roce_func_clear *)desc.data;
+	resp->rst_funcid_en = vf_id;
+
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		fclr_write_fail_flag = true;
+		dev_err(hr_dev->dev, "Func clear write failed, ret = %d.\n",
+			 ret);
+		goto out;
+	}
+
+	end = msecs_to_jiffies(HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS) + jiffies;
+
+	msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
+	while (time_before(jiffies, end)) {
+		if (hns_roce_func_clr_chk_rst(hr_dev))
+			goto out;
+
+		hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR,
+					      true);
+		resp->rst_funcid_en = vf_id;
+
+		ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+		if (ret) {
+			msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
+			continue;
+		}
+
+		if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) {
+			if (vf_id == 0)
+				hr_dev->is_reset = true;
+			return;
+		}
+	}
+
+out:
+	dev_err(hr_dev->dev, "Func clear read vf_id %d fail.\n", vf_id);
+	hns_roce_func_clr_rst_prc(hr_dev, ret, fclr_write_fail_flag);
+}
+
+static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
+{
+	hns_roce_clear_func(hr_dev, 0);
+}
+
 static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_query_fw_info *resp;
@@ -1479,6 +1638,8 @@  static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 		return ret;
 	}
 
+	hns_roce_query_func_num(hr_dev);
+
 	if (hr_dev->pci_dev->revision == 0x21) {
 		ret = hns_roce_query_pf_timer_resource(hr_dev);
 		if (ret) {
@@ -1890,6 +2051,9 @@  static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
+	if (hr_dev->pci_dev->revision == 0x21)
+		hns_roce_function_clear(hr_dev);
+
 	hns_roce_free_link_table(hr_dev, &priv->tpq);
 	hns_roce_free_link_table(hr_dev, &priv->tsq);
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index edfdbe2..3f2c85f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -96,7 +96,10 @@ 
 #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE	2
 #define HNS_ROCE_V2_RSV_QPS			8
 
-#define HNS_ROCE_V2_HW_RST_TIMEOUT             1000
+/* Time out for hardware to complete reset */
+#define HNS_ROCE_V2_HW_RST_TIMEOUT		1000
+
+#define HNS_ROCE_V2_HW_RST_COMPLETION_WAIT	20
 
 #define HNS_ROCE_CONTEXT_HOP_NUM		1
 #define HNS_ROCE_SCCC_HOP_NUM			1
@@ -236,11 +239,13 @@  enum hns_roce_opcode_type {
 	HNS_ROCE_OPC_CFG_EXT_LLM			= 0x8403,
 	HNS_ROCE_OPC_CFG_TMOUT_LLM			= 0x8404,
 	HNS_ROCE_OPC_QUERY_PF_TIMER_RES			= 0x8406,
+	HNS_ROCE_OPC_QUERY_VF_NUM			= 0x8407,
 	HNS_ROCE_OPC_CFG_SGID_TB			= 0x8500,
 	HNS_ROCE_OPC_CFG_SMAC_TB			= 0x8501,
 	HNS_ROCE_OPC_POST_MB				= 0x8504,
 	HNS_ROCE_OPC_QUERY_MB_ST			= 0x8505,
 	HNS_ROCE_OPC_CFG_BT_ATTR			= 0x8506,
+	HNS_ROCE_OPC_FUNC_CLEAR				= 0x8508,
 	HNS_ROCE_OPC_CLR_SCCC				= 0x8509,
 	HNS_ROCE_OPC_QUERY_SCCC				= 0x850a,
 	HNS_ROCE_OPC_RESET_SCCC				= 0x850b,
@@ -1226,6 +1231,26 @@  struct hns_roce_query_fw_info {
 	__le32 rsv[5];
 };
 
+struct hns_roce_func_clear {
+	__le32 rst_funcid_en;
+	__le32 func_done;
+	__le32 rsv[4];
+};
+
+struct hns_roce_pf_func_num {
+	__le32 pf_own_func_num;
+	__le32 func_done;
+	__le32 rsv[4];
+};
+
+#define FUNC_CLEAR_RST_FUN_EN_S 8
+
+#define FUNC_CLEAR_RST_FUN_DONE_S 0
+
+#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS	(512 * 100)
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL	40
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT	20
+
 struct hns_roce_cfg_llm_a {
 	__le32 base_addr_l;
 	__le32 base_addr_h;