diff mbox series

[V3,for-next] RDMA/hns: reset function when removing module

Message ID 1560524163-94676-1-git-send-email-oulijun@huawei.com (mailing list archive)
State Mainlined
Commit 89a6da3cb8f30ee0aeca924d84bef688f22f883e
Delegated to: Doug Ledford
Headers show
Series [V3,for-next] RDMA/hns: reset function when removing module | expand

Commit Message

Lijun Ou June 14, 2019, 2:56 p.m. UTC
From: Lang Cheng <chenglang@huawei.com>

During removing the driver, we needs to notify the roce engine to
stop working immediately,and symmetrically recycle the hardware
resources requested during initialization.

The hardware provides a command called function clear that can package
these operations,so that the driver can only focus on releasing
resources that applied from the operating system.
This patch implements the call of this command.

Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
V2->V3:
1. Remove other reset state operations that are not related to
   function clear
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 44 ++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 17 ++++++++++++
 2 files changed, 61 insertions(+)

Comments

Doug Ledford June 20, 2019, 7:09 p.m. UTC | #1
On Fri, 2019-06-14 at 22:56 +0800, Lijun Ou wrote:
> From: Lang Cheng <chenglang@huawei.com>
> 
> During removing the driver, we needs to notify the roce engine to
> stop working immediately,and symmetrically recycle the hardware
> resources requested during initialization.
> 
> The hardware provides a command called function clear that can package
> these operations,so that the driver can only focus on releasing
> resources that applied from the operating system.
> This patch implements the call of this command.
> 
> Signed-off-by: Lang Cheng <chenglang@huawei.com>
> Signed-off-by: Lijun Ou <oulijun@huawei.com>
> ---
> V2->V3:
> 1. Remove other reset state operations that are not related to
>    function clear



> +	msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
> +	end = HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS;
> +	while (end) {
> +		msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
> +		end -= HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT;



> +#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS	(249 * 2 * 100)
> +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL	40
> +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT	20

I absolutely despise code that does a possible *50* second blocking
delay using msleep.  However, because I suspect that this is something
that should *rarely* ever happen, and instead the common case is that
the reset will proceed much faster, and because this is in the
teardown/shutdown path of the device where it is a little more
acceptable to have a blocking delay, I've taken this to for-next.
Jason Gunthorpe June 20, 2019, 7:34 p.m. UTC | #2
On Thu, Jun 20, 2019 at 03:09:37PM -0400, Doug Ledford wrote:
> On Fri, 2019-06-14 at 22:56 +0800, Lijun Ou wrote:
> > From: Lang Cheng <chenglang@huawei.com>
> > 
> > During removing the driver, we needs to notify the roce engine to
> > stop working immediately,and symmetrically recycle the hardware
> > resources requested during initialization.
> > 
> > The hardware provides a command called function clear that can package
> > these operations,so that the driver can only focus on releasing
> > resources that applied from the operating system.
> > This patch implements the call of this command.
> > 
> > Signed-off-by: Lang Cheng <chenglang@huawei.com>
> > Signed-off-by: Lijun Ou <oulijun@huawei.com>
> > V2->V3:
> > 1. Remove other reset state operations that are not related to
> >    function clear
> 
> 
> 
> > +	msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
> > +	end = HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS;
> > +	while (end) {
> > +		msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
> > +		end -= HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT;
> 
> 
> 
> > +#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS	(249 * 2 * 100)
> > +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL	40
> > +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT	20
> 
> I absolutely despise code that does a possible *50* second blocking
> delay using msleep.  However, because I suspect that this is something
> that should *rarely* ever happen, and instead the common case is that
> the reset will proceed much faster, and because this is in the
> teardown/shutdown path of the device where it is a little more
> acceptable to have a blocking delay, I've taken this to for-next.

I've been telling hns they have to do proper locking and wouldn't
accept these stupid msleep loops. No kernel code should ever do that,
it just shows the locking and concurrency model is wrong.

Jason
Doug Ledford June 20, 2019, 7:48 p.m. UTC | #3
On Thu, 2019-06-20 at 16:34 -0300, Jason Gunthorpe wrote:
> On Thu, Jun 20, 2019 at 03:09:37PM -0400, Doug Ledford wrote:
> > On Fri, 2019-06-14 at 22:56 +0800, Lijun Ou wrote:
> > > From: Lang Cheng <chenglang@huawei.com>
> > > 
> > > During removing the driver, we needs to notify the roce engine to
> > > stop working immediately,and symmetrically recycle the hardware
> > > resources requested during initialization.
> > > 
> > > The hardware provides a command called function clear that can
> > > package
> > > these operations,so that the driver can only focus on releasing
> > > resources that applied from the operating system.
> > > This patch implements the call of this command.
> > > 
> > > Signed-off-by: Lang Cheng <chenglang@huawei.com>
> > > Signed-off-by: Lijun Ou <oulijun@huawei.com>
> > > V2->V3:
> > > 1. Remove other reset state operations that are not related to
> > >    function clear
> > 
> > 
> > > +	msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
> > > +	end = HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS;
> > > +	while (end) {
> > > +		msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
> > > +		end -= HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT;
> > 
> > 
> > > +#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS	(249 * 2 * 100)
> > > +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL	40
> > > +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT	20
> > 
> > I absolutely despise code that does a possible *50* second blocking
> > delay using msleep.  However, because I suspect that this is
> > something
> > that should *rarely* ever happen, and instead the common case is
> > that
> > the reset will proceed much faster, and because this is in the
> > teardown/shutdown path of the device where it is a little more
> > acceptable to have a blocking delay, I've taken this to for-next.
> 
> I've been telling hns they have to do proper locking and wouldn't
> accept these stupid msleep loops. No kernel code should ever do that,
> it just shows the locking and concurrency model is wrong.

It's an msleep() waiting for a hardware command to complete.  Waiting
synchronously for a command that has the purpose of stopping the card's
operation does not sound like an incorrect locking or concurrency model
to me.  It sounds sane, albeit annoying.
Jason Gunthorpe June 20, 2019, 8:05 p.m. UTC | #4
On Thu, Jun 20, 2019 at 03:48:23PM -0400, Doug Ledford wrote:

> It's an msleep() waiting for a hardware command to complete.  Waiting
> synchronously for a command that has the purpose of stopping the card's
> operation does not sound like an incorrect locking or concurrency model
> to me.  It sounds sane, albeit annoying.

If it was the only sleep loop you might have a point, but it isn't,
every other patch series lately seems to be adding more sleep
loops. This sleep loop is already wrapping another sleep loop under
__hns_roce_cmq_send() - which, for some reason, doesn't have an
interrupt driven completion path.

Nor is there any explanation why we need a sleep loop on top of a
sleep loop, or why the command is allowed to fail or why retrying the
failed command is even a good idea, or why it can't be properly
interrupt driven!

I'm frankly sick of it, maybe you should review HNS patches for a
while..

Jason
Doug Ledford June 20, 2019, 8:30 p.m. UTC | #5
On Thu, 2019-06-20 at 17:05 -0300, Jason Gunthorpe wrote:
> On Thu, Jun 20, 2019 at 03:48:23PM -0400, Doug Ledford wrote:
> 
> > It's an msleep() waiting for a hardware command to
> > complete.  Waiting
> > synchronously for a command that has the purpose of stopping the
> > card's
> > operation does not sound like an incorrect locking or concurrency
> > model
> > to me.  It sounds sane, albeit annoying.
> 
> If it was the only sleep loop you might have a point, but it isn't,
> every other patch series lately seems to be adding more sleep
> loops. This sleep loop is already wrapping another sleep loop under
> __hns_roce_cmq_send() - which, for some reason, doesn't have an
> interrupt driven completion path.
> 
> Nor is there any explanation why we need a sleep loop on top of a
> sleep loop, or why the command is allowed to fail or why retrying the
> failed command is even a good idea, or why it can't be properly
> interrupt driven!
> 
> I'm frankly sick of it, maybe you should review HNS patches for a
> while..

Are you sure this hasn't changed over time and you didn't realize it? 
I'm not seeing all the sleeps you are talking about. In fact, if I grep
for "sleep" in hw/hns/ I only find 9 instances: 5 in hns_roce_hw_v1 and
4 in hns_roce_hw_v2, so really only 5 at most as those two files are
just duplicates of each other for the different hardware.  And even
then, when I checked on all the sleeps in hw_v2, they were all in init
or reset code paths, certainly none in send.  And I didn't find any
include file wrappers that use sleeps.  I get how over use of sleeps can
be a big issue, I guess I'm just having a hard time finding where it's
being abused as badly as you say.  Of course, I may just not be looking
in the right place...
Jason Gunthorpe June 21, 2019, 12:57 p.m. UTC | #6
On Thu, Jun 20, 2019 at 04:30:03PM -0400, Doug Ledford wrote:
> On Thu, 2019-06-20 at 17:05 -0300, Jason Gunthorpe wrote:
> > On Thu, Jun 20, 2019 at 03:48:23PM -0400, Doug Ledford wrote:
> > 
> > > It's an msleep() waiting for a hardware command to
> > > complete.  Waiting
> > > synchronously for a command that has the purpose of stopping the
> > > card's
> > > operation does not sound like an incorrect locking or concurrency
> > > model
> > > to me.  It sounds sane, albeit annoying.
> > 
> > If it was the only sleep loop you might have a point, but it isn't,
> > every other patch series lately seems to be adding more sleep
> > loops. This sleep loop is already wrapping another sleep loop under
> > __hns_roce_cmq_send() - which, for some reason, doesn't have an
> > interrupt driven completion path.
> > 
> > Nor is there any explanation why we need a sleep loop on top of a
> > sleep loop, or why the command is allowed to fail or why retrying the
> > failed command is even a good idea, or why it can't be properly
> > interrupt driven!
> > 
> > I'm frankly sick of it, maybe you should review HNS patches for a
> > while..
> 
> Are you sure this hasn't changed over time and you didn't realize it? 
> I'm not seeing all the sleeps you are talking about. 

It is because I wasn't applying those patches :)

> In fact, if I grep for "sleep" in hw/hns/ I only find 9 instances: 5
> in hns_roce_hw_v1 and 4 in hns_roce_hw_v2, so really only 5 at most

Most of our drivers have 0 sleep loops (excluding the hfi/qib
uglyness), so 9 is aleady absurd for a 2010 era driver.

Jason
Doug Ledford June 21, 2019, 2:46 p.m. UTC | #7
On Fri, 2019-06-21 at 09:57 -0300, Jason Gunthorpe wrote:
> On Thu, Jun 20, 2019 at 04:30:03PM -0400, Doug Ledford wrote:
> > On Thu, 2019-06-20 at 17:05 -0300, Jason Gunthorpe wrote:
> > > On Thu, Jun 20, 2019 at 03:48:23PM -0400, Doug Ledford wrote:
> > > 
> > > > It's an msleep() waiting for a hardware command to
> > > > complete.  Waiting
> > > > synchronously for a command that has the purpose of stopping the
> > > > card's
> > > > operation does not sound like an incorrect locking or
> > > > concurrency
> > > > model
> > > > to me.  It sounds sane, albeit annoying.
> > > 
> > > If it was the only sleep loop you might have a point, but it
> > > isn't,
> > > every other patch series lately seems to be adding more sleep
> > > loops. This sleep loop is already wrapping another sleep loop
> > > under
> > > __hns_roce_cmq_send() - which, for some reason, doesn't have an
> > > interrupt driven completion path.
> > > 
> > > Nor is there any explanation why we need a sleep loop on top of a
> > > sleep loop, or why the command is allowed to fail or why retrying
> > > the
> > > failed command is even a good idea, or why it can't be properly
> > > interrupt driven!
> > > 
> > > I'm frankly sick of it, maybe you should review HNS patches for a
> > > while..
> > 
> > Are you sure this hasn't changed over time and you didn't realize
> > it? 
> > I'm not seeing all the sleeps you are talking about. 
> 
> It is because I wasn't applying those patches :)

Ok, well, I promise if they send a patch that adds an msleep in the
middle of the send routine, I'll reject that.  But this one is on the
module remove path (or so the subject says).  I'm not really concerned
about it here.

> > In fact, if I grep for "sleep" in hw/hns/ I only find 9 instances: 5
> > in hns_roce_hw_v1 and 4 in hns_roce_hw_v2, so really only 5 at most
> 
> Most of our drivers have 0 sleep loops (excluding the hfi/qib
> uglyness), so 9 is aleady absurd for a 2010 era driver.

Yeah, but they get away from that with a workqueue/waitq type setup. 
That's fine for a running module, but is a bit messy when doing a module
unload where you have to protect against deleting the workqueue/waitq
until after you are done using it, so again, I'm fine with a sleep here
instead.
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index e7024b3..5c8551b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1130,6 +1130,47 @@  static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
 	return 0;
 }
 
+static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
+{
+	bool fclr_write_fail_flag = false;
+	struct hns_roce_func_clear *resp;
+	struct hns_roce_cmq_desc desc;
+	unsigned long end;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, false);
+	resp = (struct hns_roce_func_clear *)desc.data;
+
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		fclr_write_fail_flag = true;
+		dev_err(hr_dev->dev, "Func clear write failed, ret = %d.\n",
+			 ret);
+		return;
+	}
+
+	msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
+	end = HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS;
+	while (end) {
+		msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
+		end -= HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT;
+
+		hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR,
+					      true);
+
+		ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+		if (ret)
+			continue;
+
+		if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) {
+			hr_dev->is_reset = true;
+			return;
+		}
+	}
+
+	dev_err(hr_dev->dev, "Func clear fail.\n");
+}
+
 static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_query_fw_info *resp;
@@ -1894,6 +1935,9 @@  static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
+	if (hr_dev->pci_dev->revision == 0x21)
+		hns_roce_function_clear(hr_dev);
+
 	hns_roce_free_link_table(hr_dev, &priv->tpq);
 	hns_roce_free_link_table(hr_dev, &priv->tsq);
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index bce21fd..478f5a5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -241,6 +241,7 @@  enum hns_roce_opcode_type {
 	HNS_ROCE_OPC_POST_MB				= 0x8504,
 	HNS_ROCE_OPC_QUERY_MB_ST			= 0x8505,
 	HNS_ROCE_OPC_CFG_BT_ATTR			= 0x8506,
+	HNS_ROCE_OPC_FUNC_CLEAR				= 0x8508,
 	HNS_ROCE_OPC_CLR_SCCC				= 0x8509,
 	HNS_ROCE_OPC_QUERY_SCCC				= 0x850a,
 	HNS_ROCE_OPC_RESET_SCCC				= 0x850b,
@@ -1230,6 +1231,22 @@  struct hns_roce_query_fw_info {
 	__le32 rsv[5];
 };
 
+struct hns_roce_func_clear {
+	__le32 rst_funcid_en;
+	__le32 func_done;
+	__le32 rsv[4];
+};
+
+#define FUNC_CLEAR_RST_FUN_DONE_S 0
+/* Each physical function manages up to 248 virtual functions;
+ * it takes up to 100ms for each function to execute clear;
+ * if an abnormal reset occurs, it is executed twice at most;
+ * so it takes up to 249 * 2 * 100ms.
+ */
+#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS	(249 * 2 * 100)
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL	40
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT	20
+
 struct hns_roce_cfg_llm_a {
 	__le32 base_addr_l;
 	__le32 base_addr_h;