[RFC,v1] net: ethernet: nb8800: Reset HW block in ndo_open

Message ID	823b1540-b528-bbd7-7f99-5dc39a08868a@sigmadesigns.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-arm-kernel-bounces+patchwork-linux-arm=patchwork.kernel.org@lists.infradead.org> To: Florian Fainelli <f.fainelli@gmail.com>, Mans Rullgard <mans@mansr.com>, "David S. Miller" <davem@davemloft.net> From: Marc Gonzalez <marc_gonzalez@sigmadesigns.com> Subject: [RFC PATCH v1] net: ethernet: nb8800: Reset HW block in ndo_open Message-ID: <823b1540-b528-bbd7-7f99-5dc39a08868a@sigmadesigns.com> Date: Fri, 28 Jul 2017 18:13:15 +0200 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0 SeaMonkey/2.49.1 MIME-Version: 1.0 Precedence: list Cc: netdev <netdev@vger.kernel.org>, Linux ARM <linux-arm-kernel@lists.infradead.org>, Mason <slash.tmp@free.fr> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Sender: "linux-arm-kernel" <linux-arm-kernel-bounces@lists.infradead.org> Errors-To: linux-arm-kernel-bounces+patchwork-linux-arm=patchwork.kernel.org@lists.infradead.org

Marc Gonzalez July 28, 2017, 4:13 p.m. UTC

ndo_stop breaks RX in a way that ndo_open is unable to undo.
Work around the issue by resetting the HW in ndo_open.
This will provide the basis for suspend/resume support.

Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
---
 drivers/net/ethernet/aurora/nb8800.c | 40 +++++++++++++++++-------------------
 drivers/net/ethernet/aurora/nb8800.h |  1 +
 2 files changed, 20 insertions(+), 21 deletions(-)

Måns Rullgård July 28, 2017, 4:17 p.m. UTC | #1

Marc Gonzalez <marc_gonzalez@sigmadesigns.com> writes:

> ndo_stop breaks RX in a way that ndo_open is unable to undo.

Please elaborate.  Why can't it be fixed in a less heavy-handed way?

> Work around the issue by resetting the HW in ndo_open.
> This will provide the basis for suspend/resume support.
>
> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
> ---
>  drivers/net/ethernet/aurora/nb8800.c | 40 +++++++++++++++++-------------------
>  drivers/net/ethernet/aurora/nb8800.h |  1 +
>  2 files changed, 20 insertions(+), 21 deletions(-)

I'm pretty sure this doesn't preserve everything it should.

> diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c
> index e94159507847..954a28542c3b 100644
> --- a/drivers/net/ethernet/aurora/nb8800.c
> +++ b/drivers/net/ethernet/aurora/nb8800.c
> @@ -39,6 +39,7 @@
>
>  #include "nb8800.h"
>
> +static void nb8800_init(struct net_device *dev);
>  static void nb8800_tx_done(struct net_device *dev);
>  static int nb8800_dma_stop(struct net_device *dev);
>
> @@ -957,6 +958,8 @@ static int nb8800_open(struct net_device *dev)
>  	struct phy_device *phydev;
>  	int err;
>
> +	nb8800_init(dev);
> +
>  	/* clear any pending interrupts */
>  	nb8800_writel(priv, NB8800_RXC_SR, 0xf);
>  	nb8800_writel(priv, NB8800_TXC_SR, 0xf);
> @@ -1350,6 +1353,20 @@ static const struct of_device_id nb8800_dt_ids[] = {
>  };
>  MODULE_DEVICE_TABLE(of, nb8800_dt_ids);
>
> +static void nb8800_init(struct net_device *dev)
> +{
> +	struct nb8800_priv *priv = netdev_priv(dev);
> +	const struct nb8800_ops *ops = priv->ops;
> +
> +	if (ops && ops->reset)
> +		ops->reset(dev);
> +	nb8800_hw_init(dev);
> +	if (ops && ops->init)
> +		ops->init(dev);
> +	nb8800_update_mac_addr(dev);
> +	priv->speed = 0;
> +}
> +
>  static int nb8800_probe(struct platform_device *pdev)
>  {
>  	const struct of_device_id *match;
> @@ -1389,6 +1406,7 @@ static int nb8800_probe(struct platform_device *pdev)
>
>  	priv = netdev_priv(dev);
>  	priv->base = base;
> +	priv->ops = ops;
>
>  	priv->phy_mode = of_get_phy_mode(pdev->dev.of_node);
>  	if (priv->phy_mode < 0)
> @@ -1407,12 +1425,6 @@ static int nb8800_probe(struct platform_device *pdev)
>
>  	spin_lock_init(&priv->tx_lock);
>
> -	if (ops && ops->reset) {
> -		ret = ops->reset(dev);
> -		if (ret)
> -			goto err_disable_clk;
> -	}
> -
>  	bus = devm_mdiobus_alloc(&pdev->dev);
>  	if (!bus) {
>  		ret = -ENOMEM;
> @@ -1454,16 +1466,6 @@ static int nb8800_probe(struct platform_device *pdev)
>
>  	priv->mii_bus = bus;
>
> -	ret = nb8800_hw_init(dev);
> -	if (ret)
> -		goto err_deregister_fixed_link;
> -
> -	if (ops && ops->init) {
> -		ret = ops->init(dev);
> -		if (ret)
> -			goto err_deregister_fixed_link;
> -	}
> -
>  	dev->netdev_ops = &nb8800_netdev_ops;
>  	dev->ethtool_ops = &nb8800_ethtool_ops;
>  	dev->flags |= IFF_MULTICAST;
> @@ -1476,14 +1478,12 @@ static int nb8800_probe(struct platform_device *pdev)
>  	if (!is_valid_ether_addr(dev->dev_addr))
>  		eth_hw_addr_random(dev);
>
> -	nb8800_update_mac_addr(dev);
> -
>  	netif_carrier_off(dev);
>
>  	ret = register_netdev(dev);
>  	if (ret) {
>  		netdev_err(dev, "failed to register netdev\n");
> -		goto err_free_dma;
> +		goto err_deregister_fixed_link;
>  	}
>
>  	netif_napi_add(dev, &priv->napi, nb8800_poll, NAPI_POLL_WEIGHT);
> @@ -1492,8 +1492,6 @@ static int nb8800_probe(struct platform_device *pdev)
>
>  	return 0;
>
> -err_free_dma:
> -	nb8800_dma_free(dev);
>  err_deregister_fixed_link:
>  	if (of_phy_is_fixed_link(pdev->dev.of_node))
>  		of_phy_deregister_fixed_link(pdev->dev.of_node);
> diff --git a/drivers/net/ethernet/aurora/nb8800.h b/drivers/net/ethernet/aurora/nb8800.h
> index 6ec4a956e1e5..d5f4481a2c7b 100644
> --- a/drivers/net/ethernet/aurora/nb8800.h
> +++ b/drivers/net/ethernet/aurora/nb8800.h
> @@ -305,6 +305,7 @@ struct nb8800_priv {
>  	dma_addr_t			tx_desc_dma;
>
>  	struct clk			*clk;
> +	const struct nb8800_ops		*ops;
>  };
>
>  struct nb8800_ops {
> -- 
> 2.11.0
>

Marc Gonzalez July 28, 2017, 4:43 p.m. UTC | #2

On 28/07/2017 18:17, Måns Rullgård wrote:

> Marc Gonzalez wrote:
> 
>> ndo_stop breaks RX in a way that ndo_open is unable to undo.
> 
> Please elaborate.  Why can't it be fixed in a less heavy-handed way?

I'm not sure what "elaborate" means. After we've been through
ndo_stop once, the board can send packets, but it doesn't see
any replies from remote systems. RX is wedged.

I think ndo_stop is rare enough an event that doing a full
reset is not an issue, in terms of performance. Also I will
need this infrastructure anyway for suspend/resume support.

It might also make sense to put the HW in reset at close
(to save power). I will try measuring the power savings,
if any.

>> Work around the issue by resetting the HW in ndo_open.
>> This will provide the basis for suspend/resume support.
>>
>> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
>> ---
>>  drivers/net/ethernet/aurora/nb8800.c | 40 +++++++++++++++++-------------------
>>  drivers/net/ethernet/aurora/nb8800.h |  1 +
>>  2 files changed, 20 insertions(+), 21 deletions(-)
> 
> I'm pretty sure this doesn't preserve everything it should.

Hmmm, we're supposed to start fresh ("full reset").
What could there be to preserve?
You mentioned flow control and multicast elsewhere.
I will take a closer look. Thanks for the heads up.

Regards.

Måns Rullgård July 28, 2017, 6:56 p.m. UTC | #3

Marc Gonzalez <marc_gonzalez@sigmadesigns.com> writes:

> On 28/07/2017 18:17, Måns Rullgård wrote:
>
>> Marc Gonzalez wrote:
>> 
>>> ndo_stop breaks RX in a way that ndo_open is unable to undo.
>> 
>> Please elaborate.  Why can't it be fixed in a less heavy-handed way?
>
> I'm not sure what "elaborate" means. After we've been through
> ndo_stop once, the board can send packets, but it doesn't see
> any replies from remote systems. RX is wedged.

So you say, but you have not explained why this happens.  Until we know
why, we can't decide on the proper fix.

> I think ndo_stop is rare enough an event that doing a full
> reset is not an issue, in terms of performance.

Performance isn't the issue.  Doing the right thing is.

> Also I will need this infrastructure anyway for suspend/resume
> support.

> It might also make sense to put the HW in reset at close
> (to save power). I will try measuring the power savings,
> if any.
>
>>> Work around the issue by resetting the HW in ndo_open.
>>> This will provide the basis for suspend/resume support.
>>>
>>> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
>>> ---
>>>  drivers/net/ethernet/aurora/nb8800.c | 40 +++++++++++++++++-------------------
>>>  drivers/net/ethernet/aurora/nb8800.h |  1 +
>>>  2 files changed, 20 insertions(+), 21 deletions(-)
>> 
>> I'm pretty sure this doesn't preserve everything it should.
>
> Hmmm, we're supposed to start fresh ("full reset").
> What could there be to preserve?
> You mentioned flow control and multicast elsewhere.
> I will take a closer look. Thanks for the heads up.

Yes, those settings are definitely lost with your patch.  Now I'm not
sure whether the networking core expects these to survive a stop/start
cycle, so please check that.  There might also be other less obvious
things that need to be preserved.

Mason July 28, 2017, 9:53 p.m. UTC | #4

On 28/07/2017 20:56, Måns Rullgård wrote:

> Marc Gonzalez writes:
> 
>> On 28/07/2017 18:17, Måns Rullgård wrote:
>>
>>> Marc Gonzalez wrote:
>>>
>>>> ndo_stop breaks RX in a way that ndo_open is unable to undo.
>>>
>>> Please elaborate.  Why can't it be fixed in a less heavy-handed way?
>>
>> I'm not sure what "elaborate" means. After we've been through
>> ndo_stop once, the board can send packets, but it doesn't see
>> any replies from remote systems. RX is wedged.
> 
> So you say, but you have not explained why this happens.  Until we know
> why, we can't decide on the proper fix.

I'll try adding delays in strategic places, and see if
I can trigger the same bug on tango4. If I can't, then
this work around is all we've got.

And I need nb8800_init for resume anyway, so I might
as well use it in ndo_open.

TODO: test power savings from holding HW in reset.

>> I think ndo_stop is rare enough an event that doing a full
>> reset is not an issue, in terms of performance.
> 
> Performance isn't the issue.  Doing the right thing is.

I don't have always have time to figure out exactly how
broken HW is broken. It's already bad enough that disabling
DMA requires sending a fake packet through the loop back...

>>> I'm pretty sure this doesn't preserve everything it should.
>>
>> Hmmm, we're supposed to start fresh ("full reset").
>> What could there be to preserve?
>> You mentioned flow control and multicast elsewhere.
>> I will take a closer look. Thanks for the heads up.
> 
> Yes, those settings are definitely lost with your patch.  Now I'm not
> sure whether the networking core expects these to survive a stop/start
> cycle, so please check that.  There might also be other less obvious
> things that need to be preserved.

The original code calls nb8800_pause_config() every
time the link comes up. The proposed patch doesn't
change that.

Regards.

Måns Rullgård July 29, 2017, 11:24 a.m. UTC | #5

Mason <slash.tmp@free.fr> writes:

> On 28/07/2017 20:56, Måns Rullgård wrote:
>
>> Marc Gonzalez writes:
>> 
>>> On 28/07/2017 18:17, Måns Rullgård wrote:
>>>
>>>> Marc Gonzalez wrote:
>>>>
>>>>> ndo_stop breaks RX in a way that ndo_open is unable to undo.
>>>>
>>>> Please elaborate.  Why can't it be fixed in a less heavy-handed way?
>>>
>>> I'm not sure what "elaborate" means. After we've been through
>>> ndo_stop once, the board can send packets, but it doesn't see
>>> any replies from remote systems. RX is wedged.
>> 
>> So you say, but you have not explained why this happens.  Until we know
>> why, we can't decide on the proper fix.
>
> I'll try adding delays in strategic places, and see if
> I can trigger the same bug on tango4. If I can't, then
> this work around is all we've got.
>
> And I need nb8800_init for resume anyway, so I might
> as well use it in ndo_open.
>
> TODO: test power savings from holding HW in reset.
>
>>> I think ndo_stop is rare enough an event that doing a full
>>> reset is not an issue, in terms of performance.
>> 
>> Performance isn't the issue.  Doing the right thing is.
>
> I don't have always have time to figure out exactly how
> broken HW is broken. It's already bad enough that disabling
> DMA requires sending a fake packet through the loop back...

Until you figure out why it's getting stuck, we can't be sure it isn't
caused by something that could trigger at any time.

>>>> I'm pretty sure this doesn't preserve everything it should.
>>>
>>> Hmmm, we're supposed to start fresh ("full reset").
>>> What could there be to preserve?
>>> You mentioned flow control and multicast elsewhere.
>>> I will take a closer look. Thanks for the heads up.
>> 
>> Yes, those settings are definitely lost with your patch.  Now I'm not
>> sure whether the networking core expects these to survive a stop/start
>> cycle, so please check that.  There might also be other less obvious
>> things that need to be preserved.
>
> The original code calls nb8800_pause_config() every
> time the link comes up. The proposed patch doesn't
> change that.

Yes, but by then you've reset those parameters to the defaults.

Mason July 29, 2017, 12:02 p.m. UTC | #6

On 29/07/2017 13:24, Måns Rullgård wrote:

> Until you figure out why it's getting stuck, we can't be sure
> it isn't caused by something that could trigger at any time.
Would you take a look at it, if I can reproduce on tango4?

I have identified a 100% reproducible flaw.
I have proposed a work-around that brings this down to 0
(tested 1000 cycles of link up / ping / link down).

In my opinion, upstream should consider this work-around
for inclusion. I'd like to hear David's and Florian's
opinion on the topic. It's always a pain to maintain
out-of-tree patches.

> Yes, but by then you've reset those parameters to the defaults.

Good catch. There is some non HW-related init in
nb8800_hw_init().

I'll take this opportunity to change flow control to
off by default (it breaks several 100 Mbps switches).
I'll take a closer look at priv assignments on Monday.

Regards.

Måns Rullgård July 29, 2017, 12:05 p.m. UTC | #7

Mason <slash.tmp@free.fr> writes:

> On 29/07/2017 13:24, Måns Rullgård wrote:
>
>> Until you figure out why it's getting stuck, we can't be sure
>> it isn't caused by something that could trigger at any time.
> Would you take a look at it, if I can reproduce on tango4?
>
> I have identified a 100% reproducible flaw.
> I have proposed a work-around that brings this down to 0
> (tested 1000 cycles of link up / ping / link down).
>
> In my opinion, upstream should consider this work-around
> for inclusion. I'd like to hear David's and Florian's
> opinion on the topic. It's always a pain to maintain
> out-of-tree patches.

I'm not saying it shouldn't be fixed.  I am saying we should make sure
we make the right fix, not just paper over one instance of a wider issue.

>> Yes, but by then you've reset those parameters to the defaults.
>
> Good catch. There is some non HW-related init in
> nb8800_hw_init().
>
> I'll take this opportunity to change flow control to
> off by default (it breaks several 100 Mbps switches).

I was told to have it on by default.  This is what most other drivers do
too.  If you have faulty switches, that's your problem.

Mason July 29, 2017, 12:44 p.m. UTC | #8

On 29/07/2017 14:05, Måns Rullgård wrote:

> Mason writes:
> 
>> I'll take this opportunity to change flow control to
>> off by default (it breaks several 100 Mbps switches).
> 
> I was told to have it on by default.  This is what most other drivers
> do too.  If you have faulty switches, that's your problem.

Or maybe the fault is in the HW implementation, and
it is the board's fault that the connection is hosed.

How many switches did you test the driver on?

We tested 4 switches, and DHCP failed on 3 of them.
Disabling pause frames "fixed" that.

I could spend weeks testing dozens of switches, but
I have to prioritize. Ethernet flow control is simply
not an important feature in the market the SoC is
intended for.

Regards.

Måns Rullgård July 29, 2017, 12:51 p.m. UTC | #9

Mason <slash.tmp@free.fr> writes:

> On 29/07/2017 14:05, Måns Rullgård wrote:
>
>> Mason writes:
>> 
>>> I'll take this opportunity to change flow control to
>>> off by default (it breaks several 100 Mbps switches).
>> 
>> I was told to have it on by default.  This is what most other drivers
>> do too.  If you have faulty switches, that's your problem.
>
> Or maybe the fault is in the HW implementation, and
> it is the board's fault that the connection is hosed.
>
> How many switches did you test the driver on?
>
> We tested 4 switches, and DHCP failed on 3 of them.
> Disabling pause frames "fixed" that.

Do those switches work with other NICs?

> I could spend weeks testing dozens of switches, but
> I have to prioritize. Ethernet flow control is simply
> not an important feature in the market the SoC is
> intended for.

Sure, flow control is rarely needed with well behaved systems.

Florian Fainelli July 29, 2017, 3:18 p.m. UTC | #10

On 07/29/2017 05:02 AM, Mason wrote:
> On 29/07/2017 13:24, Måns Rullgård wrote:
> 
>> Until you figure out why it's getting stuck, we can't be sure
>> it isn't caused by something that could trigger at any time.
> Would you take a look at it, if I can reproduce on tango4?
> 
> I have identified a 100% reproducible flaw.
> I have proposed a work-around that brings this down to 0
> (tested 1000 cycles of link up / ping / link down).

Can you also try to get help from your HW resources to eventually help
you find out what is going on here? If anybody has access to that it
would be you.

> 
> In my opinion, upstream should consider this work-around
> for inclusion. I'd like to hear David's and Florian's
> opinion on the topic. It's always a pain to maintain
> out-of-tree patches.

I have to agree with Mans here that the commit message explanation is
not good enough to understand how the RX path is hosed after a call to
ndo_stop() it would be good, both for you and for the people maintaining
this driver to understand what happens exactly so the fix is correct,
understood and maintainable. The patch itself looks reasonable with the
limited description given, but it's the description itself that needs
changing. BTW, this should probably come with a Fixes: tag to identify
which commit (presumably the one introducing the driver) is being fixed.

Florian Fainelli July 29, 2017, 8:15 p.m. UTC | #11

On 07/29/2017 05:44 AM, Mason wrote:
> On 29/07/2017 14:05, Måns Rullgård wrote:
> 
>> Mason writes:
>>
>>> I'll take this opportunity to change flow control to
>>> off by default (it breaks several 100 Mbps switches).
>>
>> I was told to have it on by default.  This is what most other drivers
>> do too.  If you have faulty switches, that's your problem.
> 
> Or maybe the fault is in the HW implementation, and
> it is the board's fault that the connection is hosed.

If there really is a linkage with pause frames then it's a pretty binary
thing at the electrical interface level, either the (RG)MII connection
works, or or it does not, but pause frames are normal frames as far as
the electrical interface is concerned. Their special handling is at the
MAC level, see below.

> 
> How many switches did you test the driver on?
> 
> We tested 4 switches, and DHCP failed on 3 of them.
> Disabling pause frames "fixed" that.

OK, so it is this problem that you reported about before? Pause frames
are tricky in that receiving pause frames means you should backpressure
your transmitter and sending pause frames happens when your receiver
cannot keep up. It is somewhat conceivable that your HW implementation
is bogus and that you can get the HW in a state where it gets
permanently backpressured for instance? And then only a full re-init
would get you out of this stuck state presumably? Are there significant
differences at the DMA/Ethernet controller level between Tango 3 (is
that the one Mans worked on?) and Tango 4 for instance that could
explain a behavioral difference?

Some Ethernet NICs allow you to actually observe pause frames (AFAIR a
few Intel ones do) so you could conceivably set up a bridge with such a
NIC and snoop traffic between your tango4 board and your switch and see
what happens.

> 
> I could spend weeks testing dozens of switches, but
> I have to prioritize. Ethernet flow control is simply
> not an important feature in the market the SoC is
> intended for.

Mason July 29, 2017, 10:48 p.m. UTC | #12

On 29/07/2017 22:15, Florian Fainelli wrote:

> On 07/29/2017 05:44 AM, Mason wrote:
>
>> We tested 4 switches, and DHCP failed on 3 of them.
>> Disabling pause frames "fixed" that.
> 
> OK, so it is this problem that you reported about before?

The "Ethernet flow control / pause frames" issue
is separate from the "link down wedges RX" issue.

We discussed the former back in November 2016:

https://www.mail-archive.com/netdev@vger.kernel.org/msg137094.html
https://patchwork.ozlabs.org/patch/694577/

Wait a second... I see that you and Mans had the
following exchange:

https://www.mail-archive.com/netdev@vger.kernel.org/msg138175.html

Mans mentions disabling DMA to be able to change
the flow control bits. The current theory is that
it is disabling DMA in ndo_stop that wedges RX.

So maybe the two issues are related after all...

I hate all these hardware quirks. Why can't HW
engineers make stuff that "just works"...

> Pause frames are tricky in that receiving pause frames means you
> should backpressure your transmitter and sending pause frames happens
> when your receiver cannot keep up. It is somewhat conceivable that
> your HW implementation is bogus and that you can get the HW in a
> state where it gets permanently backpressured for instance? And then
> only a full re-init would get you out of this stuck state presumably?
> Are there significant differences at the DMA/Ethernet controller
> level between Tango 3 (is that the one Mans worked on?) and Tango 4
> for instance that could explain a behavioral difference?

I'll have to take a look at the issue in light of
the new information. FWIW, Mans has tango3&4 boards.
I work on newer boards. The HW dev *swears* there
have been no functional differences in the eth block
"forever". However, bus accesses are faster in recent
chips, which could change who wins specific races.

Regards.

Mason July 31, 2017, 11:49 a.m. UTC | #13

On 29/07/2017 17:18, Florian Fainelli wrote:

> On 07/29/2017 05:02 AM, Mason wrote:
>
>> I have identified a 100% reproducible flaw.
>> I have proposed a work-around that brings this down to 0
>> (tested 1000 cycles of link up / ping / link down).
> 
> Can you also try to get help from your HW resources to eventually help
> you find out what is going on here?

The patch I proposed /is/ based on the feedback from the HW team :-(
"Just reset the HW block, and everything will work as expected."

>> In my opinion, upstream should consider this work-around
>> for inclusion. I'd like to hear David's and Florian's
>> opinion on the topic. It's always a pain to maintain
>> out-of-tree patches.
> 
> I have to agree with Mans here that the commit message explanation is
> not good enough to understand how the RX path is hosed after a call to
> ndo_stop() it would be good, both for you and for the people maintaining
> this driver to understand what happens exactly so the fix is correct,
> understood and maintainable. The patch itself looks reasonable with the
> limited description given, but it's the description itself that needs
> changing.

I have logged all register reads/writes occurring while
nb8800_stop() is executing.



1) BOARD A -- EVERYTHING WORKS AS EXPECTED

# test_eth.sh 
[   13.293669] nb8800 26000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
172.27.64.1 : xmt/rcv/%loss = 1/1/0%, min/avg/max = 0.41/0.41/0.41
[   15.874627] nb8800_stop from __dev_close_many
[   15.879044] ++ETH++ gw32 reg=f0026020 val=00920000
[   15.883900] ++ETH++ gw32 reg=f0026020 val=80920000
[   15.888969] ++ETH++ gr32 reg=f0026024 val=0000ec00
[   15.893809] ++ETH++ gw32 reg=f0026020 val=04920000
[   15.898697] ++ETH++ gw32 reg=f0026020 val=84920000
[   15.903582] ++ETH++ gw32 reg=f0026020 val=00930000
[   15.908423] ++ETH++ gw32 reg=f0026020 val=80930000
[   15.913272] ++ETH++ gr32 reg=f0026024 val=00000000
[   15.918160] ++ETH++ gr8  reg=f0026004 val=2b
[   15.922459] ++ETH++ gw8  reg=f0026004 val=0b
[   15.926782] ++ETH++ gr8  reg=f0026044 val=81
[   15.931123] ++ETH++ gw8  reg=f0026044 val=85
[   15.935457] ++ETH++ gw32 reg=f002610c val=9de74000
[   15.940317] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   15.945187] ENTER nb8800_irq
[   15.948077] ++ETH++ gr32 reg=f0026104 val=00000004
[   15.952887] ++ETH++ gw32 reg=f0026104 val=00000004
[   15.957697] ++ETH++ gr32 reg=f0026204 val=00000004
[   15.962507] ++ETH++ gw32 reg=f0026204 val=00000004
[   15.967316] ++ETH++ gw32 reg=f0026218 val=003cc4a4
[   15.972149] ENTER nb8800_irq
[   15.975039] ++ETH++ gr32 reg=f0026104 val=00000001
[   15.979848] ++ETH++ gw32 reg=f0026104 val=00000001
[   15.984658] ++ETH++ gr32 reg=f0026204 val=00000000
[   16.045509] ++ETH++ gw32 reg=f002610c val=9de74000
[   16.050329] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   16.055150] ENTER nb8800_irq
[   16.058042] ++ETH++ gr32 reg=f0026104 val=00000004
[   16.062852] ++ETH++ gw32 reg=f0026104 val=00000004
[   16.067662] ++ETH++ gr32 reg=f0026204 val=00000004
[   16.072470] ++ETH++ gw32 reg=f0026204 val=00000004
[   16.077279] ++ETH++ gw32 reg=f0026218 val=003cc4a4
[   16.082100] ENTER nb8800_irq
[   16.084993] ++ETH++ gr32 reg=f0026104 val=00000001
[   16.089802] ++ETH++ gw32 reg=f0026104 val=00000001
[   16.094611] ++ETH++ gr32 reg=f0026204 val=00000000
[   16.099454] ++ETH++ gr8  reg=f0026004 val=0b
[   16.103752] ++ETH++ gw8  reg=f0026004 val=2b
[   16.108057] ++ETH++ gr8  reg=f0026044 val=85
[   16.112353] ++ETH++ gw8  reg=f0026044 val=81
[   16.116699] ++ETH++ gw32 reg=f002620c val=9f7b2000
[   16.121528] ++ETH++ gr8  reg=f0026004 val=2b
[   16.125827] ++ETH++ gw8  reg=f0026004 val=2a
[   16.130126] ++ETH++ gr32 reg=f0026100 val=00080afe
[   16.134945] ++ETH++ gr8  reg=f0026000 val=1d
[   16.139238] ++ETH++ gw8  reg=f0026000 val=1c
[   16.143534] ++ETH++ gw32 reg=f0026020 val=00920000
[   16.148363] ++ETH++ gw32 reg=f0026020 val=80920000
[   16.153209] ++ETH++ gr32 reg=f0026024 val=00000000
[   16.158027] ++ETH++ gw32 reg=f0026020 val=04920000
[   16.162856] ++ETH++ gw32 reg=f0026020 val=84920000
[   16.167702] ++ETH++ gw32 reg=f0026020 val=00930000
[   16.172531] ++ETH++ gw32 reg=f0026020 val=80930000
[   16.177377] ++ETH++ gr32 reg=f0026024 val=00000000
[   16.182338] nb8800 26000.ethernet eth0: Link is Down
[   16.187361] ++ETH++ gw32 reg=f0026020 val=00920000
[   16.192194] ++ETH++ gw32 reg=f0026020 val=80920000
[   16.197052] ++ETH++ gr32 reg=f0026024 val=00000000
[   16.201887] ++ETH++ gw32 reg=f0026020 val=00920000
[   16.206717] ++ETH++ gw32 reg=f0026020 val=80920000
[   16.211575] ++ETH++ gr32 reg=f0026024 val=00000000
[   16.216394] ++ETH++ gw32 reg=f0026020 val=00800000
[   16.221235] ++ETH++ gw32 reg=f0026020 val=80800000
[   16.226084] ++ETH++ gr32 reg=f0026024 val=00001000
[   16.230913] ++ETH++ gw32 reg=f0026020 val=04801800
[   16.235742] ++ETH++ gw32 reg=f0026020 val=84801800
[   16.240620] ++ETH++ gw32 reg=f0026020 val=00920000
[   16.245451] ++ETH++ gw32 reg=f0026020 val=80920000
[   16.250310] ++ETH++ gr32 reg=f0026024 val=00000000
[   16.255134] ++ETH++ gw32 reg=f0026020 val=00920000
[   16.259964] ++ETH++ gw32 reg=f0026020 val=80920000
[   16.264821] ++ETH++ gr32 reg=f0026024 val=00000000
[   16.269642] ++ETH++ gw32 reg=f0026020 val=00800000
[   16.274470] ++ETH++ gw32 reg=f0026020 val=80800000
[   16.279316] ++ETH++ gr32 reg=f0026024 val=00001800
[   16.284134] ++ETH++ gw32 reg=f0026020 val=04801800
[   16.288963] ++ETH++ gw32 reg=f0026020 val=84801800
[   16.293872] EXIT nb8800_stop
[   20.087916] nb8800 26000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
172.27.64.1 : xmt/rcv/%loss = 1/1/0%, min/avg/max = 0.27/0.27/0.27



1) BOARD B -- RX WEDGED AFTER nb8800_stop

# test_eth.sh 
[   26.369255] nb8800 26000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
172.27.64.1 : xmt/rcv/%loss = 1/1/0%, min/avg/max = 0.34/0.34/0.34
[   28.907583] nb8800_stop from __dev_close_many
[   28.911997] ++ETH++ gw32 reg=f0026020 val=00920000
[   28.916856] ++ETH++ gw32 reg=f0026020 val=80920000
[   28.921732] ++ETH++ gr32 reg=f0026024 val=0000ec00
[   28.926565] ++ETH++ gw32 reg=f0026020 val=04920000
[   28.931422] ++ETH++ gw32 reg=f0026020 val=84920000
[   28.936285] ++ETH++ gw32 reg=f0026020 val=00930000
[   28.941134] ++ETH++ gw32 reg=f0026020 val=80930000
[   28.945993] ++ETH++ gr32 reg=f0026024 val=00000000
[   28.950857] ++ETH++ gr8  reg=f0026004 val=2b
[   28.955161] ++ETH++ gw8  reg=f0026004 val=0b
[   28.959463] ++ETH++ gr8  reg=f0026044 val=81
[   28.963767] ++ETH++ gw8  reg=f0026044 val=85
[   28.968067] ++ETH++ gw32 reg=f002610c val=9eed8000
[   28.972896] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   28.977731] ENTER nb8800_irq
[   28.980632] ++ETH++ gr32 reg=f0026104 val=00000004
[   28.985450] ++ETH++ gw32 reg=f0026104 val=00000004
[   28.990268] ++ETH++ gr32 reg=f0026204 val=00000004
[   28.995085] ++ETH++ gw32 reg=f0026204 val=00000004
[   28.999903] ++ETH++ gw32 reg=f0026218 val=003cc4a4
[   29.004730] ENTER nb8800_irq
[   29.007625] ++ETH++ gr32 reg=f0026104 val=00000001
[   29.012442] ++ETH++ gw32 reg=f0026104 val=00000001
[   29.017259] ++ETH++ gr32 reg=f0026204 val=00000000
[   29.077759] ++ETH++ gw32 reg=f002610c val=9eed8000
[   29.082590] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   29.087422] ENTER nb8800_irq
[   29.090322] ++ETH++ gr32 reg=f0026104 val=00000004
[   29.095140] ++ETH++ gw32 reg=f0026104 val=00000004
[   29.099958] ++ETH++ gr32 reg=f0026204 val=00000004
[   29.104774] ++ETH++ gw32 reg=f0026204 val=00000004
[   29.109591] ++ETH++ gw32 reg=f0026218 val=003cc4a4
[   29.114415] ENTER nb8800_irq
[   29.117311] ++ETH++ gr32 reg=f0026104 val=00000001
[   29.122127] ++ETH++ gw32 reg=f0026104 val=00000001
[   29.126944] ++ETH++ gr32 reg=f0026204 val=00000000
[   29.187460] ++ETH++ gw32 reg=f002610c val=9eed8000
[   29.192291] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   29.197123] ENTER nb8800_irq
[   29.198119] ++ETH++ gr8  reg=f0026004 val=0b
[   29.198121] ++ETH++ gw8  reg=f0026004 val=2b
[   29.198123] ++ETH++ gr8  reg=f0026044 val=85
[   29.198125] ++ETH++ gw8  reg=f0026044 val=81
[   29.198139] ++ETH++ gw32 reg=f002620c val=9d818000
[   29.198141] ++ETH++ gr8  reg=f0026004 val=2b
[   29.198143] ++ETH++ gw8  reg=f0026004 val=2a
[   29.198145] ++ETH++ gr32 reg=f0026100 val=00080afe
[   29.198147] ++ETH++ gr8  reg=f0026000 val=1d
[   29.198148] ++ETH++ gw8  reg=f0026000 val=1c
[   29.198151] ++ETH++ gw32 reg=f0026020 val=00920000
[   29.198163] ++ETH++ gw32 reg=f0026020 val=80920000
[   29.198193] ++ETH++ gr32 reg=f0026024 val=00000000
[   29.198195] ++ETH++ gw32 reg=f0026020 val=04920000
[   29.198207] ++ETH++ gw32 reg=f0026020 val=84920000
[   29.198237] ++ETH++ gw32 reg=f0026020 val=00930000
[   29.198249] ++ETH++ gw32 reg=f0026020 val=80930000
[   29.198279] ++ETH++ gr32 reg=f0026024 val=00000000
[   29.282484] ++ETH++ gr32 reg=f0026104 val=00000005
[   29.287301] ++ETH++ gw32 reg=f0026104 val=00000005
[   29.292118] ++ETH++ gr32 reg=f0026204 val=00000004
[   29.296935] ++ETH++ gw32 reg=f0026204 val=00000004
[   29.301752] ++ETH++ gw32 reg=f0026218 val=003cc4a4
[   29.306640] nb8800 26000.ethernet eth0: Link is Down
[   29.311664] ++ETH++ gw32 reg=f0026020 val=00920000
[   29.316508] ++ETH++ gw32 reg=f0026020 val=80920000
[   29.321363] ++ETH++ gr32 reg=f0026024 val=00000000
[   29.326193] ++ETH++ gw32 reg=f0026020 val=00920000
[   29.331031] ++ETH++ gw32 reg=f0026020 val=80920000
[   29.335885] ++ETH++ gr32 reg=f0026024 val=00000000
[   29.340712] ++ETH++ gw32 reg=f0026020 val=00800000
[   29.345550] ++ETH++ gw32 reg=f0026020 val=80800000
[   29.350405] ++ETH++ gr32 reg=f0026024 val=00001000
[   29.355234] ++ETH++ gw32 reg=f0026020 val=04801800
[   29.360069] ++ETH++ gw32 reg=f0026020 val=84801800
[   29.364940] ++ETH++ gw32 reg=f0026020 val=00920000
[   29.369777] ++ETH++ gw32 reg=f0026020 val=80920000
[   29.374635] ++ETH++ gr32 reg=f0026024 val=00000000
[   29.379462] ++ETH++ gw32 reg=f0026020 val=00920000
[   29.384300] ++ETH++ gw32 reg=f0026020 val=80920000
[   29.389153] ++ETH++ gr32 reg=f0026024 val=00000000
[   29.393982] ++ETH++ gw32 reg=f0026020 val=00800000
[   29.398817] ++ETH++ gw32 reg=f0026020 val=80800000
[   29.403674] ++ETH++ gr32 reg=f0026024 val=00001800
[   29.408499] ++ETH++ gw32 reg=f0026020 val=04801800
[   29.413337] ++ETH++ gw32 reg=f0026020 val=84801800
[   29.418245] EXIT nb8800_stop
[   33.644357] nb8800 26000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
172.27.64.1 : xmt/rcv/%loss = 1/0/100%


There are only small differences between these two logs.

1) Different TRANSMIT_DESCRIPTOR_ADDRESS (0x2610c)
=> Not unexpected

2) On BOARD B, an additional
[   29.187460] ++ETH++ gw32 reg=f002610c val=9eed8000
[   29.192291] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   29.197123] ENTER nb8800_irq
=> Is it possible for the ISR to be running simultaneously on two cores?
0x26100 = TRANSMIT_CHANNEL_CONTROL

3) Different RECEIVE_DESCRIPTOR_ADDRESS (0x2620c)
=> Not unexpected

4) ON BOARD B, an additional
[   29.282484] ++ETH++ gr32 reg=f0026104 val=00000005
[   29.287301] ++ETH++ gw32 reg=f0026104 val=00000005
[   29.292118] ++ETH++ gr32 reg=f0026204 val=00000004
[   29.296935] ++ETH++ gw32 reg=f0026204 val=00000004
[   29.301752] ++ETH++ gw32 reg=f0026218 val=003cc4a4
0x26104 = TRANSMIT_CHANNEL_STATUS
0x26204 = RECEIVE_CHANNEL_STATUS
0x26218 = RECEIVE_INTERRUPT_TIME
=> BOARD A didn't have to deal with two TX interrupts at the same time,
though it did deal with 1 and 4 separately.




I need to look if some of these register accesses are
racing on different cores.

Regards.

Måns Rullgård July 31, 2017, 11:59 a.m. UTC | #14

Mason <slash.tmp@free.fr> writes:

> On 29/07/2017 17:18, Florian Fainelli wrote:
>
>> On 07/29/2017 05:02 AM, Mason wrote:
>>
>>> I have identified a 100% reproducible flaw.
>>> I have proposed a work-around that brings this down to 0
>>> (tested 1000 cycles of link up / ping / link down).
>> 
>> Can you also try to get help from your HW resources to eventually help
>> you find out what is going on here?
>
> The patch I proposed /is/ based on the feedback from the HW team :-(
> "Just reset the HW block, and everything will work as expected."

Nobody is saying a reset won't recover the lockup.  The problem is that
we don't know what caused it to lock up in the first place.  How do we
know it can't happen during normal operation?  If we knew the cause, it
might also be possible to avoid the situation entirely.

Mason July 31, 2017, 2:08 p.m. UTC | #15

On 31/07/2017 13:59, Måns Rullgård wrote:

> Mason writes:
> 
>> On 29/07/2017 17:18, Florian Fainelli wrote:
>>
>>> On 07/29/2017 05:02 AM, Mason wrote:
>>>
>>>> I have identified a 100% reproducible flaw.
>>>> I have proposed a work-around that brings this down to 0
>>>> (tested 1000 cycles of link up / ping / link down).
>>>
>>> Can you also try to get help from your HW resources to eventually help
>>> you find out what is going on here?
>>
>> The patch I proposed /is/ based on the feedback from the HW team :-(
>> "Just reset the HW block, and everything will work as expected."
> 
> Nobody is saying a reset won't recover the lockup.  The problem is that
> we don't know what caused it to lock up in the first place.  How do we
> know it can't happen during normal operation?  If we knew the cause, it
> might also be possible to avoid the situation entirely.

How does one prove that something "can't happen during normal operation"?

The "put adapter in loop-back mode so we can send ourselves fake packets"
shenanigans seems completely insane, if you ask me.

Other things make no sense to me, for example in nb8800_dma_stop()
there is a polling loop:

	do {
		mdelay(100);
		nb8800_writel(priv, NB8800_TX_DESC_ADDR, txb->dma_desc);
		wmb();
		mdelay(100);
		nb8800_writel(priv, NB8800_TXC_CR, txcr | TCR_EN);

		mdelay(5500);

		err = readl_poll_timeout_atomic(priv->base + NB8800_RXC_CR,
						rxcr, !(rxcr & RCR_EN),
						1000, 100000);
		printk("err=%d retry=%d\n", err, retry);
	} while (err && --retry);


(It was me who added the delays.)

*Whatever* delays I insert, it always goes 3 times through the loop.

[   29.654492] ++ETH++ gw32 reg=f002610c val=9ecc8000
[   29.759320] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   35.364705] err=-110 retry=5
[   35.467609] ++ETH++ gw32 reg=f002610c val=9ecc8000
[   35.572436] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   41.177822] err=-110 retry=4
[   41.280726] ++ETH++ gw32 reg=f002610c val=9ecc8000
[   41.385553] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   46.890907] err=0 retry=3

How is that possible?

I've tried using spinlocks and delays to get parallel execution
down to a minimum, and have the same logs on both boards.

Regards.

Mason July 31, 2017, 3:18 p.m. UTC | #16

On 31/07/2017 16:08, Mason wrote:

> Other things make no sense to me, for example in nb8800_dma_stop()
> there is a polling loop:
> 
> 	do {
> 		mdelay(100);
> 		nb8800_writel(priv, NB8800_TX_DESC_ADDR, txb->dma_desc);
> 		wmb();
> 		mdelay(100);
> 		nb8800_writel(priv, NB8800_TXC_CR, txcr | TCR_EN);
> 
> 		mdelay(5500);
> 
> 		err = readl_poll_timeout_atomic(priv->base + NB8800_RXC_CR,
> 						rxcr, !(rxcr & RCR_EN),
> 						1000, 100000);
> 		printk("err=%d retry=%d\n", err, retry);
> 	} while (err && --retry);
> 
> 
> (It was me who added the delays.)
> 
> *Whatever* delays I insert, it always goes 3 times through the loop.
> 
> [   29.654492] ++ETH++ gw32 reg=f002610c val=9ecc8000
> [   29.759320] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   35.364705] err=-110 retry=5
> [   35.467609] ++ETH++ gw32 reg=f002610c val=9ecc8000
> [   35.572436] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   41.177822] err=-110 retry=4
> [   41.280726] ++ETH++ gw32 reg=f002610c val=9ecc8000
> [   41.385553] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   46.890907] err=0 retry=3
> 
> How is that possible?

First time through the loop, it doesn't matter how long we poll,
it *always* times out. Second time as well (only on BOARD B).

Third time, it succeeds quickly (first or second poll).
(This explains why various delays had no impact.)

In fact, requesting the transfer 3 times *before* polling
makes the polling succeed quickly:

	nb8800_writel(priv, NB8800_TX_DESC_ADDR, txb->dma_desc);
	wmb();
	nb8800_writel(priv, NB8800_TXC_CR, txcr | TCR_EN);

[   16.464596] ++ETH++ gw32 reg=f002610c val=9ef28000
[   16.469414] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   16.474231] ++ETH++ gw32 reg=f002610c val=9ef28000
[   16.479048] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   16.483865] ++ETH++ gw32 reg=f002610c val=9ef28000
[   16.488682] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   16.493500] ++ETH++ POLL reg=f0026200 val=06100a8f
[   16.499317] ++ETH++ POLL reg=f0026200 val=06100a8e
[   16.504134] err=0 retry=5


With my changes, I get *exactly* the same logs on BOARD A
and BOARD B (modulo the descriptors addresses).

Yet BOARD A stays functional, but BOARD B is hosed...

Depressing. I've run out of ideas.


BOARD A LOGS:

# test_eth.sh 
[   18.037782] nb8800 26000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
172.27.64.1 : xmt/rcv/%loss = 1/1/0%, min/avg/max = 0.39/0.39/0.39
[   20.617917] nb8800_stop from __dev_close_many
[   20.622314] ++ETH++ gw32 reg=f0026020 val=00920000
[   20.627135] ++ETH++ gw32 reg=f0026020 val=80920000
[   20.631973] ++ETH++ gr32 reg=f0026024 val=0000ec00
[   20.636782] ++ETH++ gw32 reg=f0026020 val=04920000
[   20.641601] ++ETH++ gw32 reg=f0026020 val=84920000
[   20.646440] ++ETH++ gw32 reg=f0026020 val=00930000
[   20.651258] ++ETH++ gw32 reg=f0026020 val=80930000
[   20.656095] ++ETH++ gr32 reg=f0026024 val=00000000
[   20.660909] ++ETH++ POLL reg=f0026100 val=005c0afe
[   20.665743] ++ETH++ gr8  reg=f0026004 val=2b
[   20.670028] ++ETH++ gw8  reg=f0026004 val=0b
[   20.674313] ++ETH++ gr8  reg=f0026044 val=81
[   20.678598] ++ETH++ gw8  reg=f0026044 val=85
[   20.682883] ++ETH++ gw32 reg=f002610c val=9de0c000
[   20.687693] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   20.692503] ++ETH++ gw32 reg=f002610c val=9de0c000
[   20.697312] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   20.702121] ++ETH++ gw32 reg=f002610c val=9de0c000
[   20.706929] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   20.712738] ++ETH++ POLL reg=f0026200 val=06100a8e
[   20.717547] err=0 retry=5
[   20.720172] ++ETH++ gr8  reg=f0026004 val=0b
[   20.724456] ++ETH++ gw8  reg=f0026004 val=2b
[   20.728742] ++ETH++ gr8  reg=f0026044 val=85
[   20.733026] ++ETH++ gw8  reg=f0026044 val=81
[   20.737349] ++ETH++ gw32 reg=f002620c val=9de04000
[   20.742158] nb8800_dma_stop=0
[   21.845224] ENTER nb8800_irq
[   21.848116] ++ETH++ gr32 reg=f0026104 val=00000005
[   21.852927] ++ETH++ gw32 reg=f0026104 val=00000005
[   21.857738] ++ETH++ gr32 reg=f0026204 val=00000004
[   21.862547] ++ETH++ gw32 reg=f0026204 val=00000004
[   21.867356] ++ETH++ gw32 reg=f0026218 val=003cc4a4
[   21.872164]  EXIT nb8800_irq
[   24.373448] ++ETH++ gr8  reg=f0026004 val=2b
[   24.377755] ++ETH++ gw8  reg=f0026004 val=2a
[   24.382054] ++ETH++ gr32 reg=f0026100 val=00080afe
[   24.386876] ++ETH++ gr8  reg=f0026000 val=1d
[   24.391192] ++ETH++ gw8  reg=f0026000 val=1c
[   25.706747] ++ETH++ gw32 reg=f0026020 val=00920000
[   25.711578] ++ETH++ gw32 reg=f0026020 val=80920000
[   25.716427] ++ETH++ gr32 reg=f0026024 val=00000000
[   25.721248] ++ETH++ gw32 reg=f0026020 val=04920000
[   25.726091] ++ETH++ gw32 reg=f0026020 val=84920000
[   25.730942] ++ETH++ gw32 reg=f0026020 val=00930000
[   25.735782] ++ETH++ gw32 reg=f0026020 val=80930000
[   25.740631] ++ETH++ gr32 reg=f0026024 val=00000000
[   25.745604] nb8800 26000.ethernet eth0: Link is Down
[   25.750617] ++ETH++ gw32 reg=f0026020 val=00920000
[   25.755464] ++ETH++ gw32 reg=f0026020 val=80920000
[   25.760377] ++ETH++ gr32 reg=f0026024 val=00000000
[   25.765205] ++ETH++ gw32 reg=f0026020 val=00920000
[   25.770085] ++ETH++ gw32 reg=f0026020 val=80920000
[   25.774962] ++ETH++ gr32 reg=f0026024 val=00000000
[   25.779784] ++ETH++ gw32 reg=f0026020 val=00800000
[   25.784614] ++ETH++ gw32 reg=f0026020 val=80800000
[   25.789510] ++ETH++ gr32 reg=f0026024 val=00001000
[   25.794333] ++ETH++ gw32 reg=f0026020 val=04801800
[   25.799199] ++ETH++ gw32 reg=f0026020 val=84801800
[   25.804081] ++ETH++ gw32 reg=f0026020 val=00920000
[   25.808918] ++ETH++ gw32 reg=f0026020 val=80920000
[   25.813825] ++ETH++ gr32 reg=f0026024 val=00000000
[   25.818684] ++ETH++ gw32 reg=f0026020 val=00920000
[   25.823551] ++ETH++ gw32 reg=f0026020 val=80920000
[   25.828435] ++ETH++ gr32 reg=f0026024 val=00000000
[   25.833291] ++ETH++ gw32 reg=f0026020 val=00800000
[   25.838156] ++ETH++ gw32 reg=f0026020 val=80800000
[   25.843041] ++ETH++ gr32 reg=f0026024 val=00001800
[   25.847897] ++ETH++ gw32 reg=f0026020 val=04801800
[   25.852763] ++ETH++ gw32 reg=f0026020 val=84801800
[   25.857719] EXIT nb8800_stop
[   30.096432] nb8800 26000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
172.27.64.1 : xmt/rcv/%loss = 1/1/0%, min/avg/max = 0.37/0.37/0.37



BOARD B LOGS:

# test_eth.sh 
[   13.796651] nb8800 26000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
172.27.64.1 : xmt/rcv/%loss = 1/1/0%, min/avg/max = 0.34/0.34/0.34
[   16.379613] nb8800_stop from __dev_close_many
[   16.384016] ++ETH++ gw32 reg=f0026020 val=00920000
[   16.388845] ++ETH++ gw32 reg=f0026020 val=80920000
[   16.393691] ++ETH++ gr32 reg=f0026024 val=0000ec00
[   16.398508] ++ETH++ gw32 reg=f0026020 val=04920000
[   16.403335] ++ETH++ gw32 reg=f0026020 val=84920000
[   16.408183] ++ETH++ gw32 reg=f0026020 val=00930000
[   16.413009] ++ETH++ gw32 reg=f0026020 val=80930000
[   16.417854] ++ETH++ gr32 reg=f0026024 val=00000000
[   16.422673] ++ETH++ POLL reg=f0026100 val=005c0afe
[   16.427514] ++ETH++ gr8  reg=f0026004 val=2b
[   16.431806] ++ETH++ gw8  reg=f0026004 val=0b
[   16.436100] ++ETH++ gr8  reg=f0026044 val=81
[   16.440392] ++ETH++ gw8  reg=f0026044 val=85
[   16.444684] ++ETH++ gw32 reg=f002610c val=9efa8000
[   16.449501] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   16.454318] ++ETH++ gw32 reg=f002610c val=9efa8000
[   16.459134] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   16.463951] ++ETH++ gw32 reg=f002610c val=9efa8000
[   16.468768] ++ETH++ gw32 reg=f0026100 val=005c0aff
[   16.474586] ++ETH++ POLL reg=f0026200 val=06100a8e
[   16.479403] err=0 retry=5
[   16.482034] ++ETH++ gr8  reg=f0026004 val=0b
[   16.486327] ++ETH++ gw8  reg=f0026004 val=2b
[   16.490619] ++ETH++ gr8  reg=f0026044 val=85
[   16.494912] ++ETH++ gw8  reg=f0026044 val=81
[   16.499217] ++ETH++ gw32 reg=f002620c val=9ed22000
[   16.504035] nb8800_dma_stop=0
[   17.607126] ENTER nb8800_irq
[   17.610032] ++ETH++ gr32 reg=f0026104 val=00000005
[   17.614851] ++ETH++ gw32 reg=f0026104 val=00000005
[   17.619669] ++ETH++ gr32 reg=f0026204 val=00000004
[   17.624486] ++ETH++ gw32 reg=f0026204 val=00000004
[   17.629303] ++ETH++ gw32 reg=f0026218 val=003cc4a4
[   17.634120]  EXIT nb8800_irq
[   20.108802] ++ETH++ gr8  reg=f0026004 val=2b
[   20.113115] ++ETH++ gw8  reg=f0026004 val=2a
[   20.117478] ++ETH++ gr32 reg=f0026100 val=00080afe
[   20.122348] ++ETH++ gr8  reg=f0026000 val=1d
[   20.126688] ++ETH++ gw8  reg=f0026000 val=1c
[   21.442246] ++ETH++ gw32 reg=f0026020 val=00920000
[   21.447107] ++ETH++ gw32 reg=f0026020 val=80920000
[   21.452027] ++ETH++ gr32 reg=f0026024 val=00000000
[   21.456907] ++ETH++ gw32 reg=f0026020 val=04920000
[   21.461783] ++ETH++ gw32 reg=f0026020 val=84920000
[   21.466684] ++ETH++ gw32 reg=f0026020 val=00930000
[   21.471559] ++ETH++ gw32 reg=f0026020 val=80930000
[   21.476457] ++ETH++ gr32 reg=f0026024 val=00000000
[   21.481395] nb8800 26000.ethernet eth0: Link is Down
[   21.486461] ++ETH++ gw32 reg=f0026020 val=00920000
[   21.491338] ++ETH++ gw32 reg=f0026020 val=80920000
[   21.496237] ++ETH++ gr32 reg=f0026024 val=00000000
[   21.501102] ++ETH++ gw32 reg=f0026020 val=00920000
[   21.505983] ++ETH++ gw32 reg=f0026020 val=80920000
[   21.510877] ++ETH++ gr32 reg=f0026024 val=00000000
[   21.515748] ++ETH++ gw32 reg=f0026020 val=00800000
[   21.520621] ++ETH++ gw32 reg=f0026020 val=80800000
[   21.525489] ++ETH++ gr32 reg=f0026024 val=00001000
[   21.530319] ++ETH++ gw32 reg=f0026020 val=04801800
[   21.535157] ++ETH++ gw32 reg=f0026020 val=84801800
[   21.540025] ++ETH++ gw32 reg=f0026020 val=00920000
[   21.544866] ++ETH++ gw32 reg=f0026020 val=80920000
[   21.549730] ++ETH++ gr32 reg=f0026024 val=00000000
[   21.554560] ++ETH++ gw32 reg=f0026020 val=00920000
[   21.559400] ++ETH++ gw32 reg=f0026020 val=80920000
[   21.564257] ++ETH++ gr32 reg=f0026024 val=00000000
[   21.569083] ++ETH++ gw32 reg=f0026020 val=00800000
[   21.573921] ++ETH++ gw32 reg=f0026020 val=80800000
[   21.578777] ++ETH++ gr32 reg=f0026024 val=00001800
[   21.583603] ++ETH++ gw32 reg=f0026020 val=04801800
[   21.588441] ++ETH++ gw32 reg=f0026020 val=84801800
[   21.593349] EXIT nb8800_stop
[   25.811889] nb8800 26000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
172.27.64.1 : xmt/rcv/%loss = 1/0/100%

Måns Rullgård July 31, 2017, 3:18 p.m. UTC | #17

Mason <slash.tmp@free.fr> writes:

> On 31/07/2017 13:59, Måns Rullgård wrote:
>
>> Mason writes:
>> 
>>> On 29/07/2017 17:18, Florian Fainelli wrote:
>>>
>>>> On 07/29/2017 05:02 AM, Mason wrote:
>>>>
>>>>> I have identified a 100% reproducible flaw.
>>>>> I have proposed a work-around that brings this down to 0
>>>>> (tested 1000 cycles of link up / ping / link down).
>>>>
>>>> Can you also try to get help from your HW resources to eventually help
>>>> you find out what is going on here?
>>>
>>> The patch I proposed /is/ based on the feedback from the HW team :-(
>>> "Just reset the HW block, and everything will work as expected."
>> 
>> Nobody is saying a reset won't recover the lockup.  The problem is that
>> we don't know what caused it to lock up in the first place.  How do we
>> know it can't happen during normal operation?  If we knew the cause, it
>> might also be possible to avoid the situation entirely.
>
> How does one prove that something "can't happen during normal operation"?

One figures out what conditions cause the something and ensures they
never arise.

> The "put adapter in loop-back mode so we can send ourselves fake packets"
> shenanigans seems completely insane, if you ask me.

Blame the hardware designers.  The *only* way to stop the rx dma is to
have it receive a packet into a descriptor with the end of chain flag
set.  Thankfully the loopback mode means this can be made to happen at
will rather than waiting for actual network traffic.

> Other things make no sense to me, for example in nb8800_dma_stop()
> there is a polling loop:
>
> 	do {
> 		mdelay(100);
> 		nb8800_writel(priv, NB8800_TX_DESC_ADDR, txb->dma_desc);
> 		wmb();
> 		mdelay(100);
> 		nb8800_writel(priv, NB8800_TXC_CR, txcr | TCR_EN);
>
> 		mdelay(5500);
>
> 		err = readl_poll_timeout_atomic(priv->base + NB8800_RXC_CR,
> 						rxcr, !(rxcr & RCR_EN),
> 						1000, 100000);
> 		printk("err=%d retry=%d\n", err, retry);
> 	} while (err && --retry);
>
> (It was me who added the delays.)
>
> *Whatever* delays I insert, it always goes 3 times through the loop.
>
> [   29.654492] ++ETH++ gw32 reg=f002610c val=9ecc8000
> [   29.759320] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   35.364705] err=-110 retry=5
> [   35.467609] ++ETH++ gw32 reg=f002610c val=9ecc8000
> [   35.572436] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   41.177822] err=-110 retry=4
> [   41.280726] ++ETH++ gw32 reg=f002610c val=9ecc8000
> [   41.385553] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   46.890907] err=0 retry=3
>
> How is that possible?

One possibility is that the hardware loads three descriptors in advance
and doesn't see the newly set end of chain flag until its internal queue
has been used up.

> I've tried using spinlocks and delays to get parallel execution
> down to a minimum, and have the same logs on both boards.
>
> Regards.

Måns Rullgård July 31, 2017, 3:28 p.m. UTC | #18

Mason <slash.tmp@free.fr> writes:

> On 31/07/2017 16:08, Mason wrote:
>
>> Other things make no sense to me, for example in nb8800_dma_stop()
>> there is a polling loop:
>> 
>> 	do {
>> 		mdelay(100);
>> 		nb8800_writel(priv, NB8800_TX_DESC_ADDR, txb->dma_desc);
>> 		wmb();
>> 		mdelay(100);
>> 		nb8800_writel(priv, NB8800_TXC_CR, txcr | TCR_EN);
>> 
>> 		mdelay(5500);
>> 
>> 		err = readl_poll_timeout_atomic(priv->base + NB8800_RXC_CR,
>> 						rxcr, !(rxcr & RCR_EN),
>> 						1000, 100000);
>> 		printk("err=%d retry=%d\n", err, retry);
>> 	} while (err && --retry);
>> 
>> 
>> (It was me who added the delays.)
>> 
>> *Whatever* delays I insert, it always goes 3 times through the loop.
>> 
>> [   29.654492] ++ETH++ gw32 reg=f002610c val=9ecc8000
>> [   29.759320] ++ETH++ gw32 reg=f0026100 val=005c0aff
>> [   35.364705] err=-110 retry=5
>> [   35.467609] ++ETH++ gw32 reg=f002610c val=9ecc8000
>> [   35.572436] ++ETH++ gw32 reg=f0026100 val=005c0aff
>> [   41.177822] err=-110 retry=4
>> [   41.280726] ++ETH++ gw32 reg=f002610c val=9ecc8000
>> [   41.385553] ++ETH++ gw32 reg=f0026100 val=005c0aff
>> [   46.890907] err=0 retry=3
>> 
>> How is that possible?
>
> First time through the loop, it doesn't matter how long we poll,
> it *always* times out. Second time as well (only on BOARD B).
>
> Third time, it succeeds quickly (first or second poll).
> (This explains why various delays had no impact.)
>
> In fact, requesting the transfer 3 times *before* polling
> makes the polling succeed quickly:
>
> 	nb8800_writel(priv, NB8800_TX_DESC_ADDR, txb->dma_desc);
> 	wmb();
> 	nb8800_writel(priv, NB8800_TXC_CR, txcr | TCR_EN);
>
> [   16.464596] ++ETH++ gw32 reg=f002610c val=9ef28000
> [   16.469414] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   16.474231] ++ETH++ gw32 reg=f002610c val=9ef28000
> [   16.479048] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   16.483865] ++ETH++ gw32 reg=f002610c val=9ef28000
> [   16.488682] ++ETH++ gw32 reg=f0026100 val=005c0aff
> [   16.493500] ++ETH++ POLL reg=f0026200 val=06100a8f
> [   16.499317] ++ETH++ POLL reg=f0026200 val=06100a8e
> [   16.504134] err=0 retry=5

That strengthens my theory that the hardware has an internal queue of
three descriptors that are pre-loaded from memory.  Your hardware people
should be able to confirm this.

> With my changes, I get *exactly* the same logs on BOARD A
> and BOARD B (modulo the descriptors addresses).
>
> Yet BOARD A stays functional, but BOARD B is hosed...

What's the difference between board A and board B?

> Depressing. I've run out of ideas.

Get your hardware people involved.  Perhaps they can run some test in a
simulator.

[RFC,v1] net: ethernet: nb8800: Reset HW block in ndo_open

Commit Message

Comments

Patch