diff mbox series

[net] r8169: correct the reset timing of RTL8125 for link-change event

Message ID 20240906083539.154019-1-en-wei.wu@canonical.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net] r8169: correct the reset timing of RTL8125 for link-change event | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 16 this patch: 16
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/build_clang success Errors and warnings before: 16 this patch: 16
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 16 this patch: 16
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 31 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-09-06--21-00 (tests: 721)

Commit Message

En-Wei Wu Sept. 6, 2024, 8:35 a.m. UTC
The commit 621735f59064 ("r8169: fix rare issue with broken rx after
link-down on RTL8125") set a reset work for RTL8125 in
r8169_phylink_handler() to avoid the MAC from locking up, this
makes the connection broken after unplugging then re-plugging the
Ethernet cable.

This is because the commit mistakenly put the reset work in the
link-down path rather than the link-up path (The commit message says
it should be put in the link-up path).

Moving the reset work from the link-down path to the link-up path fixes
the issue. Also, remove the unnecessary enum member.

Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
---
 drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

Comments

Heiner Kallweit Sept. 6, 2024, 9:16 p.m. UTC | #1
On 06.09.2024 10:35, En-Wei Wu wrote:
> The commit 621735f59064 ("r8169: fix rare issue with broken rx after
> link-down on RTL8125") set a reset work for RTL8125 in
> r8169_phylink_handler() to avoid the MAC from locking up, this
> makes the connection broken after unplugging then re-plugging the
> Ethernet cable.
> 
> This is because the commit mistakenly put the reset work in the
> link-down path rather than the link-up path (The commit message says
> it should be put in the link-up path).
> 
That's not what the commit message is saying. It says vendor driver
r8125 does it in the link-up path.
I moved it intentionally to the link-down path, because traffic may
be flowing already after link-up.

> Moving the reset work from the link-down path to the link-up path fixes
> the issue. Also, remove the unnecessary enum member.
> 
The user who reported the issue at that time confirmed that the original
change fixed the issue for him.
Can you explain, from the NICs perspective, what exactly the difference
is when doing the reset after link-up?
Including an explanation how the original change suppresses the link-up
interrupt. And why that's not the case when doing the reset after link-up.

I simply want to be convinced enough that your change doesn't break
behavior for other users.

> Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
> Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
> ---
>  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
>  1 file changed, 5 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
> index 3507c2e28110..632e661fc74b 100644
> --- a/drivers/net/ethernet/realtek/r8169_main.c
> +++ b/drivers/net/ethernet/realtek/r8169_main.c
> @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
>  enum rtl_flag {
>  	RTL_FLAG_TASK_ENABLED = 0,
>  	RTL_FLAG_TASK_RESET_PENDING,
> -	RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
>  	RTL_FLAG_TASK_TX_TIMEOUT,
>  	RTL_FLAG_MAX
>  };
> @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
>  reset:
>  		rtl_reset_work(tp);
>  		netif_wake_queue(tp->dev);
> -	} else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
> -		rtl_reset_work(tp);
>  	}
>  out_unlock:
>  	rtnl_unlock();
> @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
>  	if (netif_carrier_ok(ndev)) {
>  		rtl_link_chg_patch(tp);
>  		pm_request_resume(d);
> -		netif_wake_queue(tp->dev);
> -	} else {
> +
>  		/* In few cases rx is broken after link-down otherwise */
>  		if (rtl_is_8125(tp))
> -			rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
> +			rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
> +		else
> +			netif_wake_queue(tp->dev);

This call to netif_wake_queue() isn't needed any longer, it was introduced with
the original change only.

> +	} else {
>  		pm_runtime_idle(d);
>  	}
>
En-Wei Wu Sept. 9, 2024, 5:25 a.m. UTC | #2
Hi Heiner,

Thank you for the quick response.

On Sat, 7 Sept 2024 at 05:17, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>
> On 06.09.2024 10:35, En-Wei Wu wrote:
> > The commit 621735f59064 ("r8169: fix rare issue with broken rx after
> > link-down on RTL8125") set a reset work for RTL8125 in
> > r8169_phylink_handler() to avoid the MAC from locking up, this
> > makes the connection broken after unplugging then re-plugging the
> > Ethernet cable.
> >
> > This is because the commit mistakenly put the reset work in the
> > link-down path rather than the link-up path (The commit message says
> > it should be put in the link-up path).
> >
> That's not what the commit message is saying. It says vendor driver
> r8125 does it in the link-up path.
> I moved it intentionally to the link-down path, because traffic may
> be flowing already after link-up.
>
> > Moving the reset work from the link-down path to the link-up path fixes
> > the issue. Also, remove the unnecessary enum member.
> >
> The user who reported the issue at that time confirmed that the original
> change fixed the issue for him.
> Can you explain, from the NICs perspective, what exactly the difference
> is when doing the reset after link-up?
> Including an explanation how the original change suppresses the link-up
> interrupt. And why that's not the case when doing the reset after link-up.

The host-plug test under original change does have the link-up
interrupt and r8169_phylink_handler() called. There is not much clue
why calling reset in link-down path doesn't work but in link-up does.

After several new tests, I found that with the original change, the
link won't break if I unplug and then plug the cable within about 3
seconds. On the other hand, the connections always break if I re-plug
the cable after a few seconds.

With this new patch (reset in link-up path), both of the tests work
without any error.

>
> I simply want to be convinced enough that your change doesn't break
> behavior for other users.
>
> > Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
> > Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
> > ---
> >  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
> >  1 file changed, 5 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
> > index 3507c2e28110..632e661fc74b 100644
> > --- a/drivers/net/ethernet/realtek/r8169_main.c
> > +++ b/drivers/net/ethernet/realtek/r8169_main.c
> > @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
> >  enum rtl_flag {
> >       RTL_FLAG_TASK_ENABLED = 0,
> >       RTL_FLAG_TASK_RESET_PENDING,
> > -     RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
> >       RTL_FLAG_TASK_TX_TIMEOUT,
> >       RTL_FLAG_MAX
> >  };
> > @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
> >  reset:
> >               rtl_reset_work(tp);
> >               netif_wake_queue(tp->dev);
> > -     } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
> > -             rtl_reset_work(tp);
> >       }
> >  out_unlock:
> >       rtnl_unlock();
> > @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
> >       if (netif_carrier_ok(ndev)) {
> >               rtl_link_chg_patch(tp);
> >               pm_request_resume(d);
> > -             netif_wake_queue(tp->dev);
> > -     } else {
> > +
> >               /* In few cases rx is broken after link-down otherwise */
> >               if (rtl_is_8125(tp))
> > -                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
> > +                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
> > +             else
> > +                     netif_wake_queue(tp->dev);
>
> This call to netif_wake_queue() isn't needed any longer, it was introduced with
> the original change only.
>
> > +     } else {
> >               pm_runtime_idle(d);
> >       }
> >
>

CC. Martin Kjær Jørgensen  <me@lagy.org>, could you kindly test if
this new patch works on your environment? Thanks!

En-Wei,
Best regards.
Heiner Kallweit Sept. 10, 2024, 5:06 p.m. UTC | #3
On 09.09.2024 07:25, En-Wei WU wrote:
> Hi Heiner,
> 
> Thank you for the quick response.
> 
> On Sat, 7 Sept 2024 at 05:17, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>
>> On 06.09.2024 10:35, En-Wei Wu wrote:
>>> The commit 621735f59064 ("r8169: fix rare issue with broken rx after
>>> link-down on RTL8125") set a reset work for RTL8125 in
>>> r8169_phylink_handler() to avoid the MAC from locking up, this
>>> makes the connection broken after unplugging then re-plugging the
>>> Ethernet cable.
>>>
>>> This is because the commit mistakenly put the reset work in the
>>> link-down path rather than the link-up path (The commit message says
>>> it should be put in the link-up path).
>>>
>> That's not what the commit message is saying. It says vendor driver
>> r8125 does it in the link-up path.
>> I moved it intentionally to the link-down path, because traffic may
>> be flowing already after link-up.
>>
>>> Moving the reset work from the link-down path to the link-up path fixes
>>> the issue. Also, remove the unnecessary enum member.
>>>
>> The user who reported the issue at that time confirmed that the original
>> change fixed the issue for him.
>> Can you explain, from the NICs perspective, what exactly the difference
>> is when doing the reset after link-up?
>> Including an explanation how the original change suppresses the link-up
>> interrupt. And why that's not the case when doing the reset after link-up.
> 
> The host-plug test under original change does have the link-up
> interrupt and r8169_phylink_handler() called. There is not much clue
> why calling reset in link-down path doesn't work but in link-up does.
> 
> After several new tests, I found that with the original change, the
> link won't break if I unplug and then plug the cable within about 3
> seconds. On the other hand, the connections always break if I re-plug
> the cable after a few seconds.
> 
Interesting finding. 3 seconds sounds like it's unrelated to runtime pm,
because this has a 10s delay before the chip is transitioned to D3hot.
It makes more the impression that after 3s of link-down the chip (PHY?)
transitions to a mode where it doesn't wake up after re-plugging the cable.

Just a wild guess: It may be some feature like ALDPS (advanced link-down
power saving). Depending on the link partner this may result in not waking
up again, namely if the link partner uses ALDPS too.
What is the link partner in your case? If you put a simple switch in between,
does this help?

In the RTL8211F datasheet I found the following:

Link Down Power Saving Mode.
1: Reflects local device entered Link Down Power Saving Mode,
i.e., cable not plugged in (reflected after 3 sec)
0: With cable plugged in

This is a 1Gbps PHY, but Realtek may use the same ALDPS mechanism with the
integrated PHY of RTL8125. The 3s delay described there perfectly matches
your finding.

> With this new patch (reset in link-up path), both of the tests work
> without any error.
> 
>>
>> I simply want to be convinced enough that your change doesn't break
>> behavior for other users.
>>
>>> Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
>>> Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
>>> ---
>>>  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
>>>  1 file changed, 5 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
>>> index 3507c2e28110..632e661fc74b 100644
>>> --- a/drivers/net/ethernet/realtek/r8169_main.c
>>> +++ b/drivers/net/ethernet/realtek/r8169_main.c
>>> @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
>>>  enum rtl_flag {
>>>       RTL_FLAG_TASK_ENABLED = 0,
>>>       RTL_FLAG_TASK_RESET_PENDING,
>>> -     RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
>>>       RTL_FLAG_TASK_TX_TIMEOUT,
>>>       RTL_FLAG_MAX
>>>  };
>>> @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
>>>  reset:
>>>               rtl_reset_work(tp);
>>>               netif_wake_queue(tp->dev);
>>> -     } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
>>> -             rtl_reset_work(tp);
>>>       }
>>>  out_unlock:
>>>       rtnl_unlock();
>>> @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
>>>       if (netif_carrier_ok(ndev)) {
>>>               rtl_link_chg_patch(tp);
>>>               pm_request_resume(d);
>>> -             netif_wake_queue(tp->dev);
>>> -     } else {
>>> +
>>>               /* In few cases rx is broken after link-down otherwise */
>>>               if (rtl_is_8125(tp))
>>> -                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
>>> +                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
>>> +             else
>>> +                     netif_wake_queue(tp->dev);
>>
>> This call to netif_wake_queue() isn't needed any longer, it was introduced with
>> the original change only.
>>
>>> +     } else {
>>>               pm_runtime_idle(d);
>>>       }
>>>
>>
> 
> CC. Martin Kjær Jørgensen  <me@lagy.org>, could you kindly test if
> this new patch works on your environment? Thanks!
> 
> En-Wei,
> Best regards.
En-Wei Wu Sept. 11, 2024, 7:01 a.m. UTC | #4
> What is the link partner in your case?
My link partner is FS S3900-48T4S switch.

>  If you put a simple switch in between, does this help?
I just put a simple D-link switch in between with the original kernel,
the issue remains (re-plugging it after 3 seconds).

> It makes more the impression that after 3s of link-down the chip (PHY?)
> transitions to a mode where it doesn't wake up after re-plugging the cable.
I've done a ftrace on the r8169.ko and the phy driver (realtek.ko),
and I found that the phy did wake up:

   kworker/u40:4-267   [003]   297.026314: funcgraph_entry:
       |      phy_link_change() {
3533    kworker/u40:4-267   [003]   297.026315: funcgraph_entry:
 6.704 us   |        netif_carrier_on();
3534    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
            |        r8169_phylink_handler() {
3535    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
 0.257 us   |          rtl_link_chg_patch();
3536    kworker/u40:4-267   [003]   297.026324: funcgraph_entry:
 4.026 us   |          netif_tx_wake_queue();
3537    kworker/u40:4-267   [003]   297.026328: funcgraph_entry:
            |          phy_print_status() {
3538    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
 0.245 us   |            phy_duplex_to_str();
3539    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
 0.240 us   |            phy_speed_to_str();
3540    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
+ 12.798 us  |            netdev_info();
3541    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
+ 14.385 us  |          }
3542    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
+ 21.217 us  |        }
3543    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
+ 28.785 us  |      }

So I doubt that the issue isn't necessarily related to the ALDPS,
because the PHY seems to have woken up.

After looking at the reset function (plus the TX queue issue
previously reported by the user) , I'm wondering if the problem is
related to DMA:
static void rtl_reset_work(struct rtl8169_private *tp) {
    ....
    for (i = 0; i < NUM_RX_DESC; i++)
         rtl8169_mark_to_asic(tp->RxDescArray + i);
    ....
}

On Wed, 11 Sept 2024 at 01:06, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>
> On 09.09.2024 07:25, En-Wei WU wrote:
> > Hi Heiner,
> >
> > Thank you for the quick response.
> >
> > On Sat, 7 Sept 2024 at 05:17, Heiner Kallweit <hkallweit1@gmail.com> wrote:
> >>
> >> On 06.09.2024 10:35, En-Wei Wu wrote:
> >>> The commit 621735f59064 ("r8169: fix rare issue with broken rx after
> >>> link-down on RTL8125") set a reset work for RTL8125 in
> >>> r8169_phylink_handler() to avoid the MAC from locking up, this
> >>> makes the connection broken after unplugging then re-plugging the
> >>> Ethernet cable.
> >>>
> >>> This is because the commit mistakenly put the reset work in the
> >>> link-down path rather than the link-up path (The commit message says
> >>> it should be put in the link-up path).
> >>>
> >> That's not what the commit message is saying. It says vendor driver
> >> r8125 does it in the link-up path.
> >> I moved it intentionally to the link-down path, because traffic may
> >> be flowing already after link-up.
> >>
> >>> Moving the reset work from the link-down path to the link-up path fixes
> >>> the issue. Also, remove the unnecessary enum member.
> >>>
> >> The user who reported the issue at that time confirmed that the original
> >> change fixed the issue for him.
> >> Can you explain, from the NICs perspective, what exactly the difference
> >> is when doing the reset after link-up?
> >> Including an explanation how the original change suppresses the link-up
> >> interrupt. And why that's not the case when doing the reset after link-up.
> >
> > The host-plug test under original change does have the link-up
> > interrupt and r8169_phylink_handler() called. There is not much clue
> > why calling reset in link-down path doesn't work but in link-up does.
> >
> > After several new tests, I found that with the original change, the
> > link won't break if I unplug and then plug the cable within about 3
> > seconds. On the other hand, the connections always break if I re-plug
> > the cable after a few seconds.
> >
> Interesting finding. 3 seconds sounds like it's unrelated to runtime pm,
> because this has a 10s delay before the chip is transitioned to D3hot.
> It makes more the impression that after 3s of link-down the chip (PHY?)
> transitions to a mode where it doesn't wake up after re-plugging the cable.
>
> Just a wild guess: It may be some feature like ALDPS (advanced link-down
> power saving). Depending on the link partner this may result in not waking
> up again, namely if the link partner uses ALDPS too.
> What is the link partner in your case? If you put a simple switch in between,
> does this help?
>
> In the RTL8211F datasheet I found the following:
>
> Link Down Power Saving Mode.
> 1: Reflects local device entered Link Down Power Saving Mode,
> i.e., cable not plugged in (reflected after 3 sec)
> 0: With cable plugged in
>
> This is a 1Gbps PHY, but Realtek may use the same ALDPS mechanism with the
> integrated PHY of RTL8125. The 3s delay described there perfectly matches
> your finding.
>
> > With this new patch (reset in link-up path), both of the tests work
> > without any error.
> >
> >>
> >> I simply want to be convinced enough that your change doesn't break
> >> behavior for other users.
> >>
> >>> Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
> >>> Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
> >>> ---
> >>>  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
> >>>  1 file changed, 5 insertions(+), 6 deletions(-)
> >>>
> >>> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
> >>> index 3507c2e28110..632e661fc74b 100644
> >>> --- a/drivers/net/ethernet/realtek/r8169_main.c
> >>> +++ b/drivers/net/ethernet/realtek/r8169_main.c
> >>> @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
> >>>  enum rtl_flag {
> >>>       RTL_FLAG_TASK_ENABLED = 0,
> >>>       RTL_FLAG_TASK_RESET_PENDING,
> >>> -     RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
> >>>       RTL_FLAG_TASK_TX_TIMEOUT,
> >>>       RTL_FLAG_MAX
> >>>  };
> >>> @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
> >>>  reset:
> >>>               rtl_reset_work(tp);
> >>>               netif_wake_queue(tp->dev);
> >>> -     } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
> >>> -             rtl_reset_work(tp);
> >>>       }
> >>>  out_unlock:
> >>>       rtnl_unlock();
> >>> @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
> >>>       if (netif_carrier_ok(ndev)) {
> >>>               rtl_link_chg_patch(tp);
> >>>               pm_request_resume(d);
> >>> -             netif_wake_queue(tp->dev);
> >>> -     } else {
> >>> +
> >>>               /* In few cases rx is broken after link-down otherwise */
> >>>               if (rtl_is_8125(tp))
> >>> -                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
> >>> +                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
> >>> +             else
> >>> +                     netif_wake_queue(tp->dev);
> >>
> >> This call to netif_wake_queue() isn't needed any longer, it was introduced with
> >> the original change only.
> >>
> >>> +     } else {
> >>>               pm_runtime_idle(d);
> >>>       }
> >>>
> >>
> >
> > CC. Martin Kjær Jørgensen  <me@lagy.org>, could you kindly test if
> > this new patch works on your environment? Thanks!
> >
> > En-Wei,
> > Best regards.
>
Heiner Kallweit Sept. 11, 2024, 9:14 a.m. UTC | #5
On 11.09.2024 09:01, En-Wei WU wrote:
>> What is the link partner in your case?
> My link partner is FS S3900-48T4S switch.
> 
>>  If you put a simple switch in between, does this help?
> I just put a simple D-link switch in between with the original kernel,
> the issue remains (re-plugging it after 3 seconds).
> 
>> It makes more the impression that after 3s of link-down the chip (PHY?)
>> transitions to a mode where it doesn't wake up after re-plugging the cable.
> I've done a ftrace on the r8169.ko and the phy driver (realtek.ko),
> and I found that the phy did wake up:
> 
>    kworker/u40:4-267   [003]   297.026314: funcgraph_entry:
>        |      phy_link_change() {
> 3533    kworker/u40:4-267   [003]   297.026315: funcgraph_entry:
>  6.704 us   |        netif_carrier_on();
> 3534    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
>             |        r8169_phylink_handler() {
> 3535    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
>  0.257 us   |          rtl_link_chg_patch();
> 3536    kworker/u40:4-267   [003]   297.026324: funcgraph_entry:
>  4.026 us   |          netif_tx_wake_queue();
> 3537    kworker/u40:4-267   [003]   297.026328: funcgraph_entry:
>             |          phy_print_status() {
> 3538    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
>  0.245 us   |            phy_duplex_to_str();
> 3539    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
>  0.240 us   |            phy_speed_to_str();
> 3540    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
> + 12.798 us  |            netdev_info();
> 3541    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> + 14.385 us  |          }
> 3542    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> + 21.217 us  |        }
> 3543    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> + 28.785 us  |      }
> 
> So I doubt that the issue isn't necessarily related to the ALDPS,
> because the PHY seems to have woken up.
> 
> After looking at the reset function (plus the TX queue issue
> previously reported by the user) , I'm wondering if the problem is
> related to DMA:
> static void rtl_reset_work(struct rtl8169_private *tp) {
>     ....
>     for (i = 0; i < NUM_RX_DESC; i++)
>          rtl8169_mark_to_asic(tp->RxDescArray + i);
>     ....
> }
> 
Thanks for re-testing. I don't think it's something on the MAC side.
For the MAC it should make no difference whether the reset is done
during link-down or after link-up. Therefore I believe it's something
on the PHY side.
Also wrt ALDPS: Do you have the firmware for the NIC loaded?

> On Wed, 11 Sept 2024 at 01:06, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>
>> On 09.09.2024 07:25, En-Wei WU wrote:
>>> Hi Heiner,
>>>
>>> Thank you for the quick response.
>>>
>>> On Sat, 7 Sept 2024 at 05:17, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>
>>>> On 06.09.2024 10:35, En-Wei Wu wrote:
>>>>> The commit 621735f59064 ("r8169: fix rare issue with broken rx after
>>>>> link-down on RTL8125") set a reset work for RTL8125 in
>>>>> r8169_phylink_handler() to avoid the MAC from locking up, this
>>>>> makes the connection broken after unplugging then re-plugging the
>>>>> Ethernet cable.
>>>>>
>>>>> This is because the commit mistakenly put the reset work in the
>>>>> link-down path rather than the link-up path (The commit message says
>>>>> it should be put in the link-up path).
>>>>>
>>>> That's not what the commit message is saying. It says vendor driver
>>>> r8125 does it in the link-up path.
>>>> I moved it intentionally to the link-down path, because traffic may
>>>> be flowing already after link-up.
>>>>
>>>>> Moving the reset work from the link-down path to the link-up path fixes
>>>>> the issue. Also, remove the unnecessary enum member.
>>>>>
>>>> The user who reported the issue at that time confirmed that the original
>>>> change fixed the issue for him.
>>>> Can you explain, from the NICs perspective, what exactly the difference
>>>> is when doing the reset after link-up?
>>>> Including an explanation how the original change suppresses the link-up
>>>> interrupt. And why that's not the case when doing the reset after link-up.
>>>
>>> The host-plug test under original change does have the link-up
>>> interrupt and r8169_phylink_handler() called. There is not much clue
>>> why calling reset in link-down path doesn't work but in link-up does.
>>>
>>> After several new tests, I found that with the original change, the
>>> link won't break if I unplug and then plug the cable within about 3
>>> seconds. On the other hand, the connections always break if I re-plug
>>> the cable after a few seconds.
>>>
>> Interesting finding. 3 seconds sounds like it's unrelated to runtime pm,
>> because this has a 10s delay before the chip is transitioned to D3hot.
>> It makes more the impression that after 3s of link-down the chip (PHY?)
>> transitions to a mode where it doesn't wake up after re-plugging the cable.
>>
>> Just a wild guess: It may be some feature like ALDPS (advanced link-down
>> power saving). Depending on the link partner this may result in not waking
>> up again, namely if the link partner uses ALDPS too.
>> What is the link partner in your case? If you put a simple switch in between,
>> does this help?
>>
>> In the RTL8211F datasheet I found the following:
>>
>> Link Down Power Saving Mode.
>> 1: Reflects local device entered Link Down Power Saving Mode,
>> i.e., cable not plugged in (reflected after 3 sec)
>> 0: With cable plugged in
>>
>> This is a 1Gbps PHY, but Realtek may use the same ALDPS mechanism with the
>> integrated PHY of RTL8125. The 3s delay described there perfectly matches
>> your finding.
>>
>>> With this new patch (reset in link-up path), both of the tests work
>>> without any error.
>>>
>>>>
>>>> I simply want to be convinced enough that your change doesn't break
>>>> behavior for other users.
>>>>
>>>>> Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
>>>>> Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
>>>>> ---
>>>>>  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
>>>>>  1 file changed, 5 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
>>>>> index 3507c2e28110..632e661fc74b 100644
>>>>> --- a/drivers/net/ethernet/realtek/r8169_main.c
>>>>> +++ b/drivers/net/ethernet/realtek/r8169_main.c
>>>>> @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
>>>>>  enum rtl_flag {
>>>>>       RTL_FLAG_TASK_ENABLED = 0,
>>>>>       RTL_FLAG_TASK_RESET_PENDING,
>>>>> -     RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
>>>>>       RTL_FLAG_TASK_TX_TIMEOUT,
>>>>>       RTL_FLAG_MAX
>>>>>  };
>>>>> @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
>>>>>  reset:
>>>>>               rtl_reset_work(tp);
>>>>>               netif_wake_queue(tp->dev);
>>>>> -     } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
>>>>> -             rtl_reset_work(tp);
>>>>>       }
>>>>>  out_unlock:
>>>>>       rtnl_unlock();
>>>>> @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
>>>>>       if (netif_carrier_ok(ndev)) {
>>>>>               rtl_link_chg_patch(tp);
>>>>>               pm_request_resume(d);
>>>>> -             netif_wake_queue(tp->dev);
>>>>> -     } else {
>>>>> +
>>>>>               /* In few cases rx is broken after link-down otherwise */
>>>>>               if (rtl_is_8125(tp))
>>>>> -                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
>>>>> +                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
>>>>> +             else
>>>>> +                     netif_wake_queue(tp->dev);
>>>>
>>>> This call to netif_wake_queue() isn't needed any longer, it was introduced with
>>>> the original change only.
>>>>
>>>>> +     } else {
>>>>>               pm_runtime_idle(d);
>>>>>       }
>>>>>
>>>>
>>>
>>> CC. Martin Kjær Jørgensen  <me@lagy.org>, could you kindly test if
>>> this new patch works on your environment? Thanks!
>>>
>>> En-Wei,
>>> Best regards.
>>
Heiner Kallweit Sept. 11, 2024, 9:16 a.m. UTC | #6
On 11.09.2024 09:01, En-Wei WU wrote:
>> What is the link partner in your case?
> My link partner is FS S3900-48T4S switch.
> 
>>  If you put a simple switch in between, does this help?
> I just put a simple D-link switch in between with the original kernel,
> the issue remains (re-plugging it after 3 seconds).
> 
>> It makes more the impression that after 3s of link-down the chip (PHY?)
>> transitions to a mode where it doesn't wake up after re-plugging the cable.
> I've done a ftrace on the r8169.ko and the phy driver (realtek.ko),
> and I found that the phy did wake up:
> 
>    kworker/u40:4-267   [003]   297.026314: funcgraph_entry:
>        |      phy_link_change() {
> 3533    kworker/u40:4-267   [003]   297.026315: funcgraph_entry:
>  6.704 us   |        netif_carrier_on();
> 3534    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
>             |        r8169_phylink_handler() {
> 3535    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
>  0.257 us   |          rtl_link_chg_patch();
> 3536    kworker/u40:4-267   [003]   297.026324: funcgraph_entry:
>  4.026 us   |          netif_tx_wake_queue();
> 3537    kworker/u40:4-267   [003]   297.026328: funcgraph_entry:
>             |          phy_print_status() {
> 3538    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
>  0.245 us   |            phy_duplex_to_str();
> 3539    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
>  0.240 us   |            phy_speed_to_str();
> 3540    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
> + 12.798 us  |            netdev_info();
> 3541    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> + 14.385 us  |          }
> 3542    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> + 21.217 us  |        }
> 3543    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> + 28.785 us  |      }
> 
> So I doubt that the issue isn't necessarily related to the ALDPS,
> because the PHY seems to have woken up.
> 
> After looking at the reset function (plus the TX queue issue
> previously reported by the user) , I'm wondering if the problem is
> related to DMA:
> static void rtl_reset_work(struct rtl8169_private *tp) {
>     ....
>     for (i = 0; i < NUM_RX_DESC; i++)
>          rtl8169_mark_to_asic(tp->RxDescArray + i);
>     ....
> }
> 
> On Wed, 11 Sept 2024 at 01:06, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>
>> On 09.09.2024 07:25, En-Wei WU wrote:
>>> Hi Heiner,
>>>
>>> Thank you for the quick response.
>>>
>>> On Sat, 7 Sept 2024 at 05:17, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>
>>>> On 06.09.2024 10:35, En-Wei Wu wrote:
>>>>> The commit 621735f59064 ("r8169: fix rare issue with broken rx after
>>>>> link-down on RTL8125") set a reset work for RTL8125 in
>>>>> r8169_phylink_handler() to avoid the MAC from locking up, this
>>>>> makes the connection broken after unplugging then re-plugging the
>>>>> Ethernet cable.
>>>>>
>>>>> This is because the commit mistakenly put the reset work in the
>>>>> link-down path rather than the link-up path (The commit message says
>>>>> it should be put in the link-up path).
>>>>>
>>>> That's not what the commit message is saying. It says vendor driver
>>>> r8125 does it in the link-up path.
>>>> I moved it intentionally to the link-down path, because traffic may
>>>> be flowing already after link-up.
>>>>
>>>>> Moving the reset work from the link-down path to the link-up path fixes
>>>>> the issue. Also, remove the unnecessary enum member.
>>>>>
>>>> The user who reported the issue at that time confirmed that the original
>>>> change fixed the issue for him.
>>>> Can you explain, from the NICs perspective, what exactly the difference
>>>> is when doing the reset after link-up?
>>>> Including an explanation how the original change suppresses the link-up
>>>> interrupt. And why that's not the case when doing the reset after link-up.
>>>
>>> The host-plug test under original change does have the link-up
>>> interrupt and r8169_phylink_handler() called. There is not much clue
>>> why calling reset in link-down path doesn't work but in link-up does.
>>>
>>> After several new tests, I found that with the original change, the
>>> link won't break if I unplug and then plug the cable within about 3
>>> seconds. On the other hand, the connections always break if I re-plug
>>> the cable after a few seconds.
>>>
>> Interesting finding. 3 seconds sounds like it's unrelated to runtime pm,
>> because this has a 10s delay before the chip is transitioned to D3hot.
>> It makes more the impression that after 3s of link-down the chip (PHY?)
>> transitions to a mode where it doesn't wake up after re-plugging the cable.
>>
>> Just a wild guess: It may be some feature like ALDPS (advanced link-down
>> power saving). Depending on the link partner this may result in not waking
>> up again, namely if the link partner uses ALDPS too.
>> What is the link partner in your case? If you put a simple switch in between,
>> does this help?
>>
>> In the RTL8211F datasheet I found the following:
>>
>> Link Down Power Saving Mode.
>> 1: Reflects local device entered Link Down Power Saving Mode,
>> i.e., cable not plugged in (reflected after 3 sec)
>> 0: With cable plugged in
>>
>> This is a 1Gbps PHY, but Realtek may use the same ALDPS mechanism with the
>> integrated PHY of RTL8125. The 3s delay described there perfectly matches
>> your finding.
>>
>>> With this new patch (reset in link-up path), both of the tests work
>>> without any error.
>>>
>>>>
>>>> I simply want to be convinced enough that your change doesn't break
>>>> behavior for other users.
>>>>
>>>>> Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
>>>>> Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
>>>>> ---
>>>>>  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
>>>>>  1 file changed, 5 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
>>>>> index 3507c2e28110..632e661fc74b 100644
>>>>> --- a/drivers/net/ethernet/realtek/r8169_main.c
>>>>> +++ b/drivers/net/ethernet/realtek/r8169_main.c
>>>>> @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
>>>>>  enum rtl_flag {
>>>>>       RTL_FLAG_TASK_ENABLED = 0,
>>>>>       RTL_FLAG_TASK_RESET_PENDING,
>>>>> -     RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
>>>>>       RTL_FLAG_TASK_TX_TIMEOUT,
>>>>>       RTL_FLAG_MAX
>>>>>  };
>>>>> @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
>>>>>  reset:
>>>>>               rtl_reset_work(tp);
>>>>>               netif_wake_queue(tp->dev);
>>>>> -     } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
>>>>> -             rtl_reset_work(tp);
>>>>>       }
>>>>>  out_unlock:
>>>>>       rtnl_unlock();
>>>>> @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
>>>>>       if (netif_carrier_ok(ndev)) {
>>>>>               rtl_link_chg_patch(tp);
>>>>>               pm_request_resume(d);
>>>>> -             netif_wake_queue(tp->dev);
>>>>> -     } else {
>>>>> +
>>>>>               /* In few cases rx is broken after link-down otherwise */
>>>>>               if (rtl_is_8125(tp))
>>>>> -                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
>>>>> +                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
>>>>> +             else
>>>>> +                     netif_wake_queue(tp->dev);
>>>>
>>>> This call to netif_wake_queue() isn't needed any longer, it was introduced with
>>>> the original change only.
>>>>
>>>>> +     } else {
>>>>>               pm_runtime_idle(d);
>>>>>       }
>>>>>
>>>>
>>>
>>> CC. Martin Kjær Jørgensen  <me@lagy.org>, could you kindly test if
>>> this new patch works on your environment? Thanks!
>>>
>>> En-Wei,
>>> Best regards.
>>

Just to be sure. Can you test with the following?


diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c
index 2c8845e08..cf29b1208 100644
--- a/drivers/net/ethernet/realtek/r8169_phy_config.c
+++ b/drivers/net/ethernet/realtek/r8169_phy_config.c
@@ -1060,6 +1060,7 @@ static void rtl8125a_2_hw_phy_config(struct rtl8169_private *tp,
 	phy_modify_paged(phydev, 0xa86, 0x15, 0x0001, 0x0000);
 	rtl8168g_enable_gphy_10m(phydev);
 
+	rtl8168g_disable_aldps(phydev);
 	rtl8125a_config_eee_phy(phydev);
 }
 
@@ -1099,6 +1100,7 @@ static void rtl8125b_hw_phy_config(struct rtl8169_private *tp,
 	phy_modify_paged(phydev, 0xbf8, 0x12, 0xe000, 0xa000);
 
 	rtl8125_legacy_force_mode(phydev);
+	rtl8168g_disable_aldps(phydev);
 	rtl8125b_config_eee_phy(phydev);
 }
En-Wei Wu Sept. 11, 2024, 10:38 a.m. UTC | #7
> Also wrt ALDPS: Do you have the firmware for the NIC loaded?
The firmware is rtl8125b-2_0.0.2 07/13/20

> Just to be sure. Can you test with the following?
Your patch works for our machine. Seems like the root cause is indeed the ALDPS.

On Wed, 11 Sept 2024 at 17:16, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>
> On 11.09.2024 09:01, En-Wei WU wrote:
> >> What is the link partner in your case?
> > My link partner is FS S3900-48T4S switch.
> >
> >>  If you put a simple switch in between, does this help?
> > I just put a simple D-link switch in between with the original kernel,
> > the issue remains (re-plugging it after 3 seconds).
> >
> >> It makes more the impression that after 3s of link-down the chip (PHY?)
> >> transitions to a mode where it doesn't wake up after re-plugging the cable.
> > I've done a ftrace on the r8169.ko and the phy driver (realtek.ko),
> > and I found that the phy did wake up:
> >
> >    kworker/u40:4-267   [003]   297.026314: funcgraph_entry:
> >        |      phy_link_change() {
> > 3533    kworker/u40:4-267   [003]   297.026315: funcgraph_entry:
> >  6.704 us   |        netif_carrier_on();
> > 3534    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
> >             |        r8169_phylink_handler() {
> > 3535    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
> >  0.257 us   |          rtl_link_chg_patch();
> > 3536    kworker/u40:4-267   [003]   297.026324: funcgraph_entry:
> >  4.026 us   |          netif_tx_wake_queue();
> > 3537    kworker/u40:4-267   [003]   297.026328: funcgraph_entry:
> >             |          phy_print_status() {
> > 3538    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
> >  0.245 us   |            phy_duplex_to_str();
> > 3539    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
> >  0.240 us   |            phy_speed_to_str();
> > 3540    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
> > + 12.798 us  |            netdev_info();
> > 3541    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> > + 14.385 us  |          }
> > 3542    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> > + 21.217 us  |        }
> > 3543    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> > + 28.785 us  |      }
> >
> > So I doubt that the issue isn't necessarily related to the ALDPS,
> > because the PHY seems to have woken up.
> >
> > After looking at the reset function (plus the TX queue issue
> > previously reported by the user) , I'm wondering if the problem is
> > related to DMA:
> > static void rtl_reset_work(struct rtl8169_private *tp) {
> >     ....
> >     for (i = 0; i < NUM_RX_DESC; i++)
> >          rtl8169_mark_to_asic(tp->RxDescArray + i);
> >     ....
> > }
> >
> > On Wed, 11 Sept 2024 at 01:06, Heiner Kallweit <hkallweit1@gmail.com> wrote:
> >>
> >> On 09.09.2024 07:25, En-Wei WU wrote:
> >>> Hi Heiner,
> >>>
> >>> Thank you for the quick response.
> >>>
> >>> On Sat, 7 Sept 2024 at 05:17, Heiner Kallweit <hkallweit1@gmail.com> wrote:
> >>>>
> >>>> On 06.09.2024 10:35, En-Wei Wu wrote:
> >>>>> The commit 621735f59064 ("r8169: fix rare issue with broken rx after
> >>>>> link-down on RTL8125") set a reset work for RTL8125 in
> >>>>> r8169_phylink_handler() to avoid the MAC from locking up, this
> >>>>> makes the connection broken after unplugging then re-plugging the
> >>>>> Ethernet cable.
> >>>>>
> >>>>> This is because the commit mistakenly put the reset work in the
> >>>>> link-down path rather than the link-up path (The commit message says
> >>>>> it should be put in the link-up path).
> >>>>>
> >>>> That's not what the commit message is saying. It says vendor driver
> >>>> r8125 does it in the link-up path.
> >>>> I moved it intentionally to the link-down path, because traffic may
> >>>> be flowing already after link-up.
> >>>>
> >>>>> Moving the reset work from the link-down path to the link-up path fixes
> >>>>> the issue. Also, remove the unnecessary enum member.
> >>>>>
> >>>> The user who reported the issue at that time confirmed that the original
> >>>> change fixed the issue for him.
> >>>> Can you explain, from the NICs perspective, what exactly the difference
> >>>> is when doing the reset after link-up?
> >>>> Including an explanation how the original change suppresses the link-up
> >>>> interrupt. And why that's not the case when doing the reset after link-up.
> >>>
> >>> The host-plug test under original change does have the link-up
> >>> interrupt and r8169_phylink_handler() called. There is not much clue
> >>> why calling reset in link-down path doesn't work but in link-up does.
> >>>
> >>> After several new tests, I found that with the original change, the
> >>> link won't break if I unplug and then plug the cable within about 3
> >>> seconds. On the other hand, the connections always break if I re-plug
> >>> the cable after a few seconds.
> >>>
> >> Interesting finding. 3 seconds sounds like it's unrelated to runtime pm,
> >> because this has a 10s delay before the chip is transitioned to D3hot.
> >> It makes more the impression that after 3s of link-down the chip (PHY?)
> >> transitions to a mode where it doesn't wake up after re-plugging the cable.
> >>
> >> Just a wild guess: It may be some feature like ALDPS (advanced link-down
> >> power saving). Depending on the link partner this may result in not waking
> >> up again, namely if the link partner uses ALDPS too.
> >> What is the link partner in your case? If you put a simple switch in between,
> >> does this help?
> >>
> >> In the RTL8211F datasheet I found the following:
> >>
> >> Link Down Power Saving Mode.
> >> 1: Reflects local device entered Link Down Power Saving Mode,
> >> i.e., cable not plugged in (reflected after 3 sec)
> >> 0: With cable plugged in
> >>
> >> This is a 1Gbps PHY, but Realtek may use the same ALDPS mechanism with the
> >> integrated PHY of RTL8125. The 3s delay described there perfectly matches
> >> your finding.
> >>
> >>> With this new patch (reset in link-up path), both of the tests work
> >>> without any error.
> >>>
> >>>>
> >>>> I simply want to be convinced enough that your change doesn't break
> >>>> behavior for other users.
> >>>>
> >>>>> Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
> >>>>> Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
> >>>>> ---
> >>>>>  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
> >>>>>  1 file changed, 5 insertions(+), 6 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
> >>>>> index 3507c2e28110..632e661fc74b 100644
> >>>>> --- a/drivers/net/ethernet/realtek/r8169_main.c
> >>>>> +++ b/drivers/net/ethernet/realtek/r8169_main.c
> >>>>> @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
> >>>>>  enum rtl_flag {
> >>>>>       RTL_FLAG_TASK_ENABLED = 0,
> >>>>>       RTL_FLAG_TASK_RESET_PENDING,
> >>>>> -     RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
> >>>>>       RTL_FLAG_TASK_TX_TIMEOUT,
> >>>>>       RTL_FLAG_MAX
> >>>>>  };
> >>>>> @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
> >>>>>  reset:
> >>>>>               rtl_reset_work(tp);
> >>>>>               netif_wake_queue(tp->dev);
> >>>>> -     } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
> >>>>> -             rtl_reset_work(tp);
> >>>>>       }
> >>>>>  out_unlock:
> >>>>>       rtnl_unlock();
> >>>>> @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
> >>>>>       if (netif_carrier_ok(ndev)) {
> >>>>>               rtl_link_chg_patch(tp);
> >>>>>               pm_request_resume(d);
> >>>>> -             netif_wake_queue(tp->dev);
> >>>>> -     } else {
> >>>>> +
> >>>>>               /* In few cases rx is broken after link-down otherwise */
> >>>>>               if (rtl_is_8125(tp))
> >>>>> -                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
> >>>>> +                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
> >>>>> +             else
> >>>>> +                     netif_wake_queue(tp->dev);
> >>>>
> >>>> This call to netif_wake_queue() isn't needed any longer, it was introduced with
> >>>> the original change only.
> >>>>
> >>>>> +     } else {
> >>>>>               pm_runtime_idle(d);
> >>>>>       }
> >>>>>
> >>>>
> >>>
> >>> CC. Martin Kjær Jørgensen  <me@lagy.org>, could you kindly test if
> >>> this new patch works on your environment? Thanks!
> >>>
> >>> En-Wei,
> >>> Best regards.
> >>
>
> Just to be sure. Can you test with the following?
>
>
> diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c
> index 2c8845e08..cf29b1208 100644
> --- a/drivers/net/ethernet/realtek/r8169_phy_config.c
> +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c
> @@ -1060,6 +1060,7 @@ static void rtl8125a_2_hw_phy_config(struct rtl8169_private *tp,
>         phy_modify_paged(phydev, 0xa86, 0x15, 0x0001, 0x0000);
>         rtl8168g_enable_gphy_10m(phydev);
>
> +       rtl8168g_disable_aldps(phydev);
>         rtl8125a_config_eee_phy(phydev);
>  }
>
> @@ -1099,6 +1100,7 @@ static void rtl8125b_hw_phy_config(struct rtl8169_private *tp,
>         phy_modify_paged(phydev, 0xbf8, 0x12, 0xe000, 0xa000);
>
>         rtl8125_legacy_force_mode(phydev);
> +       rtl8168g_disable_aldps(phydev);
>         rtl8125b_config_eee_phy(phydev);
>  }
>
> --
> 2.46.0
>
>
Heiner Kallweit Sept. 11, 2024, 11:58 a.m. UTC | #8
On 11.09.2024 12:38, En-Wei WU wrote:
>> Also wrt ALDPS: Do you have the firmware for the NIC loaded?
> The firmware is rtl8125b-2_0.0.2 07/13/20
> 
Thanks. Question was because I found an older statement from Realtek
stating that ALDPS requires firmware to work correctly.

>> Just to be sure. Can you test with the following?
> Your patch works for our machine. Seems like the root cause is indeed the ALDPS.
> 
Great! Not sure what's going on, maybe a silicon bug. ALDPS may e.g. stop some
clock and hw misses to re-enable it on link-up. Then I will submit the change
to disable ALDPS. Later we maybe can remove the reset on link-down.

Not having ALDPS shouldn't be too much of an issue. Runtime PM (if enabled)
will put the NIC to D3hot after 10s anyway.

> On Wed, 11 Sept 2024 at 17:16, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>
>> On 11.09.2024 09:01, En-Wei WU wrote:
>>>> What is the link partner in your case?
>>> My link partner is FS S3900-48T4S switch.
>>>
>>>>  If you put a simple switch in between, does this help?
>>> I just put a simple D-link switch in between with the original kernel,
>>> the issue remains (re-plugging it after 3 seconds).
>>>
>>>> It makes more the impression that after 3s of link-down the chip (PHY?)
>>>> transitions to a mode where it doesn't wake up after re-plugging the cable.
>>> I've done a ftrace on the r8169.ko and the phy driver (realtek.ko),
>>> and I found that the phy did wake up:
>>>
>>>    kworker/u40:4-267   [003]   297.026314: funcgraph_entry:
>>>        |      phy_link_change() {
>>> 3533    kworker/u40:4-267   [003]   297.026315: funcgraph_entry:
>>>  6.704 us   |        netif_carrier_on();
>>> 3534    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
>>>             |        r8169_phylink_handler() {
>>> 3535    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
>>>  0.257 us   |          rtl_link_chg_patch();
>>> 3536    kworker/u40:4-267   [003]   297.026324: funcgraph_entry:
>>>  4.026 us   |          netif_tx_wake_queue();
>>> 3537    kworker/u40:4-267   [003]   297.026328: funcgraph_entry:
>>>             |          phy_print_status() {
>>> 3538    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
>>>  0.245 us   |            phy_duplex_to_str();
>>> 3539    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
>>>  0.240 us   |            phy_speed_to_str();
>>> 3540    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
>>> + 12.798 us  |            netdev_info();
>>> 3541    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
>>> + 14.385 us  |          }
>>> 3542    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
>>> + 21.217 us  |        }
>>> 3543    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
>>> + 28.785 us  |      }
>>>
>>> So I doubt that the issue isn't necessarily related to the ALDPS,
>>> because the PHY seems to have woken up.
>>>
>>> After looking at the reset function (plus the TX queue issue
>>> previously reported by the user) , I'm wondering if the problem is
>>> related to DMA:
>>> static void rtl_reset_work(struct rtl8169_private *tp) {
>>>     ....
>>>     for (i = 0; i < NUM_RX_DESC; i++)
>>>          rtl8169_mark_to_asic(tp->RxDescArray + i);
>>>     ....
>>> }
>>>
>>> On Wed, 11 Sept 2024 at 01:06, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>
>>>> On 09.09.2024 07:25, En-Wei WU wrote:
>>>>> Hi Heiner,
>>>>>
>>>>> Thank you for the quick response.
>>>>>
>>>>> On Sat, 7 Sept 2024 at 05:17, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>
>>>>>> On 06.09.2024 10:35, En-Wei Wu wrote:
>>>>>>> The commit 621735f59064 ("r8169: fix rare issue with broken rx after
>>>>>>> link-down on RTL8125") set a reset work for RTL8125 in
>>>>>>> r8169_phylink_handler() to avoid the MAC from locking up, this
>>>>>>> makes the connection broken after unplugging then re-plugging the
>>>>>>> Ethernet cable.
>>>>>>>
>>>>>>> This is because the commit mistakenly put the reset work in the
>>>>>>> link-down path rather than the link-up path (The commit message says
>>>>>>> it should be put in the link-up path).
>>>>>>>
>>>>>> That's not what the commit message is saying. It says vendor driver
>>>>>> r8125 does it in the link-up path.
>>>>>> I moved it intentionally to the link-down path, because traffic may
>>>>>> be flowing already after link-up.
>>>>>>
>>>>>>> Moving the reset work from the link-down path to the link-up path fixes
>>>>>>> the issue. Also, remove the unnecessary enum member.
>>>>>>>
>>>>>> The user who reported the issue at that time confirmed that the original
>>>>>> change fixed the issue for him.
>>>>>> Can you explain, from the NICs perspective, what exactly the difference
>>>>>> is when doing the reset after link-up?
>>>>>> Including an explanation how the original change suppresses the link-up
>>>>>> interrupt. And why that's not the case when doing the reset after link-up.
>>>>>
>>>>> The host-plug test under original change does have the link-up
>>>>> interrupt and r8169_phylink_handler() called. There is not much clue
>>>>> why calling reset in link-down path doesn't work but in link-up does.
>>>>>
>>>>> After several new tests, I found that with the original change, the
>>>>> link won't break if I unplug and then plug the cable within about 3
>>>>> seconds. On the other hand, the connections always break if I re-plug
>>>>> the cable after a few seconds.
>>>>>
>>>> Interesting finding. 3 seconds sounds like it's unrelated to runtime pm,
>>>> because this has a 10s delay before the chip is transitioned to D3hot.
>>>> It makes more the impression that after 3s of link-down the chip (PHY?)
>>>> transitions to a mode where it doesn't wake up after re-plugging the cable.
>>>>
>>>> Just a wild guess: It may be some feature like ALDPS (advanced link-down
>>>> power saving). Depending on the link partner this may result in not waking
>>>> up again, namely if the link partner uses ALDPS too.
>>>> What is the link partner in your case? If you put a simple switch in between,
>>>> does this help?
>>>>
>>>> In the RTL8211F datasheet I found the following:
>>>>
>>>> Link Down Power Saving Mode.
>>>> 1: Reflects local device entered Link Down Power Saving Mode,
>>>> i.e., cable not plugged in (reflected after 3 sec)
>>>> 0: With cable plugged in
>>>>
>>>> This is a 1Gbps PHY, but Realtek may use the same ALDPS mechanism with the
>>>> integrated PHY of RTL8125. The 3s delay described there perfectly matches
>>>> your finding.
>>>>
>>>>> With this new patch (reset in link-up path), both of the tests work
>>>>> without any error.
>>>>>
>>>>>>
>>>>>> I simply want to be convinced enough that your change doesn't break
>>>>>> behavior for other users.
>>>>>>
>>>>>>> Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
>>>>>>> Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
>>>>>>> ---
>>>>>>>  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
>>>>>>>  1 file changed, 5 insertions(+), 6 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
>>>>>>> index 3507c2e28110..632e661fc74b 100644
>>>>>>> --- a/drivers/net/ethernet/realtek/r8169_main.c
>>>>>>> +++ b/drivers/net/ethernet/realtek/r8169_main.c
>>>>>>> @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
>>>>>>>  enum rtl_flag {
>>>>>>>       RTL_FLAG_TASK_ENABLED = 0,
>>>>>>>       RTL_FLAG_TASK_RESET_PENDING,
>>>>>>> -     RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
>>>>>>>       RTL_FLAG_TASK_TX_TIMEOUT,
>>>>>>>       RTL_FLAG_MAX
>>>>>>>  };
>>>>>>> @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
>>>>>>>  reset:
>>>>>>>               rtl_reset_work(tp);
>>>>>>>               netif_wake_queue(tp->dev);
>>>>>>> -     } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
>>>>>>> -             rtl_reset_work(tp);
>>>>>>>       }
>>>>>>>  out_unlock:
>>>>>>>       rtnl_unlock();
>>>>>>> @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
>>>>>>>       if (netif_carrier_ok(ndev)) {
>>>>>>>               rtl_link_chg_patch(tp);
>>>>>>>               pm_request_resume(d);
>>>>>>> -             netif_wake_queue(tp->dev);
>>>>>>> -     } else {
>>>>>>> +
>>>>>>>               /* In few cases rx is broken after link-down otherwise */
>>>>>>>               if (rtl_is_8125(tp))
>>>>>>> -                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
>>>>>>> +                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
>>>>>>> +             else
>>>>>>> +                     netif_wake_queue(tp->dev);
>>>>>>
>>>>>> This call to netif_wake_queue() isn't needed any longer, it was introduced with
>>>>>> the original change only.
>>>>>>
>>>>>>> +     } else {
>>>>>>>               pm_runtime_idle(d);
>>>>>>>       }
>>>>>>>
>>>>>>
>>>>>
>>>>> CC. Martin Kjær Jørgensen  <me@lagy.org>, could you kindly test if
>>>>> this new patch works on your environment? Thanks!
>>>>>
>>>>> En-Wei,
>>>>> Best regards.
>>>>
>>
>> Just to be sure. Can you test with the following?
>>
>>
>> diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c
>> index 2c8845e08..cf29b1208 100644
>> --- a/drivers/net/ethernet/realtek/r8169_phy_config.c
>> +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c
>> @@ -1060,6 +1060,7 @@ static void rtl8125a_2_hw_phy_config(struct rtl8169_private *tp,
>>         phy_modify_paged(phydev, 0xa86, 0x15, 0x0001, 0x0000);
>>         rtl8168g_enable_gphy_10m(phydev);
>>
>> +       rtl8168g_disable_aldps(phydev);
>>         rtl8125a_config_eee_phy(phydev);
>>  }
>>
>> @@ -1099,6 +1100,7 @@ static void rtl8125b_hw_phy_config(struct rtl8169_private *tp,
>>         phy_modify_paged(phydev, 0xbf8, 0x12, 0xe000, 0xa000);
>>
>>         rtl8125_legacy_force_mode(phydev);
>> +       rtl8168g_disable_aldps(phydev);
>>         rtl8125b_config_eee_phy(phydev);
>>  }
>>
>> --
>> 2.46.0
>>
>>
En-Wei Wu Sept. 11, 2024, 2:07 p.m. UTC | #9
Thanks for your great support!


On Wed, 11 Sept 2024 at 19:58, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>
> On 11.09.2024 12:38, En-Wei WU wrote:
> >> Also wrt ALDPS: Do you have the firmware for the NIC loaded?
> > The firmware is rtl8125b-2_0.0.2 07/13/20
> >
> Thanks. Question was because I found an older statement from Realtek
> stating that ALDPS requires firmware to work correctly.
>
> >> Just to be sure. Can you test with the following?
> > Your patch works for our machine. Seems like the root cause is indeed the ALDPS.
> >
> Great! Not sure what's going on, maybe a silicon bug. ALDPS may e.g. stop some
> clock and hw misses to re-enable it on link-up. Then I will submit the change
> to disable ALDPS. Later we maybe can remove the reset on link-down.
>
> Not having ALDPS shouldn't be too much of an issue. Runtime PM (if enabled)
> will put the NIC to D3hot after 10s anyway.
>
> > On Wed, 11 Sept 2024 at 17:16, Heiner Kallweit <hkallweit1@gmail.com> wrote:
> >>
> >> On 11.09.2024 09:01, En-Wei WU wrote:
> >>>> What is the link partner in your case?
> >>> My link partner is FS S3900-48T4S switch.
> >>>
> >>>>  If you put a simple switch in between, does this help?
> >>> I just put a simple D-link switch in between with the original kernel,
> >>> the issue remains (re-plugging it after 3 seconds).
> >>>
> >>>> It makes more the impression that after 3s of link-down the chip (PHY?)
> >>>> transitions to a mode where it doesn't wake up after re-plugging the cable.
> >>> I've done a ftrace on the r8169.ko and the phy driver (realtek.ko),
> >>> and I found that the phy did wake up:
> >>>
> >>>    kworker/u40:4-267   [003]   297.026314: funcgraph_entry:
> >>>        |      phy_link_change() {
> >>> 3533    kworker/u40:4-267   [003]   297.026315: funcgraph_entry:
> >>>  6.704 us   |        netif_carrier_on();
> >>> 3534    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
> >>>             |        r8169_phylink_handler() {
> >>> 3535    kworker/u40:4-267   [003]   297.026322: funcgraph_entry:
> >>>  0.257 us   |          rtl_link_chg_patch();
> >>> 3536    kworker/u40:4-267   [003]   297.026324: funcgraph_entry:
> >>>  4.026 us   |          netif_tx_wake_queue();
> >>> 3537    kworker/u40:4-267   [003]   297.026328: funcgraph_entry:
> >>>             |          phy_print_status() {
> >>> 3538    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
> >>>  0.245 us   |            phy_duplex_to_str();
> >>> 3539    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
> >>>  0.240 us   |            phy_speed_to_str();
> >>> 3540    kworker/u40:4-267   [003]   297.026329: funcgraph_entry:
> >>> + 12.798 us  |            netdev_info();
> >>> 3541    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> >>> + 14.385 us  |          }
> >>> 3542    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> >>> + 21.217 us  |        }
> >>> 3543    kworker/u40:4-267   [003]   297.026343: funcgraph_exit:
> >>> + 28.785 us  |      }
> >>>
> >>> So I doubt that the issue isn't necessarily related to the ALDPS,
> >>> because the PHY seems to have woken up.
> >>>
> >>> After looking at the reset function (plus the TX queue issue
> >>> previously reported by the user) , I'm wondering if the problem is
> >>> related to DMA:
> >>> static void rtl_reset_work(struct rtl8169_private *tp) {
> >>>     ....
> >>>     for (i = 0; i < NUM_RX_DESC; i++)
> >>>          rtl8169_mark_to_asic(tp->RxDescArray + i);
> >>>     ....
> >>> }
> >>>
> >>> On Wed, 11 Sept 2024 at 01:06, Heiner Kallweit <hkallweit1@gmail.com> wrote:
> >>>>
> >>>> On 09.09.2024 07:25, En-Wei WU wrote:
> >>>>> Hi Heiner,
> >>>>>
> >>>>> Thank you for the quick response.
> >>>>>
> >>>>> On Sat, 7 Sept 2024 at 05:17, Heiner Kallweit <hkallweit1@gmail.com> wrote:
> >>>>>>
> >>>>>> On 06.09.2024 10:35, En-Wei Wu wrote:
> >>>>>>> The commit 621735f59064 ("r8169: fix rare issue with broken rx after
> >>>>>>> link-down on RTL8125") set a reset work for RTL8125 in
> >>>>>>> r8169_phylink_handler() to avoid the MAC from locking up, this
> >>>>>>> makes the connection broken after unplugging then re-plugging the
> >>>>>>> Ethernet cable.
> >>>>>>>
> >>>>>>> This is because the commit mistakenly put the reset work in the
> >>>>>>> link-down path rather than the link-up path (The commit message says
> >>>>>>> it should be put in the link-up path).
> >>>>>>>
> >>>>>> That's not what the commit message is saying. It says vendor driver
> >>>>>> r8125 does it in the link-up path.
> >>>>>> I moved it intentionally to the link-down path, because traffic may
> >>>>>> be flowing already after link-up.
> >>>>>>
> >>>>>>> Moving the reset work from the link-down path to the link-up path fixes
> >>>>>>> the issue. Also, remove the unnecessary enum member.
> >>>>>>>
> >>>>>> The user who reported the issue at that time confirmed that the original
> >>>>>> change fixed the issue for him.
> >>>>>> Can you explain, from the NICs perspective, what exactly the difference
> >>>>>> is when doing the reset after link-up?
> >>>>>> Including an explanation how the original change suppresses the link-up
> >>>>>> interrupt. And why that's not the case when doing the reset after link-up.
> >>>>>
> >>>>> The host-plug test under original change does have the link-up
> >>>>> interrupt and r8169_phylink_handler() called. There is not much clue
> >>>>> why calling reset in link-down path doesn't work but in link-up does.
> >>>>>
> >>>>> After several new tests, I found that with the original change, the
> >>>>> link won't break if I unplug and then plug the cable within about 3
> >>>>> seconds. On the other hand, the connections always break if I re-plug
> >>>>> the cable after a few seconds.
> >>>>>
> >>>> Interesting finding. 3 seconds sounds like it's unrelated to runtime pm,
> >>>> because this has a 10s delay before the chip is transitioned to D3hot.
> >>>> It makes more the impression that after 3s of link-down the chip (PHY?)
> >>>> transitions to a mode where it doesn't wake up after re-plugging the cable.
> >>>>
> >>>> Just a wild guess: It may be some feature like ALDPS (advanced link-down
> >>>> power saving). Depending on the link partner this may result in not waking
> >>>> up again, namely if the link partner uses ALDPS too.
> >>>> What is the link partner in your case? If you put a simple switch in between,
> >>>> does this help?
> >>>>
> >>>> In the RTL8211F datasheet I found the following:
> >>>>
> >>>> Link Down Power Saving Mode.
> >>>> 1: Reflects local device entered Link Down Power Saving Mode,
> >>>> i.e., cable not plugged in (reflected after 3 sec)
> >>>> 0: With cable plugged in
> >>>>
> >>>> This is a 1Gbps PHY, but Realtek may use the same ALDPS mechanism with the
> >>>> integrated PHY of RTL8125. The 3s delay described there perfectly matches
> >>>> your finding.
> >>>>
> >>>>> With this new patch (reset in link-up path), both of the tests work
> >>>>> without any error.
> >>>>>
> >>>>>>
> >>>>>> I simply want to be convinced enough that your change doesn't break
> >>>>>> behavior for other users.
> >>>>>>
> >>>>>>> Fixes: 621735f59064 ("r8169: fix rare issue with broken rx after link-down on RTL8125")
> >>>>>>> Signed-off-by: En-Wei Wu <en-wei.wu@canonical.com>
> >>>>>>> ---
> >>>>>>>  drivers/net/ethernet/realtek/r8169_main.c | 11 +++++------
> >>>>>>>  1 file changed, 5 insertions(+), 6 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
> >>>>>>> index 3507c2e28110..632e661fc74b 100644
> >>>>>>> --- a/drivers/net/ethernet/realtek/r8169_main.c
> >>>>>>> +++ b/drivers/net/ethernet/realtek/r8169_main.c
> >>>>>>> @@ -590,7 +590,6 @@ struct rtl8169_tc_offsets {
> >>>>>>>  enum rtl_flag {
> >>>>>>>       RTL_FLAG_TASK_ENABLED = 0,
> >>>>>>>       RTL_FLAG_TASK_RESET_PENDING,
> >>>>>>> -     RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
> >>>>>>>       RTL_FLAG_TASK_TX_TIMEOUT,
> >>>>>>>       RTL_FLAG_MAX
> >>>>>>>  };
> >>>>>>> @@ -4698,8 +4697,6 @@ static void rtl_task(struct work_struct *work)
> >>>>>>>  reset:
> >>>>>>>               rtl_reset_work(tp);
> >>>>>>>               netif_wake_queue(tp->dev);
> >>>>>>> -     } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
> >>>>>>> -             rtl_reset_work(tp);
> >>>>>>>       }
> >>>>>>>  out_unlock:
> >>>>>>>       rtnl_unlock();
> >>>>>>> @@ -4729,11 +4726,13 @@ static void r8169_phylink_handler(struct net_device *ndev)
> >>>>>>>       if (netif_carrier_ok(ndev)) {
> >>>>>>>               rtl_link_chg_patch(tp);
> >>>>>>>               pm_request_resume(d);
> >>>>>>> -             netif_wake_queue(tp->dev);
> >>>>>>> -     } else {
> >>>>>>> +
> >>>>>>>               /* In few cases rx is broken after link-down otherwise */
> >>>>>>>               if (rtl_is_8125(tp))
> >>>>>>> -                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
> >>>>>>> +                     rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
> >>>>>>> +             else
> >>>>>>> +                     netif_wake_queue(tp->dev);
> >>>>>>
> >>>>>> This call to netif_wake_queue() isn't needed any longer, it was introduced with
> >>>>>> the original change only.
> >>>>>>
> >>>>>>> +     } else {
> >>>>>>>               pm_runtime_idle(d);
> >>>>>>>       }
> >>>>>>>
> >>>>>>
> >>>>>
> >>>>> CC. Martin Kjær Jørgensen  <me@lagy.org>, could you kindly test if
> >>>>> this new patch works on your environment? Thanks!
> >>>>>
> >>>>> En-Wei,
> >>>>> Best regards.
> >>>>
> >>
> >> Just to be sure. Can you test with the following?
> >>
> >>
> >> diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c
> >> index 2c8845e08..cf29b1208 100644
> >> --- a/drivers/net/ethernet/realtek/r8169_phy_config.c
> >> +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c
> >> @@ -1060,6 +1060,7 @@ static void rtl8125a_2_hw_phy_config(struct rtl8169_private *tp,
> >>         phy_modify_paged(phydev, 0xa86, 0x15, 0x0001, 0x0000);
> >>         rtl8168g_enable_gphy_10m(phydev);
> >>
> >> +       rtl8168g_disable_aldps(phydev);
> >>         rtl8125a_config_eee_phy(phydev);
> >>  }
> >>
> >> @@ -1099,6 +1100,7 @@ static void rtl8125b_hw_phy_config(struct rtl8169_private *tp,
> >>         phy_modify_paged(phydev, 0xbf8, 0x12, 0xe000, 0xa000);
> >>
> >>         rtl8125_legacy_force_mode(phydev);
> >> +       rtl8168g_disable_aldps(phydev);
> >>         rtl8125b_config_eee_phy(phydev);
> >>  }
> >>
> >> --
> >> 2.46.0
> >>
> >>
>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 3507c2e28110..632e661fc74b 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -590,7 +590,6 @@  struct rtl8169_tc_offsets {
 enum rtl_flag {
 	RTL_FLAG_TASK_ENABLED = 0,
 	RTL_FLAG_TASK_RESET_PENDING,
-	RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE,
 	RTL_FLAG_TASK_TX_TIMEOUT,
 	RTL_FLAG_MAX
 };
@@ -4698,8 +4697,6 @@  static void rtl_task(struct work_struct *work)
 reset:
 		rtl_reset_work(tp);
 		netif_wake_queue(tp->dev);
-	} else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) {
-		rtl_reset_work(tp);
 	}
 out_unlock:
 	rtnl_unlock();
@@ -4729,11 +4726,13 @@  static void r8169_phylink_handler(struct net_device *ndev)
 	if (netif_carrier_ok(ndev)) {
 		rtl_link_chg_patch(tp);
 		pm_request_resume(d);
-		netif_wake_queue(tp->dev);
-	} else {
+
 		/* In few cases rx is broken after link-down otherwise */
 		if (rtl_is_8125(tp))
-			rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE);
+			rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
+		else
+			netif_wake_queue(tp->dev);
+	} else {
 		pm_runtime_idle(d);
 	}