diff mbox series

[net-next,6/6] selftests: drv-net-hw: add test for memory allocation failures with page pool

Message ID 20240426232400.624864-7-kuba@kernel.org (mailing list archive)
State Accepted
Commit 9da271f825e42156058a2eb09360bc993853bbba
Headers show
Series selftests: net: page_poll allocation error injection | expand

Commit Message

Jakub Kicinski April 26, 2024, 11:23 p.m. UTC
Bugs in memory allocation failure paths are quite common.
Add a test exercising those paths based on qstat and page pool
failure hook.

Running on bnxt:

  # ./drivers/net/hw/pp_alloc_fail.py
  KTAP version 1
  1..1
  # ethtool -G change retval: success
  ok 1 pp_alloc_fail.test_pp_alloc
  # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0

I initially wrote this test to validate commit be43b7489a3c ("net/mlx5e:
RX, Fix page_pool allocation failure recovery for striding rq") but mlx5
still doesn't have qstat. So I run it on bnxt, and while bnxt survives
I found the problem fixed in commit 730117730709 ("eth: bnxt: fix counting
packets discarded due to OOM and netpoll").

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../testing/selftests/drivers/net/hw/Makefile |   1 +
 .../selftests/drivers/net/hw/pp_alloc_fail.py | 129 ++++++++++++++++++
 tools/testing/selftests/net/lib/py/ksft.py    |   4 +
 3 files changed, 134 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py

Comments

Willem de Bruijn April 27, 2024, 1:49 p.m. UTC | #1
Jakub Kicinski wrote:
> Bugs in memory allocation failure paths are quite common.
> Add a test exercising those paths based on qstat and page pool
> failure hook.
> 
> Running on bnxt:
> 
>   # ./drivers/net/hw/pp_alloc_fail.py
>   KTAP version 1
>   1..1
>   # ethtool -G change retval: success
>   ok 1 pp_alloc_fail.test_pp_alloc
>   # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0
> 
> I initially wrote this test to validate commit be43b7489a3c ("net/mlx5e:
> RX, Fix page_pool allocation failure recovery for striding rq") but mlx5
> still doesn't have qstat. So I run it on bnxt, and while bnxt survives
> I found the problem fixed in commit 730117730709 ("eth: bnxt: fix counting
> packets discarded due to OOM and netpoll").
> 
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
>  .../testing/selftests/drivers/net/hw/Makefile |   1 +
>  .../selftests/drivers/net/hw/pp_alloc_fail.py | 129 ++++++++++++++++++
>  tools/testing/selftests/net/lib/py/ksft.py    |   4 +
>  3 files changed, 134 insertions(+)
>  create mode 100755 tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
> 
> diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
> index 95f32158b095..1dd732855d76 100644
> --- a/tools/testing/selftests/drivers/net/hw/Makefile
> +++ b/tools/testing/selftests/drivers/net/hw/Makefile
> @@ -9,6 +9,7 @@ TEST_PROGS = \
>  	hw_stats_l3.sh \
>  	hw_stats_l3_gre.sh \
>  	loopback.sh \
> +	pp_alloc_fail.py \
>  	#
>  
>  TEST_FILES := \
> diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
> new file mode 100755
> index 000000000000..026d98976c35
> --- /dev/null
> +++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
> @@ -0,0 +1,129 @@
> +#!/usr/bin/env python3
> +# SPDX-License-Identifier: GPL-2.0
> +
> +import time
> +import os
> +from lib.py import ksft_run, ksft_exit, ksft_pr
> +from lib.py import KsftSkipEx, KsftFailEx
> +from lib.py import NetdevFamily, NlError
> +from lib.py import NetDrvEpEnv
> +from lib.py import cmd, tool, GenerateTraffic
> +
> +
> +def _write_fail_config(config):
> +    for key, value in config.items():
> +        with open("/sys/kernel/debug/fail_function/" + key, "w") as fp:
> +            fp.write(str(value) + "\n")
> +
> +
> +def _enable_pp_allocation_fail():
> +    if not os.path.exists("/sys/kernel/debug/fail_function"):
> +        raise KsftSkipEx("Kernel built without function error injection (or DebugFS)")
> +
> +    if not os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"):
> +        with open("/sys/kernel/debug/fail_function/inject", "w") as fp:
> +            fp.write("page_pool_alloc_pages\n")
> +
> +    _write_fail_config({
> +        "verbose": 0,
> +        "interval": 511,
> +        "probability": 100,
> +        "times": -1,
> +    })
> +
> +
> +def _disable_pp_allocation_fail():
> +    if not os.path.exists("/sys/kernel/debug/fail_function"):
> +        return
> +
> +    if os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"):
> +        with open("/sys/kernel/debug/fail_function/inject", "w") as fp:
> +            fp.write("\n")
> +
> +    _write_fail_config({
> +        "probability": 0,
> +        "times": 0,
> +    })
> +
> +
> +def test_pp_alloc(cfg, netdevnl):
> +    def get_stats():
> +        return netdevnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0]
> +
> +    def check_traffic_flowing():
> +        stat1 = get_stats()
> +        time.sleep(1)
> +        stat2 = get_stats()
> +        if stat2['rx-packets'] - stat1['rx-packets'] < 15000:
> +            raise KsftFailEx("Traffic seems low:", stat2['rx-packets'] - stat1['rx-packets'])
> +
> +
> +    try:
> +        stats = get_stats()
> +    except NlError as e:
> +        if e.nl_msg.error == -95:
> +            stats = {}
> +        else:
> +            raise
> +    if 'rx-alloc-fail' not in stats:
> +        raise KsftSkipEx("Driver does not report 'rx-alloc-fail' via qstats")
> +
> +    set_g = False
> +    traffic = None
> +    try:
> +        traffic = GenerateTraffic(cfg)
> +
> +        check_traffic_flowing()
> +
> +        _enable_pp_allocation_fail()
> +
> +        s1 = get_stats()
> +        time.sleep(3)
> +        s2 = get_stats()
> +
> +        if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 1:
> +            raise KsftSkipEx("Allocation failures not increasing")
> +        if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 100:
> +            raise KsftSkipEx("Allocation increasing too slowly", s2['rx-alloc-fail'] - s1['rx-alloc-fail'],
> +                             "packets:", s2['rx-packets'] - s1['rx-packets'])
> +
> +        # Basic failures are fine, try to wobble some settings to catch extra failures
> +        check_traffic_flowing()
> +        g = tool("ethtool", "-g " + cfg.ifname, json=True)[0]
> +        if 'rx' in g and g["rx"] * 2 <= g["rx-max"]:
> +            new_g = g['rx'] * 2
> +        elif 'rx' in g:
> +            new_g = g['rx'] // 2
> +        else:
> +            new_g = None
> +
> +        if new_g:
> +            set_g = cmd(f"ethtool -G {cfg.ifname} rx {new_g}", fail=False).ret == 0
> +            if set_g:
> +                ksft_pr("ethtool -G change retval: success")
> +            else:
> +                ksft_pr("ethtool -G change retval: did not succeed", new_g)
> +        else:
> +                ksft_pr("ethtool -G change retval: did not try")
> +
> +        time.sleep(0.1)
> +        check_traffic_flowing()
> +    finally:
> +        _disable_pp_allocation_fail()
> +        if traffic:
> +            traffic.stop()

Very cool!

Eventually probably want a more generic fault injection class.

And for both fault injection and background traffic the with object
construct to ensure cleanup in all cases.

Maybe even the same for ethtool, as ip and ethtool config changes that
need to be reverted to original state will be common.

To be clear, not at all suggesting to revise this series for that.

> +        time.sleep(0.1)
> +        if set_g:
> +            cmd(f"ethtool -G {cfg.ifname} rx {g['rx']}")
> +
> +
> +def main() -> None:
> +    netdevnl = NetdevFamily()
> +    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
> +
> +        ksft_run([test_pp_alloc], args=(cfg, netdevnl, ))
> +    ksft_exit()
> +
> +
> +if __name__ == "__main__":
> +    main()
> diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
> index f84e9fdd0032..4769b4eb1ea1 100644
> --- a/tools/testing/selftests/net/lib/py/ksft.py
> +++ b/tools/testing/selftests/net/lib/py/ksft.py
> @@ -11,6 +11,10 @@ KSFT_RESULT = None
>  KSFT_RESULT_ALL = True
>  
>  
> +class KsftFailEx(Exception):
> +    pass
> +
> +
>  class KsftSkipEx(Exception):
>      pass
>  
> -- 
> 2.44.0
>
Jakub Kicinski April 29, 2024, 2:51 p.m. UTC | #2
On Sat, 27 Apr 2024 09:49:28 -0400 Willem de Bruijn wrote:
> Eventually probably want a more generic fault injection class.
> 
> And for both fault injection and background traffic the with object
> construct to ensure cleanup in all cases.
> 
> Maybe even the same for ethtool, as ip and ethtool config changes that
> need to be reverted to original state will be common.

Agreed, the nice way of wrapping all that has not revealed itself to me
yet. When we discussed it with Petr a while back he was suggesting
"with", and I was thinking of creating an object with test as the
parent. The with is nicer but here we'd end up doing:

	with a():
		# some code
		with b():
			# more code
				with c():
					# check traffic

which offends my sensibilities.

There are many options, hard to say which one is best without having 
a bunch of tests to convert as a litmus test :S So I stuck to "finally"
Willem de Bruijn April 29, 2024, 3:02 p.m. UTC | #3
Jakub Kicinski wrote:
> On Sat, 27 Apr 2024 09:49:28 -0400 Willem de Bruijn wrote:
> > Eventually probably want a more generic fault injection class.
> > 
> > And for both fault injection and background traffic the with object
> > construct to ensure cleanup in all cases.
> > 
> > Maybe even the same for ethtool, as ip and ethtool config changes that
> > need to be reverted to original state will be common.
> 
> Agreed, the nice way of wrapping all that has not revealed itself to me
> yet. When we discussed it with Petr a while back he was suggesting
> "with", and I was thinking of creating an object with test as the
> parent. The with is nicer but here we'd end up doing:
> 
> 	with a():
> 		# some code
> 		with b():
> 			# more code
> 				with c():
> 					# check traffic
> 
> which offends my sensibilities.
> 
> There are many options, hard to say which one is best without having 
> a bunch of tests to convert as a litmus test :S So I stuck to "finally"

Entirely reasonable.

Btw, I have a preliminary tools/testing/selftests/net/csum test on
top of this series.

The only interesting points so far are the use of deploy (which I
assume you have on some internal patch already) and that with bkg
would not fail the test if the background process exits with error.
Andrew Lunn April 29, 2024, 3:12 p.m. UTC | #4
On Fri, Apr 26, 2024 at 04:23:59PM -0700, Jakub Kicinski wrote:
> Bugs in memory allocation failure paths are quite common.
> Add a test exercising those paths based on qstat and page pool
> failure hook.
> 
> Running on bnxt:
> 
>   # ./drivers/net/hw/pp_alloc_fail.py
>   KTAP version 1
>   1..1
>   # ethtool -G change retval: success
>   ok 1 pp_alloc_fail.test_pp_alloc
>   # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0

If i'm reading the traffic generator correctly, this test runs for 24
hours. Do we want some sort of warning here about the test duration?
Some sort of alive indication very minute?

     Andrew
Jakub Kicinski April 29, 2024, 4:16 p.m. UTC | #5
On Mon, 29 Apr 2024 17:12:29 +0200 Andrew Lunn wrote:
> >   # ./drivers/net/hw/pp_alloc_fail.py
> >   KTAP version 1
> >   1..1
> >   # ethtool -G change retval: success
> >   ok 1 pp_alloc_fail.test_pp_alloc
> >   # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0  
> 
> If i'm reading the traffic generator correctly, this test runs for 24
> hours. Do we want some sort of warning here about the test duration?
> Some sort of alive indication very minute?

That's just the max value for the time param.
The generator is stopped / killed after we go thru the test steps.
Jakub Kicinski April 29, 2024, 4:27 p.m. UTC | #6
On Mon, 29 Apr 2024 11:02:51 -0400 Willem de Bruijn wrote:
> The only interesting points so far are the use of deploy (which I
> assume you have on some internal patch already)

Yup, they need a touch more cleaning up but the PSP tests use it.

> and that with bkg would not fail the test if the background process
> exits with error.

Ah, that's a bug, yes. We should record the value of fail from 
the constructor and use it in __exit__().
diff mbox series

Patch

diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index 95f32158b095..1dd732855d76 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -9,6 +9,7 @@  TEST_PROGS = \
 	hw_stats_l3.sh \
 	hw_stats_l3_gre.sh \
 	loopback.sh \
+	pp_alloc_fail.py \
 	#
 
 TEST_FILES := \
diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
new file mode 100755
index 000000000000..026d98976c35
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
@@ -0,0 +1,129 @@ 
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import time
+import os
+from lib.py import ksft_run, ksft_exit, ksft_pr
+from lib.py import KsftSkipEx, KsftFailEx
+from lib.py import NetdevFamily, NlError
+from lib.py import NetDrvEpEnv
+from lib.py import cmd, tool, GenerateTraffic
+
+
+def _write_fail_config(config):
+    for key, value in config.items():
+        with open("/sys/kernel/debug/fail_function/" + key, "w") as fp:
+            fp.write(str(value) + "\n")
+
+
+def _enable_pp_allocation_fail():
+    if not os.path.exists("/sys/kernel/debug/fail_function"):
+        raise KsftSkipEx("Kernel built without function error injection (or DebugFS)")
+
+    if not os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"):
+        with open("/sys/kernel/debug/fail_function/inject", "w") as fp:
+            fp.write("page_pool_alloc_pages\n")
+
+    _write_fail_config({
+        "verbose": 0,
+        "interval": 511,
+        "probability": 100,
+        "times": -1,
+    })
+
+
+def _disable_pp_allocation_fail():
+    if not os.path.exists("/sys/kernel/debug/fail_function"):
+        return
+
+    if os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"):
+        with open("/sys/kernel/debug/fail_function/inject", "w") as fp:
+            fp.write("\n")
+
+    _write_fail_config({
+        "probability": 0,
+        "times": 0,
+    })
+
+
+def test_pp_alloc(cfg, netdevnl):
+    def get_stats():
+        return netdevnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0]
+
+    def check_traffic_flowing():
+        stat1 = get_stats()
+        time.sleep(1)
+        stat2 = get_stats()
+        if stat2['rx-packets'] - stat1['rx-packets'] < 15000:
+            raise KsftFailEx("Traffic seems low:", stat2['rx-packets'] - stat1['rx-packets'])
+
+
+    try:
+        stats = get_stats()
+    except NlError as e:
+        if e.nl_msg.error == -95:
+            stats = {}
+        else:
+            raise
+    if 'rx-alloc-fail' not in stats:
+        raise KsftSkipEx("Driver does not report 'rx-alloc-fail' via qstats")
+
+    set_g = False
+    traffic = None
+    try:
+        traffic = GenerateTraffic(cfg)
+
+        check_traffic_flowing()
+
+        _enable_pp_allocation_fail()
+
+        s1 = get_stats()
+        time.sleep(3)
+        s2 = get_stats()
+
+        if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 1:
+            raise KsftSkipEx("Allocation failures not increasing")
+        if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 100:
+            raise KsftSkipEx("Allocation increasing too slowly", s2['rx-alloc-fail'] - s1['rx-alloc-fail'],
+                             "packets:", s2['rx-packets'] - s1['rx-packets'])
+
+        # Basic failures are fine, try to wobble some settings to catch extra failures
+        check_traffic_flowing()
+        g = tool("ethtool", "-g " + cfg.ifname, json=True)[0]
+        if 'rx' in g and g["rx"] * 2 <= g["rx-max"]:
+            new_g = g['rx'] * 2
+        elif 'rx' in g:
+            new_g = g['rx'] // 2
+        else:
+            new_g = None
+
+        if new_g:
+            set_g = cmd(f"ethtool -G {cfg.ifname} rx {new_g}", fail=False).ret == 0
+            if set_g:
+                ksft_pr("ethtool -G change retval: success")
+            else:
+                ksft_pr("ethtool -G change retval: did not succeed", new_g)
+        else:
+                ksft_pr("ethtool -G change retval: did not try")
+
+        time.sleep(0.1)
+        check_traffic_flowing()
+    finally:
+        _disable_pp_allocation_fail()
+        if traffic:
+            traffic.stop()
+        time.sleep(0.1)
+        if set_g:
+            cmd(f"ethtool -G {cfg.ifname} rx {g['rx']}")
+
+
+def main() -> None:
+    netdevnl = NetdevFamily()
+    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+
+        ksft_run([test_pp_alloc], args=(cfg, netdevnl, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
index f84e9fdd0032..4769b4eb1ea1 100644
--- a/tools/testing/selftests/net/lib/py/ksft.py
+++ b/tools/testing/selftests/net/lib/py/ksft.py
@@ -11,6 +11,10 @@  KSFT_RESULT = None
 KSFT_RESULT_ALL = True
 
 
+class KsftFailEx(Exception):
+    pass
+
+
 class KsftSkipEx(Exception):
     pass