
generic/764: fsstress + migrate_pages() test

Message ID 20250326185101.2237319-1-mcgrof@kernel.org (mailing list archive)
State New
Series generic/764: fsstress + migrate_pages() test

Commit Message

Luis Chamberlain March 26, 2025, 6:50 p.m. UTC
0-day reported a page migration kernel warning with folios which happen
to be buffer-heads [0]. I'm having a terribly hard time reproducing the
bug, so I wrote this test to force page migration on filesystems.

It turns out we have no tests for page migration in fstests or LTP,
and it's no surprise: other than the compaction covered by generic/750,
there is no easy way to trigger page migration right now unless you
have a NUMA system.

We should evaluate whether we want to help stress test page migration
artificially, by later implementing a way to do page migration on
simple systems to an artificial target.

So far, this doesn't trigger any kernel splats, not even warnings for me.

Reported-by: kernel test robot <oliver.sang@intel.com>
Link: https://lore.kernel.org/r/202503101536.27099c77-lkp@intel.com # [0]
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 common/config         |  2 +
 common/rc             |  8 ++++
 tests/generic/764     | 94 +++++++++++++++++++++++++++++++++++++++++++
 tests/generic/764.out |  2 +
 4 files changed, 106 insertions(+)
 create mode 100755 tests/generic/764
 create mode 100644 tests/generic/764.out

Comments

Dave Chinner March 26, 2025, 9:10 p.m. UTC | #1
On Wed, Mar 26, 2025 at 11:50:55AM -0700, Luis Chamberlain wrote:
> 0-day reported a page migration kernel warning with folios which happen
> to be buffer-heads [0]. I'm having a terribly hard time reproducing the
> bug, so I wrote this test to force page migration on filesystems.
> 
> It turns out we have no tests for page migration in fstests or LTP,
> and it's no surprise: other than the compaction covered by generic/750,
> there is no easy way to trigger page migration right now unless you
> have a NUMA system.
> 
> We should evaluate whether we want to help stress test page migration
> artificially, by later implementing a way to do page migration on
> simple systems to an artificial target.
> 
> So far, this doesn't trigger any kernel splats, not even warnings for me.
> 
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Link: https://lore.kernel.org/r/202503101536.27099c77-lkp@intel.com # [0]
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> ---
>  common/config         |  2 +
>  common/rc             |  8 ++++
>  tests/generic/764     | 94 +++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/764.out |  2 +
>  4 files changed, 106 insertions(+)
>  create mode 100755 tests/generic/764
>  create mode 100644 tests/generic/764.out
> 
> diff --git a/common/config b/common/config
> index 2afbda141746..93b50f113b44 100644
> --- a/common/config
> +++ b/common/config
> @@ -239,6 +239,8 @@ export BTRFS_MAP_LOGICAL_PROG=$(type -P btrfs-map-logical)
>  export PARTED_PROG="$(type -P parted)"
>  export XFS_PROPERTY_PROG="$(type -P xfs_property)"
>  export FSCRYPTCTL_PROG="$(type -P fscryptctl)"
> +export NUMACTL_PROG="$(type -P numactl)"
> +export MIGRATEPAGES_PROG="$(type -P migratepages)"
>  
>  # udev wait functions.
>  #
> diff --git a/common/rc b/common/rc
> index e51686389a78..ed9613a9bf28 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -281,6 +281,14 @@ _require_vm_compaction()
>  	fi
>  }
>  
> +_require_numa_nodes()
> +{
> +	readarray -t QUEUE < <($NUMACTL_PROG --show | awk '/^membind:/ {for (i=2; i<=NF; i++) print $i}')

sed makes this easier: remove the membind token, then remove all the
lines that have ":"s left in them. This leaves behind the membind
node string.

$ numactl --show | sed -e 's/membind://' -e '/:/d'
 0 1 2 3
$

Also should have:

	_require_command "$NUMACTL_PROG" "numactl"

built into it, rather than requiring the test to declare it first.
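
Roughly, as an untested sketch combining both points (keeping the
patch's helper name and _notrun message):

	_require_numa_nodes()
	{
		local nodes

		_require_command "$NUMACTL_PROG" "numactl"

		# the membind: line holds the node list; strip the token
		# and drop every other line, leaving just the node numbers
		nodes=($($NUMACTL_PROG --show | sed -e 's/membind://' -e '/:/d'))
		if (( ${#nodes[@]} < 2 )); then
			_notrun "You need a system with at least two numa nodes to run this test"
		fi
	}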

> +	if (( ${#QUEUE[@]} < 2 )); then
> +		_notrun "You need a system with at least two numa nodes to run this test"
> +	fi
> +}



> +
>  # Requires CONFIG_DEBUGFS and truncation knobs
>  _require_split_huge_pages_knob()
>  {
> diff --git a/tests/generic/764 b/tests/generic/764
> new file mode 100755
> index 000000000000..91d9fb7e08da
> --- /dev/null
> +++ b/tests/generic/764
> @@ -0,0 +1,94 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2024 Luis Chamberlain.  All Rights Reserved.
> +#
> +# FS QA Test 764
> +#
> +# fsstress + migrate_pages() test
> +#
> +. ./common/preamble
> +_begin_fstest auto rw long_rw stress soak smoketest
> +
> +_cleanup()
> +{
> +	cd /
> +	rm -f $runfile
> +	rm -f $tmp.*
> +	kill -9 $run_migration_pid > /dev/null 2>&1
> +	kill -9 $stress_pid > /dev/null 2>&1
> +
> +	wait > /dev/null 2>&1
> +}

If you implement this using the fsstress wrappers like I mention
below, and get rid of running the main migration loop in the
background, this cleanup function can go away completely.

> +
> +_require_scratch
> +_require_command "$NUMACTL_PROG" "numactl"
> +_require_command "$MIGRATEPAGES_PROG" "migratepages"
> +_require_numa_nodes
> +
> +readarray -t QUEUE < <($NUMACTL_PROG --show | awk '/^membind:/ {for (i=2; i<=NF; i++) print $i}')
> +if (( ${#QUEUE[@]} < 2 )); then
> +	echo "Not enough NUMA nodes to pick two different ones."
> +	exit 1
> +fi

You've implemented this twice.

> +echo "Silence is golden"
> +
> +_scratch_mkfs > $seqres.full 2>&1
> +_scratch_mount >> $seqres.full 2>&1
> +
> +nr_cpus=$((LOAD_FACTOR * 4))
> +nr_ops=$((25000 * nr_cpus * TIME_FACTOR))

Don't scale ops with nr_cpus - you've already scaled processes
with nr_cpus.
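
i.e. keep the process count scaled, and scale the op count with time
only, something like:

	nr_ops=$((25000 * TIME_FACTOR))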

> +fsstress_args=(-w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus)
> +test -n "$SOAK_DURATION" && fsstress_args+=(--duration="$SOAK_DURATION")
> +
> +runfile="$tmp.migratepages"
> +pidfile="$tmp.stress.pid"
> +
> +run_stress_fs()
> +{
> +	$FSSTRESS_PROG $FSSTRESS_AVOID "${fsstress_args[@]}" &
> +	stress_pid=$!
> +	echo $stress_pid > $pidfile
> +	wait $stress_pid
> +	rm -f $runfile
> +	rm -f $pidfile
> +}

Don't reimplement _run_fsstress(), call it instead.

> +
> +run_stress_fs &

Actually, you want _run_fsstress_bg() here, and then
_kill_fsstress() when you want it to die.

> +touch $runfile
> +stress_pid=$(cat $pidfile)

Don't need either of these.

> +
> +while [ -e $runfile ]; do

while [ -n "$_FSSTRESS_PID" ]; do


> +	readarray -t QUEUE < <($NUMACTL_PROG --show | awk '/^membind:/ {for (i=2; i<=NF; i++) print $i}')

Third time this is implemented.

> +	# Proper Fisher–Yates shuffle
> +	for ((i=${#QUEUE[@]} - 1; i > 0; i--)); do
> +		j=$((RANDOM % (i + 1)))
> +		var=${QUEUE[i]}
> +		QUEUE[i]=${QUEUE[j]}
> +		QUEUE[j]=$var
> +	done
> +
> +	RANDOM_NODE_1=${QUEUE[0]}
> +	RANDOM_NODE_2=${QUEUE[1]}

If all you are doing is picking two random nodes, then you could
just use RANDOM for the array index and drop the whole shuffle
thing, yes?
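
Untested, but something like this picks two distinct nodes without
any shuffle:

	# random first index, then a distinct second index by adding a
	# non-zero offset modulo the array size
	i=$((RANDOM % ${#QUEUE[@]}))
	j=$(( (i + 1 + RANDOM % (${#QUEUE[@]} - 1)) % ${#QUEUE[@]} ))
	RANDOM_NODE_1=${QUEUE[i]}
	RANDOM_NODE_2=${QUEUE[j]}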

> +	if [[ -f $pidfile ]]; then

no need for this if we gate the loop on _FSSTRESS_PID

> +		echo "migrating parent fsstress process:" >> $seqres.full
> +		echo -en "\t$MIGRATEPAGES_PROG $stress_pid $RANDOM_NODE_1 $RANDOM_NODE_2 ..." >> $seqres.full
> +		$MIGRATEPAGES_PROG $stress_pid $RANDOM_NODE_1 $RANDOM_NODE_2
> +		echo " $?" >> $seqres.full
> +		echo "migrating child fsstress processes ..." >> $seqres.full
> +		for pid in $(ps --ppid "$stress_pid" -o pid=); do
> +			echo -en "\tmigratepages $pid $RANDOM_NODE_1 $RANDOM_NODE_2 ..." >> $seqres.full
> +			$MIGRATEPAGES_PROG $pid $RANDOM_NODE_1 $RANDOM_NODE_2
> +			echo " $?" >> $seqres.full
> +		done
> +	fi
> +	sleep 2
> +done &
> +run_migration_pid=$!

why is this put in the background, only to then wait on it to
complete? The loop will stop when fsstress finishes, yes?
Which means this doesn't need to be run in the background at all,
and then cleanup doesn't need to handle killing this, either.
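
Untested sketch of the structure I mean, using the fsstress wrappers
(assuming _run_fsstress_bg sets _FSSTRESS_PID; node selection elided):

	_run_fsstress_bg "${fsstress_args[@]}"
	while [ -n "$_FSSTRESS_PID" ] && kill -0 "$_FSSTRESS_PID" 2>/dev/null; do
		# ... pick RANDOM_NODE_1 / RANDOM_NODE_2 here ...
		$MIGRATEPAGES_PROG "$_FSSTRESS_PID" $RANDOM_NODE_1 $RANDOM_NODE_2
		for pid in $(ps --ppid "$_FSSTRESS_PID" -o pid=); do
			$MIGRATEPAGES_PROG "$pid" $RANDOM_NODE_1 $RANDOM_NODE_2
		done
		sleep 2
	done
	wait	# reap fsstress once it has run its ops to completion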

-Dave.
Jan Kara March 27, 2025, 11:53 a.m. UTC | #2
On Wed 26-03-25 11:50:55, Luis Chamberlain wrote:
> 0-day reported a page migration kernel warning with folios which happen
> to be buffer-heads [0]. I'm having a terribly hard time reproducing the
> bug, so I wrote this test to force page migration on filesystems.
> 
> It turns out we have no tests for page migration in fstests or LTP,
> and it's no surprise: other than the compaction covered by generic/750,
> there is no easy way to trigger page migration right now unless you
> have a NUMA system.
> 
> We should evaluate whether we want to help stress test page migration
> artificially, by later implementing a way to do page migration on
> simple systems to an artificial target.
> 
> So far, this doesn't trigger any kernel splats, not even warnings for me.
> 
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Link: https://lore.kernel.org/r/202503101536.27099c77-lkp@intel.com # [0]
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>

So when I was testing page migration in the past MM guys advised me to use
THP compaction as a way to trigger page migration. You can manually
trigger compaction by:

echo 1 >/proc/sys/vm/compact_memory

So you first mess with the page cache a bit to fragment memory and then
call the above to try to compact it back...
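
A minimal illustration of that sequence (the file count and sizes
here are arbitrary):

	# dirty a pile of small files to fragment the page cache
	for i in $(seq 1 128); do
		dd if=/dev/zero of=$SCRATCH_MNT/frag.$i bs=4k count=256 status=none
	done
	# then trigger compaction, which migrates movable pages
	echo 1 > /proc/sys/vm/compact_memory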

								Honza
Dave Chinner March 27, 2025, 8:22 p.m. UTC | #3
On Thu, Mar 27, 2025 at 12:53:30PM +0100, Jan Kara wrote:
> On Wed 26-03-25 11:50:55, Luis Chamberlain wrote:
> > 0-day reported a page migration kernel warning with folios which happen
> > to be buffer-heads [0]. I'm having a terribly hard time reproducing the
> > bug, so I wrote this test to force page migration on filesystems.
> > 
> > It turns out we have no tests for page migration in fstests or LTP,
> > and it's no surprise: other than the compaction covered by generic/750,
> > there is no easy way to trigger page migration right now unless you
> > have a NUMA system.
> > 
> > We should evaluate whether we want to help stress test page migration
> > artificially, by later implementing a way to do page migration on
> > simple systems to an artificial target.
> > 
> > So far, this doesn't trigger any kernel splats, not even warnings for me.
> > 
> > Reported-by: kernel test robot <oliver.sang@intel.com>
> > Link: https://lore.kernel.org/r/202503101536.27099c77-lkp@intel.com # [0]
> > Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> 
> So when I was testing page migration in the past MM guys advised me to use
> THP compaction as a way to trigger page migration. You can manually
> trigger compaction by:
> 
> echo 1 >/proc/sys/vm/compact_memory

Right, that's what generic/750 does. It runs fsstress and every 5
seconds runs memory compaction in the background.

> So you first mess with the page cache a bit to fragment memory and then
> call the above to try to compact it back...

Which is effectively what g/750 tries to exercise.

When it's run by check-parallel, compaction ends up doing a lot
more work over a much wider range of tests...

-Dave.
Luis Chamberlain March 27, 2025, 9:35 p.m. UTC | #4
On Fri, Mar 28, 2025 at 07:22:45AM +1100, Dave Chinner wrote:
> On Thu, Mar 27, 2025 at 12:53:30PM +0100, Jan Kara wrote:
> > On Wed 26-03-25 11:50:55, Luis Chamberlain wrote:
> > > 0-day reported a page migration kernel warning with folios which happen
> > > to be buffer-heads [0]. I'm having a terribly hard time reproducing the
> > > bug, so I wrote this test to force page migration on filesystems.
> > > 
> > > It turns out we have no tests for page migration in fstests or LTP,
> > > and it's no surprise: other than the compaction covered by generic/750,
> > > there is no easy way to trigger page migration right now unless you
> > > have a NUMA system.
> > > 
> > > We should evaluate whether we want to help stress test page migration
> > > artificially, by later implementing a way to do page migration on
> > > simple systems to an artificial target.
> > > 
> > > So far, this doesn't trigger any kernel splats, not even warnings for me.
> > > 
> > > Reported-by: kernel test robot <oliver.sang@intel.com>
> > > Link: https://lore.kernel.org/r/202503101536.27099c77-lkp@intel.com # [0]
> > > Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> > 
> > So when I was testing page migration in the past MM guys advised me to use
> > THP compaction as a way to trigger page migration. You can manually
> > trigger compaction by:
> > 
> > echo 1 >/proc/sys/vm/compact_memory
> 
> Right, that's what generic/750 does. IT runs fsstress and every 5
> seconds runs memory compaction in the background.
> 
> > So you first mess with the page cache a bit to fragment memory and then
> > call the above to try to compact it back...
> 
> Which is effectively what g/750 tries to exercise.

Indeed. And I've tried g/750 for over 24 hours trying to reproduce the
issue reported by Oliver, and I was not able to, so this augments the
coverage.

The original report by Oliver was about LTP syscalls-04/close_range01
triggering the spinlock on the buffer_migrate_folio_norefs() path,
which takes a lock and then enters a sleeping context. But the report
indicates the test ran with btrfs, and btrfs does not use
buffer_migrate_folio_norefs(). Although the splat and Matthew's
diagnosis make it clear the spinlock needs fixing, it would be good to
reproduce this issue. But this has been hard.

In fact, there are only a few users of buffer_migrate_folio_norefs()
left; ext4 is one of them, as is the block layer.

I wrote this test to see if it might help exercise another path: the
migration of pages across NUMA nodes with ext4. But sadly I can't
reproduce the issue yet.

I'm next trying fio against a block device directory and then looping
with migratepages on the pid, essentially bouncing fio's memory from
one node to another in a loop. And... nothing yet, even if I then also
loop triggering compaction.
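
Roughly like this untested sketch (the fio job parameters and the node
numbers 0/1 are just placeholders):

	fio --name=bounce --filename=$SCRATCH_MNT/fio.dat --size=1g \
	    --rw=randwrite --bs=4k --time_based --runtime=300 &
	fio_pid=$!
	while kill -0 $fio_pid 2>/dev/null; do
		# bounce fio's pages back and forth between two nodes
		migratepages $fio_pid 0 1
		migratepages $fio_pid 1 0
		sleep 1
	done
	wait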

Syzbot recently provided another reproducer in C [0], but that hasn't
let me reproduce the issue yet either.

> When it's run by check-parallel, compaction ends up doing a lot
> more work over a much wider range of tests...

Yeah, I would hope the issue is reproducible with check-parallel. I
haven't been able to run it yet, but as soon as I do I am going to be
super happy due to the huge benefits this will bring to testing.

[0] https://lkml.kernel.org/r/67e57c41.050a0220.2f068f.0033.GAE@google.com

  Luis

Patch

diff --git a/common/config b/common/config
index 2afbda141746..93b50f113b44 100644
--- a/common/config
+++ b/common/config
@@ -239,6 +239,8 @@  export BTRFS_MAP_LOGICAL_PROG=$(type -P btrfs-map-logical)
 export PARTED_PROG="$(type -P parted)"
 export XFS_PROPERTY_PROG="$(type -P xfs_property)"
 export FSCRYPTCTL_PROG="$(type -P fscryptctl)"
+export NUMACTL_PROG="$(type -P numactl)"
+export MIGRATEPAGES_PROG="$(type -P migratepages)"
 
 # udev wait functions.
 #
diff --git a/common/rc b/common/rc
index e51686389a78..ed9613a9bf28 100644
--- a/common/rc
+++ b/common/rc
@@ -281,6 +281,14 @@  _require_vm_compaction()
 	fi
 }
 
+_require_numa_nodes()
+{
+	readarray -t QUEUE < <($NUMACTL_PROG --show | awk '/^membind:/ {for (i=2; i<=NF; i++) print $i}')
+	if (( ${#QUEUE[@]} < 2 )); then
+		_notrun "You need a system with at least two numa nodes to run this test"
+	fi
+}
+
 # Requires CONFIG_DEBUGFS and truncation knobs
 _require_split_huge_pages_knob()
 {
diff --git a/tests/generic/764 b/tests/generic/764
new file mode 100755
index 000000000000..91d9fb7e08da
--- /dev/null
+++ b/tests/generic/764
@@ -0,0 +1,94 @@ 
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024 Luis Chamberlain.  All Rights Reserved.
+#
+# FS QA Test 764
+#
+# fsstress + migrate_pages() test
+#
+. ./common/preamble
+_begin_fstest auto rw long_rw stress soak smoketest
+
+_cleanup()
+{
+	cd /
+	rm -f $runfile
+	rm -f $tmp.*
+	kill -9 $run_migration_pid > /dev/null 2>&1
+	kill -9 $stress_pid > /dev/null 2>&1
+
+	wait > /dev/null 2>&1
+}
+
+_require_scratch
+_require_command "$NUMACTL_PROG" "numactl"
+_require_command "$MIGRATEPAGES_PROG" "migratepages"
+_require_numa_nodes
+
+readarray -t QUEUE < <($NUMACTL_PROG --show | awk '/^membind:/ {for (i=2; i<=NF; i++) print $i}')
+if (( ${#QUEUE[@]} < 2 )); then
+	echo "Not enough NUMA nodes to pick two different ones."
+	exit 1
+fi
+
+echo "Silence is golden"
+
+_scratch_mkfs > $seqres.full 2>&1
+_scratch_mount >> $seqres.full 2>&1
+
+nr_cpus=$((LOAD_FACTOR * 4))
+nr_ops=$((25000 * nr_cpus * TIME_FACTOR))
+fsstress_args=(-w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus)
+test -n "$SOAK_DURATION" && fsstress_args+=(--duration="$SOAK_DURATION")
+
+runfile="$tmp.migratepages"
+pidfile="$tmp.stress.pid"
+
+run_stress_fs()
+{
+	$FSSTRESS_PROG $FSSTRESS_AVOID "${fsstress_args[@]}" &
+	stress_pid=$!
+	echo $stress_pid > $pidfile
+	wait $stress_pid
+	rm -f $runfile
+	rm -f $pidfile
+}
+
+run_stress_fs &
+touch $runfile
+
+stress_pid=$(cat $pidfile)
+
+while [ -e $runfile ]; do
+	readarray -t QUEUE < <($NUMACTL_PROG --show | awk '/^membind:/ {for (i=2; i<=NF; i++) print $i}')
+	# Proper Fisher–Yates shuffle
+	for ((i=${#QUEUE[@]} - 1; i > 0; i--)); do
+		j=$((RANDOM % (i + 1)))
+		var=${QUEUE[i]}
+		QUEUE[i]=${QUEUE[j]}
+		QUEUE[j]=$var
+	done
+
+	RANDOM_NODE_1=${QUEUE[0]}
+	RANDOM_NODE_2=${QUEUE[1]}
+
+	if [[ -f $pidfile ]]; then
+		echo "migrating parent fsstress process:" >> $seqres.full
+		echo -en "\t$MIGRATEPAGES_PROG $stress_pid $RANDOM_NODE_1 $RANDOM_NODE_2 ..." >> $seqres.full
+		$MIGRATEPAGES_PROG $stress_pid $RANDOM_NODE_1 $RANDOM_NODE_2
+		echo " $?" >> $seqres.full
+		echo "migrating child fsstress processes ..." >> $seqres.full
+		for pid in $(ps --ppid "$stress_pid" -o pid=); do
+			echo -en "\tmigratepages $pid $RANDOM_NODE_1 $RANDOM_NODE_2 ..." >> $seqres.full
+			$MIGRATEPAGES_PROG $pid $RANDOM_NODE_1 $RANDOM_NODE_2
+			echo " $?" >> $seqres.full
+		done
+	fi
+	sleep 2
+done &
+run_migration_pid=$!
+
+wait > /dev/null 2>&1
+
+status=0
+exit
diff --git a/tests/generic/764.out b/tests/generic/764.out
new file mode 100644
index 000000000000..bb58e5b8957f
--- /dev/null
+++ b/tests/generic/764.out
@@ -0,0 +1,2 @@ 
+QA output created by 764
+Silence is golden