diff mbox series

[v2,3/3] xfs/{422,517}: kill background jobs on test termination

Message ID 20220621173729.2135249-4-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series aborted fstests may leave frozen fs behind | expand

Commit Message

Amir Goldstein June 21, 2022, 5:37 p.m. UTC
Those tests failed to cleanup background jobs properly after test
is interrupted and even sometimes when it completed succefully.

xfs/517 would sometime fails randomally with this false positive error:
     QA output created by 517
     Format and populate
     Concurrent fsmap and freeze
    +Terminated
     Test done

The tests have several background sub-shells that spawn short lived
programs in a loop.  By killing the spawned programs using killall,
killall could find no process to kill and the sub-shell loop could still
spawn another process that is not going to be killed and in the worst
case, the freeze_loop() could spawn the xfs_io "freeze" command after
test has thawn the fs before exit, which leaves the fs frozen after the
test.

The "Terminated" output is emitted by the sub-shell when killing the
programs that it has spawned when the loop did not finish before test
timeout.  By killing the sub-shell and not the spawned programs, we
avoid the false positive "Terminated" error.

Use a helper to perform this cleanup dance:
First kill and wait the freeze_loop so it won't try to freeze fs again
Then make sure fs is not frozen.
Then kill and wait for the rest of the sub-shells, because
if fs is frozen a killed writer process will never exit.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 tests/xfs/422 | 32 ++++++++++++++++++++++++++++++--
 tests/xfs/517 | 26 +++++++++++++++++++++++---
 2 files changed, 53 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/tests/xfs/422 b/tests/xfs/422
index a83a66df..fdbb8bf1 100755
--- a/tests/xfs/422
+++ b/tests/xfs/422
@@ -13,6 +13,32 @@  _begin_fstest dangerous_scrub dangerous_online_repair freeze
 
 _register_cleanup "_cleanup" BUS
 
+# First kill and wait the freeze loop so it won't try to freeze fs again
+# Then make sure fs is not frozen
+# Then kill and wait for the rest of the workers
+# Because if fs is frozen a killed writer will never exit
+kill_loops() {
+	local sig=$1
+
+	[ -n "$freeze_pid" ] && kill $sig $freeze_pid
+	wait $freeze_pid
+	unset freeze_pid
+	$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT
+	[ -n "$stress_pid" ] && kill $sig $stress_pid
+	[ -n "$repair_pid" ] && kill $sig $repair_pid
+	wait
+	unset stress_pid
+	unset repair_pid
+}
+
+# Override the default cleanup function.
+_cleanup()
+{
+	kill_loops -9 > /dev/null 2>&1
+	cd /
+	rm -rf $tmp.*
+}
+
 # Import common functions.
 . ./common/filter
 . ./common/fuzzy
@@ -78,8 +104,11 @@  end=$((start + (30 * TIME_FACTOR) ))
 
 echo "Loop started at $(date --date="@${start}"), ending at $(date --date="@${end}")" >> $seqres.full
 stress_loop $end &
+stress_pid=$!
 freeze_loop $end &
+freeze_pid=$!
 repair_loop $end &
+repair_pid=$!
 
 # Wait until 2 seconds after the loops should have finished...
 while [ "$(date +%s)" -lt $((end + 2)) ]; do
@@ -87,8 +116,7 @@  while [ "$(date +%s)" -lt $((end + 2)) ]; do
 done
 
 # ...and clean up after the loops in case they didn't do it themselves.
-$KILLALL_PROG -TERM xfs_io fsstress >> $seqres.full 2>&1
-$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT >> $seqres.full 2>&1
+kill_loops >> $seqres.full 2>&1
 
 echo "Loop finished at $(date)" >> $seqres.full
 echo "Test done"
diff --git a/tests/xfs/517 b/tests/xfs/517
index f7f9a8a2..6877af13 100755
--- a/tests/xfs/517
+++ b/tests/xfs/517
@@ -11,11 +11,29 @@  _begin_fstest auto quick fsmap freeze
 
 _register_cleanup "_cleanup" BUS
 
+# First kill and wait the freeze loop so it won't try to freeze fs again
+# Then make sure fs is not frozen
+# Then kill and wait for the rest of the workers
+# Because if fs is frozen a killed writer will never exit
+kill_loops() {
+	local sig=$1
+
+	[ -n "$freeze_pid" ] && kill $sig $freeze_pid
+	wait $freeze_pid
+	unset freeze_pid
+	$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT
+	[ -n "$stress_pid" ] && kill $sig $stress_pid
+	[ -n "$fsmap_pid" ] && kill $sig $fsmap_pid
+	wait
+	unset stress_pid
+	unset fsmap_pid
+}
+
 # Override the default cleanup function.
 _cleanup()
 {
+	kill_loops -9 > /dev/null 2>&1
 	cd /
-	$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT > /dev/null 2>&1
 	rm -rf $tmp.*
 }
 
@@ -83,8 +101,11 @@  end=$((start + (30 * TIME_FACTOR) ))
 
 echo "Loop started at $(date --date="@${start}"), ending at $(date --date="@${end}")" >> $seqres.full
 stress_loop $end &
+stress_pid=$!
 freeze_loop $end &
+freeze_pid=$!
 fsmap_loop $end &
+fsmap_pid=$!
 
 # Wait until 2 seconds after the loops should have finished...
 while [ "$(date +%s)" -lt $((end + 2)) ]; do
@@ -92,8 +113,7 @@  while [ "$(date +%s)" -lt $((end + 2)) ]; do
 done
 
 # ...and clean up after the loops in case they didn't do it themselves.
-$KILLALL_PROG -TERM xfs_io fsstress >> $seqres.full 2>&1
-$XFS_IO_PROG -x -c 'thaw' $SCRATCH_MNT >> $seqres.full 2>&1
+kill_loops >> $seqres.full 2>&1
 
 echo "Loop finished at $(date)" >> $seqres.full
 echo "Test done"