new file mode 100755
@@ -0,0 +1,199 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2021 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FSQA Test No. 236
+#
+# Test for fsync data loss after renaming a file or adding a hard link, with a
+# previous fsync of another file, as well as that mtime and ctime are correct.
+# Test both with COW and NOCOW writes.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1 # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+ _cleanup_flakey
+ cd /
+ rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs btrfs
+_require_scratch
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+# The comments inside the function mentioning specific inode numbers and IDs
+# (transactions, log commits, etc) are for the case where the function is run
+# on a freshly created filesystem, but the logic and reasoning still applies
+# for future invocations.
+test_fsync()
+{
+ local suffix=$1
+ local op_type=$2
+ local foo="$SCRATCH_MNT/foo_$suffix"
+ local bar="$SCRATCH_MNT/bar_$suffix"
+ local baz="$SCRATCH_MNT/baz_$suffix"
+ local digest_before
+ local digest_after
+ local mtime_before
+ local ctime_before
+ local mtime_after
+ local ctime_after
+
+ if [ "$op_type" != "rename" ] && [ "$op_type" != "link" ]; then
+ _fail "Operation type, 2nd argument, must be 'rename' or 'link'"
+ fi
+
+ # Create an extra empty file to be used later for syncing the log and
+ # bumping the committed log transaction ID by +1.
+ touch $bar
+
+ # This is the file we are interested in testing for data loss after an
+ # fsync. Create it and fill it with initial data, then fsync it.
+ #
+ # Before the fsync the inode has the full sync flag set, the current
+ # log transaction has an ID of 0 (first log transaction), the inode's
+ # ->logged_trans is 0, ->last_sub_trans is 0 and ->last_log_commit is -1.
+ #
+ # After the fsync the full sync flag is not set anymore, ->logged_trans
+ # is 6 (generation of the current transaction), ->last_log_commit is 0
+ # (which is the value of ->last_sub_trans) and ->last_sub_trans remains
+ # as 0.
+ #
+ # Also, after the fsync, the log transaction ID is bumped from 0 to 1.
+ #
+ $XFS_IO_PROG -f -c "pwrite -S 0xab 0 1M" -c "fsync" $baz >>$seqres.full
+ # File bar is inode 257.
+ # File baz is inode 258.
+
+ # During the rename or link operation below the following happens:
+ #
+ # We update inode 258 and that causes its ->last_sub_trans to be set to
+ # 1 (the current log transaction ID), and its ->last_log_commit remains
+ # with a value of 0 (meaning it was committed in the previous log
+ # transaction).
+ #
+ # After updating inode 258, because we have previously fsynced it, we
+ # log again the inode because it has a new name/dentry now. This results
+ # in updating its ->last_log_commit from 0 to 1 (the current value of
+ # its ->last_sub_trans).
+ #
+ if [ "$op_type" == "rename" ]; then
+ mv $baz $foo
+ else
+ ln $baz $foo
+ fi
+ # File bar is inode 257.
+ # File foo is inode 258.
+
+ # Before the next write, sleep for 1 second so that we can test if mtime
+ # and ctime are preserved after the power failure.
+ sleep 1
+
+ # This buffered write leaves ->last_sub_trans of inode 258 as 1, the ID
+ # of the current log transaction (inode->root->log_transid).
+ $XFS_IO_PROG -c "pwrite -S 0xcd 0 1M" $foo >>$seqres.full
+
+ # The fsync against inode 257, file bar, results in committing the log
+ # transaction with ID 1, updating inode->root->last_log_commit to 1, and
+ # bumping root->log_transid from 1 to 2.
+ $XFS_IO_PROG -c "fsync" $bar
+
+ # Now on fsync of inode 258, file foo, delalloc is flushed and, because
+ # the inode does not have the full sync flag set anymore, it only waits
+ # for the writeback to finish, it does not wait for the ordered extent
+ # to complete.
+ #
+ # If the ordered extent does not complete before btrfs_sync_file() calls
+ # btrfs_inode_in_log(), then we would not update the inode the log and
+ # sync the log, resulting in a guaranteed data loss after a power failure
+ # for COW writes and with slim chances for data loss as well for NOCOW
+ # writes, because the fsync would return success to user space without
+ # issuing barriers (REQ_PREFLUSH not sent to the block layer).
+ # That happened because btrfs_inode_in_log() would return true before the
+ # ordered extent completes, as in that case the inode has ->last_sub_trans
+ # set to 1, ->last_log_commit as 1 and root->last_log_commit is 1 as well.
+ #
+ # Also, for both COW and NOCOW writes, when the race happens we ended up
+ # not logging the inode, resulting in an outdated mtime and ctime after
+ # replaying the log.
+ #
+ $XFS_IO_PROG -c "fsync" $foo
+
+ # Simulate a power failure and then mount again the filesystem to replay
+ # the log tree.
+ # After the power failure We expect $foo, inode 258, to have 1M of data
+ # full of bytes with a value of 0xcd, and not 0xab.
+
+ digest_before=$(_md5_checksum $foo)
+ mtime_before=$(stat -c %Y $foo)
+ ctime_before=$(stat -c %Z $foo)
+
+ _flakey_drop_and_remount
+
+ digest_after=$(_md5_checksum $foo)
+ mtime_after=$(stat -c %Y $foo)
+ ctime_after=$(stat -c %Z $foo)
+
+ if [ $digest_after != $digest_before ]; then
+ echo -n "Incorrect data after log replay, "
+ echo "expected digest: $digest_before got: $digest_after"
+ fi
+
+ if [ $mtime_after != $mtime_before ]; then
+ echo "mtime not preserved, expected: $mtime_before got: $mtime_after"
+ fi
+
+ if [ $ctime_after != $ctime_before ]; then
+ echo "ctime not preserved, expected: $ctime_before got: $ctime_after"
+ fi
+}
+
+# Test first with data cow (it's the default, but be explicit to make it clear).
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+MOUNT_OPTIONS="-o datacow"
+_mount_flakey
+
+# Test a few times each scenario because this test was motivated by a race
+# condition.
+echo "Testing fsync after rename with COW writes"
+for ((i = 1; i <= 3; i++)); do
+ test_fsync "rename_cow_$i" "rename"
+done
+echo "Testing fsync after link with COW writes"
+for ((i = 1; i <= 3; i++)); do
+ test_fsync "link_cow_$i" "link"
+done
+
+# Now lets test with nodatacow.
+_unmount_flakey
+MOUNT_OPTIONS="-o nodatacow"
+_mount_flakey
+
+echo "Testing fsync after rename with NOCOW writes"
+for ((i = 1; i <= 3; i++)); do
+ test_fsync "rename_nocow_$i" "rename"
+done
+echo "Testing fsync after link with NOCOW writes"
+for ((i = 1; i <= 3; i++)); do
+ test_fsync "link_nocow_$i" "link"
+done
+
+_unmount_flakey
+
+status=0
+exit
new file mode 100644
@@ -0,0 +1,5 @@
+QA output created by 236
+Testing fsync after rename with COW writes
+Testing fsync after link with COW writes
+Testing fsync after rename with NOCOW writes
+Testing fsync after link with NOCOW writes
@@ -238,3 +238,4 @@
233 auto quick subvolume
234 auto quick compress rw
235 auto quick send
+236 auto quick log