diff mbox series

[RFC] btrfs-progs: mkfs: optimize file descriptor usage in mkfs.btrfs

Message ID 06b40e351b544a314178909772281994bb9de259.1706714983.git.anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show
Series [RFC] btrfs-progs: mkfs: optimize file descriptor usage in mkfs.btrfs | expand

Commit Message

Anand Jain Jan. 31, 2024, 3:49 p.m. UTC
I've reproduced the missing udev events issue without device partitions
using the test case as below. The test waits for the creation of 'by-uuid'
and, waiting indefinitely means successful reproduction. as below:

--------------------------------------------------
#!/bin/bash
usage()
{
	echo
	echo "Usage: ./t1 sdb btrfs"
	exit 1
}

: ${1?"arg1 <dev> is missing"} || usage
dev=$1

: ${2?"arg2 <fstype> is missing"} || usage
fstype=$2

systemd=$(rpm -q --queryformat='%{NAME}-%{VERSION}-%{RELEASE}' systemd)

run_testcase()
{
	local cnt=$1
	local ret=0
	local sleepcnt=0

	local newuuid=""
	local logpid=""
	local log=""
	local logfile="./udev_log_${systemd}_${fstype}.out"

	>$logfile

	wipefs -a /dev/${dev}* &>/dev/null
	sync
	udevadm settle /dev/$dev

	udevadm monitor -k -u -p > $logfile &
	logpid=$!
	>strace.out
	run "strace -f -ttT -o strace.out mkfs.$fstype -q -f /dev/$dev"

	newuuid=$(blkid -p /dev/$dev | awk '{print $2}' | sed -e 's/UUID=//' -e 's/\"//g')

	kill $logpid
	sync $logfile

	ret=-1
	while [ $ret != 0 ]
	do
		run -s -q "ls -lt /dev/disk/by-uuid | grep $newuuid"
		ret=$?
		((sleepcnt++))
		sleep 1
	done

	#for systemd-239-78.0.3.el8
	log=$(cat $logfile|grep ID_FS_TYPE_NEW)
	#for systemd-252-18.0.1.el9.x86_64
	#log=$(grep --text "ID_FS_UUID=${newuuid}" $logfile)

	echo $cnt sleepcnt=$sleepcnt newuuid=$newuuid ret=$ret log=$log
}

echo Test case: t1: version 3.
echo

run -o cat /etc/system-release
run -o uname -a
run -o rpm -q systemd
if [ $fstype == "btrfs" ]; then
	run -o mkfs.btrfs --version
elif [ $fstype == "xfs" ]; then
	run -o mkfs.xfs -V
else
	echo unknown fstype $fstype
fi
echo

for ((cnt = 0; cnt < 13; cnt++)); do
	run_testcase $cnt
done
-----------------------------------------------------------------


It appears that the problem is due to the convoluted nested device open
and device close in mkfs.btrfs as shown below:

-------------
 prepare_ctx opens all devices <-- fd1
   zero the super-block
   writes temp-sb to the first device.

 open_ctree opens the first device again <-- fd2

 prepare_ctx writes temp-sb to all the remaining devs <-- fd1

 fs_info->finalize_on_close = 1;
 close_ctree_fs_info()<-- writes valid <--- fd2

 prepare_ctx is closed <--- fd1.
-------------

This cleanup patch reuses the main file descriptor (fd1) in open_ctree(),
and with this change both the test cases (with partition and without
partition) now runs fine.

I've done an initial tests only, not validated with the multi-device mkfs.
More cleanup is possible but pending feedback;  marking this patch as an RFC.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 kernel-shared/disk-io.c | 30 ++++++++++++++++++++++++++++++
 kernel-shared/disk-io.h |  1 +
 mkfs/main.c             |  2 +-
 3 files changed, 32 insertions(+), 1 deletion(-)

Comments

Josef Bacik Jan. 31, 2024, 8:48 p.m. UTC | #1
On Wed, Jan 31, 2024 at 11:49:28PM +0800, Anand Jain wrote:
> I've reproduced the missing udev events issue without device partitions
> using the test case as below. The test waits for the creation of 'by-uuid'
> and, waiting indefinitely means successful reproduction. as below:
> 
> --------------------------------------------------
> #!/bin/bash
> usage()
> {
> 	echo
> 	echo "Usage: ./t1 sdb btrfs"
> 	exit 1
> }
> 
> : ${1?"arg1 <dev> is missing"} || usage
> dev=$1
> 
> : ${2?"arg2 <fstype> is missing"} || usage
> fstype=$2
> 
> systemd=$(rpm -q --queryformat='%{NAME}-%{VERSION}-%{RELEASE}' systemd)
> 
> run_testcase()
> {
> 	local cnt=$1
> 	local ret=0
> 	local sleepcnt=0
> 
> 	local newuuid=""
> 	local logpid=""
> 	local log=""
> 	local logfile="./udev_log_${systemd}_${fstype}.out"
> 
> 	>$logfile
> 
> 	wipefs -a /dev/${dev}* &>/dev/null
> 	sync
> 	udevadm settle /dev/$dev
> 
> 	udevadm monitor -k -u -p > $logfile &
> 	logpid=$!
> 	>strace.out
> 	run "strace -f -ttT -o strace.out mkfs.$fstype -q -f /dev/$dev"
> 
> 	newuuid=$(blkid -p /dev/$dev | awk '{print $2}' | sed -e 's/UUID=//' -e 's/\"//g')
> 
> 	kill $logpid
> 	sync $logfile
> 
> 	ret=-1
> 	while [ $ret != 0 ]
> 	do
> 		run -s -q "ls -lt /dev/disk/by-uuid | grep $newuuid"
> 		ret=$?
> 		((sleepcnt++))
> 		sleep 1
> 	done
> 
> 	#for systemd-239-78.0.3.el8
> 	log=$(cat $logfile|grep ID_FS_TYPE_NEW)
> 	#for systemd-252-18.0.1.el9.x86_64
> 	#log=$(grep --text "ID_FS_UUID=${newuuid}" $logfile)
> 
> 	echo $cnt sleepcnt=$sleepcnt newuuid=$newuuid ret=$ret log=$log
> }
> 
> echo Test case: t1: version 3.
> echo
> 
> run -o cat /etc/system-release
> run -o uname -a
> run -o rpm -q systemd
> if [ $fstype == "btrfs" ]; then
> 	run -o mkfs.btrfs --version
> elif [ $fstype == "xfs" ]; then
> 	run -o mkfs.xfs -V
> else
> 	echo unknown fstype $fstype
> fi
> echo
> 
> for ((cnt = 0; cnt < 13; cnt++)); do
> 	run_testcase $cnt
> done
> -----------------------------------------------------------------
> 
> 
> It appears that the problem is due to the convoluted nested device open
> and device close in mkfs.btrfs as shown below:
> 
> -------------
>  prepare_ctx opens all devices <-- fd1
>    zero the super-block
>    writes temp-sb to the first device.
> 
>  open_ctree opens the first device again <-- fd2
> 
>  prepare_ctx writes temp-sb to all the remaining devs <-- fd1
> 
>  fs_info->finalize_on_close = 1;
>  close_ctree_fs_info()<-- writes valid <--- fd2
> 
>  prepare_ctx is closed <--- fd1.
> -------------
> 
> This cleanup patch reuses the main file descriptor (fd1) in open_ctree(),
> and with this change both the test cases (with partition and without
> partition) now runs fine.
> 
> I've done an initial tests only, not validated with the multi-device mkfs.
> More cleanup is possible but pending feedback;  marking this patch as an RFC.

I'd like to see the cleaned up version of this patch, but I have a few comments.

1) I think re-using the fd is reasonable, tho could this just be reworked to
   create the temp-sb and write this to all the devs, close the file
   descriptors, and then call open_ctree?

2) I hate adding another thing into a core file that we'll have to figure out
   how to undo later as we sync more code from the kernel into btrfs-progs, I'm
   not sure if there's a way around this, but thinking harder about adding
   something to disk-io.c that is for userspace only would be good.

Thanks,

Josef
Qu Wenruo Feb. 1, 2024, 8:49 a.m. UTC | #2
On 2024/2/1 07:18, Josef Bacik wrote:
> On Wed, Jan 31, 2024 at 11:49:28PM +0800, Anand Jain wrote:
>> I've reproduced the missing udev events issue without device partitions
>> using the test case as below. The test waits for the creation of 'by-uuid'
>> and, waiting indefinitely means successful reproduction. as below:
>>
>> --------------------------------------------------
>> #!/bin/bash
>> usage()
>> {
>> 	echo
>> 	echo "Usage: ./t1 sdb btrfs"
>> 	exit 1
>> }
>>
>> : ${1?"arg1 <dev> is missing"} || usage
>> dev=$1
>>
>> : ${2?"arg2 <fstype> is missing"} || usage
>> fstype=$2
>>
>> systemd=$(rpm -q --queryformat='%{NAME}-%{VERSION}-%{RELEASE}' systemd)
>>
>> run_testcase()
>> {
>> 	local cnt=$1
>> 	local ret=0
>> 	local sleepcnt=0
>>
>> 	local newuuid=""
>> 	local logpid=""
>> 	local log=""
>> 	local logfile="./udev_log_${systemd}_${fstype}.out"
>>
>> 	>$logfile
>>
>> 	wipefs -a /dev/${dev}* &>/dev/null
>> 	sync
>> 	udevadm settle /dev/$dev
>>
>> 	udevadm monitor -k -u -p > $logfile &
>> 	logpid=$!
>> 	>strace.out
>> 	run "strace -f -ttT -o strace.out mkfs.$fstype -q -f /dev/$dev"
>>
>> 	newuuid=$(blkid -p /dev/$dev | awk '{print $2}' | sed -e 's/UUID=//' -e 's/\"//g')
>>
>> 	kill $logpid
>> 	sync $logfile
>>
>> 	ret=-1
>> 	while [ $ret != 0 ]
>> 	do
>> 		run -s -q "ls -lt /dev/disk/by-uuid | grep $newuuid"
>> 		ret=$?
>> 		((sleepcnt++))
>> 		sleep 1
>> 	done
>>
>> 	#for systemd-239-78.0.3.el8
>> 	log=$(cat $logfile|grep ID_FS_TYPE_NEW)
>> 	#for systemd-252-18.0.1.el9.x86_64
>> 	#log=$(grep --text "ID_FS_UUID=${newuuid}" $logfile)
>>
>> 	echo $cnt sleepcnt=$sleepcnt newuuid=$newuuid ret=$ret log=$log
>> }
>>
>> echo Test case: t1: version 3.
>> echo
>>
>> run -o cat /etc/system-release
>> run -o uname -a
>> run -o rpm -q systemd
>> if [ $fstype == "btrfs" ]; then
>> 	run -o mkfs.btrfs --version
>> elif [ $fstype == "xfs" ]; then
>> 	run -o mkfs.xfs -V
>> else
>> 	echo unknown fstype $fstype
>> fi
>> echo
>>
>> for ((cnt = 0; cnt < 13; cnt++)); do
>> 	run_testcase $cnt
>> done
>> -----------------------------------------------------------------
>>
>>
>> It appears that the problem is due to the convoluted nested device open
>> and device close in mkfs.btrfs as shown below:
>>
>> -------------
>>   prepare_ctx opens all devices <-- fd1
>>     zero the super-block
>>     writes temp-sb to the first device.
>>
>>   open_ctree opens the first device again <-- fd2
>>
>>   prepare_ctx writes temp-sb to all the remaining devs <-- fd1
>>
>>   fs_info->finalize_on_close = 1;
>>   close_ctree_fs_info()<-- writes valid <--- fd2
>>
>>   prepare_ctx is closed <--- fd1.
>> -------------

The problem is, even if we change the sequence, it doesn't make much
difference.

There are several things involved, and most of them are out of our control:

- The udev scan is triggered on writable fd close().
- The udev scan is always executed on the parent block device
   Not the partition device.
- The udev scan the whole disk, not just the partition

With those involved, changing the nested behavior would not change anything.

The write in another partition of the same parent block device can still
triggered a scan meanwhile we're making fs on our partition.

If you really want to change the behavior, you need to reuse all device
fd, and even with the reuse, you still can not solve the problem as mkfs
on another partition can still lead to the race.

Thus the proper way to do, is just to follow the udev's doc, to use
flock() on the parent block device.

Thanks,
Qu
>>
>> This cleanup patch reuses the main file descriptor (fd1) in open_ctree(),
>> and with this change both the test cases (with partition and without
>> partition) now runs fine.
>>
>> I've done an initial tests only, not validated with the multi-device mkfs.
>> More cleanup is possible but pending feedback;  marking this patch as an RFC.
>
> I'd like to see the cleaned up version of this patch, but I have a few comments.
>
> 1) I think re-using the fd is reasonable, tho could this just be reworked to
>     create the temp-sb and write this to all the devs, close the file
>     descriptors, and then call open_ctree?
>
> 2) I hate adding another thing into a core file that we'll have to figure out
>     how to undo later as we sync more code from the kernel into btrfs-progs, I'm
>     not sure if there's a way around this, but thinking harder about adding
>     something to disk-io.c that is for userspace only would be good.
>
> Thanks,
>
> Josef
>
David Sterba Feb. 1, 2024, 7:07 p.m. UTC | #3
On Thu, Feb 01, 2024 at 07:19:15PM +1030, Qu Wenruo wrote:
> The problem is, even if we change the sequence, it doesn't make much
> difference.
> 
> There are several things involved, and most of them are out of our control:
> 
> - The udev scan is triggered on writable fd close().
> - The udev scan is always executed on the parent block device
>    Not the partition device.
> - The udev scan the whole disk, not just the partition
> 
> With those involved, changing the nested behavior would not change anything.
> 
> The write in another partition of the same parent block device can still
> triggered a scan meanwhile we're making fs on our partition.

So this means that creating ext4 on /dev/sda1 and btrfs on /dev/sda2 can
trigger the udev events? And when both mkfs utilities would lock the
device then running them concurrently will fail, right? This could
happen in installation tools that can create different filesystems at
the same time.

I wonder if we should add options to assert, skip locking or wait until
it's free.
David Sterba Feb. 1, 2024, 7:18 p.m. UTC | #4
On Wed, Jan 31, 2024 at 03:48:00PM -0500, Josef Bacik wrote:
> On Wed, Jan 31, 2024 at 11:49:28PM +0800, Anand Jain wrote:
> > This cleanup patch reuses the main file descriptor (fd1) in open_ctree(),
> > and with this change both the test cases (with partition and without
> > partition) now runs fine.
> > 
> > I've done an initial tests only, not validated with the multi-device mkfs.
> > More cleanup is possible but pending feedback;  marking this patch as an RFC.
> 
> I'd like to see the cleaned up version of this patch, but I have a few comments.
> 
> 1) I think re-using the fd is reasonable, tho could this just be reworked to
>    create the temp-sb and write this to all the devs, close the file
>    descriptors, and then call open_ctree?

This would trigger the udev unnecessarily and could let other processes
to eg. try to mount the device (like systemd did or maybe still does).
This could fail the second open_ctree. That it's all done with one fd
open reduces possible interactions that could be problematic.

> 2) I hate adding another thing into a core file that we'll have to figure out
>    how to undo later as we sync more code from the kernel into btrfs-progs, I'm
>    not sure if there's a way around this, but thinking harder about adding
>    something to disk-io.c that is for userspace only would be good.

Yeah the open_ctree functions in progs are misplaced in disk-io.c and
are there for historical reasons. We'd need separate file (or maybe
whole compat directory) but for now let it be there, the cleanup is
going to be big.
Qu Wenruo Feb. 1, 2024, 8:50 p.m. UTC | #5
On 2024/2/2 05:37, David Sterba wrote:
> On Thu, Feb 01, 2024 at 07:19:15PM +1030, Qu Wenruo wrote:
>> The problem is, even if we change the sequence, it doesn't make much
>> difference.
>>
>> There are several things involved, and most of them are out of our control:
>>
>> - The udev scan is triggered on writable fd close().
>> - The udev scan is always executed on the parent block device
>>     Not the partition device.
>> - The udev scan the whole disk, not just the partition
>>
>> With those involved, changing the nested behavior would not change anything.
>>
>> The write in another partition of the same parent block device can still
>> triggered a scan meanwhile we're making fs on our partition.
>
> So this means that creating ext4 on /dev/sda1 and btrfs on /dev/sda2 can
> trigger the udev events? And when both mkfs utilities would lock the
> device then running them concurrently will fail, right? This could
> happen in installation tools that can create different filesystems at
> the same time.

For ext4, I didn't see any direct flock call, maybe they are using some
helpers to do the same work but I'm not 100% sure.

>
> I wonder if we should add options to assert, skip locking or wait until
> it's free.
> The "udevadm lock" has various options to do that.
I'm pretty sure we can add options like --wait to implement the same
deadline behavior for mkfs.btrfs.

Thanks,
Qu
Qu Wenruo Feb. 1, 2024, 9:59 p.m. UTC | #6
On 2024/2/1 07:18, Josef Bacik wrote:
> On Wed, Jan 31, 2024 at 11:49:28PM +0800, Anand Jain wrote:
>> I've reproduced the missing udev events issue without device partitions
>> using the test case as below. The test waits for the creation of 'by-uuid'
>> and, waiting indefinitely means successful reproduction. as below:
>>
>> --------------------------------------------------
>> #!/bin/bash
>> usage()
>> {
>> 	echo
>> 	echo "Usage: ./t1 sdb btrfs"
>> 	exit 1
>> }
>>
>> : ${1?"arg1 <dev> is missing"} || usage
>> dev=$1
>>
>> : ${2?"arg2 <fstype> is missing"} || usage
>> fstype=$2
>>
>> systemd=$(rpm -q --queryformat='%{NAME}-%{VERSION}-%{RELEASE}' systemd)
>>
>> run_testcase()
>> {
>> 	local cnt=$1
>> 	local ret=0
>> 	local sleepcnt=0
>>
>> 	local newuuid=""
>> 	local logpid=""
>> 	local log=""
>> 	local logfile="./udev_log_${systemd}_${fstype}.out"
>>
>> 	>$logfile
>>
>> 	wipefs -a /dev/${dev}* &>/dev/null
>> 	sync
>> 	udevadm settle /dev/$dev
>>
>> 	udevadm monitor -k -u -p > $logfile &
>> 	logpid=$!
>> 	>strace.out
>> 	run "strace -f -ttT -o strace.out mkfs.$fstype -q -f /dev/$dev"
>>
>> 	newuuid=$(blkid -p /dev/$dev | awk '{print $2}' | sed -e 's/UUID=//' -e 's/\"//g')
>>
>> 	kill $logpid
>> 	sync $logfile
>>
>> 	ret=-1
>> 	while [ $ret != 0 ]
>> 	do
>> 		run -s -q "ls -lt /dev/disk/by-uuid | grep $newuuid"
>> 		ret=$?
>> 		((sleepcnt++))
>> 		sleep 1
>> 	done
>>
>> 	#for systemd-239-78.0.3.el8
>> 	log=$(cat $logfile|grep ID_FS_TYPE_NEW)
>> 	#for systemd-252-18.0.1.el9.x86_64
>> 	#log=$(grep --text "ID_FS_UUID=${newuuid}" $logfile)
>>
>> 	echo $cnt sleepcnt=$sleepcnt newuuid=$newuuid ret=$ret log=$log
>> }
>>
>> echo Test case: t1: version 3.
>> echo
>>
>> run -o cat /etc/system-release
>> run -o uname -a
>> run -o rpm -q systemd
>> if [ $fstype == "btrfs" ]; then
>> 	run -o mkfs.btrfs --version
>> elif [ $fstype == "xfs" ]; then
>> 	run -o mkfs.xfs -V
>> else
>> 	echo unknown fstype $fstype
>> fi
>> echo
>>
>> for ((cnt = 0; cnt < 13; cnt++)); do
>> 	run_testcase $cnt
>> done
>> -----------------------------------------------------------------
>>
>>
>> It appears that the problem is due to the convoluted nested device open
>> and device close in mkfs.btrfs as shown below:

I did a full strace with extra debug output, the problem is not the
nested fds lifespan, but as I expected, a stray writeable fd.

In fact the problem is we have a pre-mature close on the first device,
if and only if using open_ctree_fs_info():

open_ctree_fs_info():
|- fp = open(oca->filename, oflags);
|- info = __open_ctree_fd();
|- close(fp);

That close() is the problem.
At that time, the fs is still just a temporary fs, without a valid btrfs
superblock.
Kicking off the udev scan at this timing is the worst.

We can keep that fd into oca, and close it after close_ctree().

And the problem should be gone, even we still keep the nested fd lifespan.

So although your patch is working, the explanation is no on the point.

And my flock() would cover the case, as it would prevent the udev scan
to be started.

I'll prepare a fix based on the findings.

Thanks,
Qu
>>
>> -------------
>>   prepare_ctx opens all devices <-- fd1
>>     zero the super-block
>>     writes temp-sb to the first device.
>>
>>   open_ctree opens the first device again <-- fd2
>>
>>   prepare_ctx writes temp-sb to all the remaining devs <-- fd1
>>
>>   fs_info->finalize_on_close = 1;
>>   close_ctree_fs_info()<-- writes valid <--- fd2
>>
>>   prepare_ctx is closed <--- fd1.
>> -------------
>>
>> This cleanup patch reuses the main file descriptor (fd1) in open_ctree(),
>> and with this change both the test cases (with partition and without
>> partition) now runs fine.
>>
>> I've done an initial tests only, not validated with the multi-device mkfs.
>> More cleanup is possible but pending feedback;  marking this patch as an RFC.
>
> I'd like to see the cleaned up version of this patch, but I have a few comments.
>
> 1) I think re-using the fd is reasonable, tho could this just be reworked to
>     create the temp-sb and write this to all the devs, close the file
>     descriptors, and then call open_ctree?
>
> 2) I hate adding another thing into a core file that we'll have to figure out
>     how to undo later as we sync more code from the kernel into btrfs-progs, I'm
>     not sure if there's a way around this, but thinking harder about adding
>     something to disk-io.c that is for userspace only would be good.
>
> Thanks,
>
> Josef
>
Anand Jain Feb. 2, 2024, 1:52 a.m. UTC | #7
On 2/2/24 00:48, David Sterba wrote:
> On Wed, Jan 31, 2024 at 03:48:00PM -0500, Josef Bacik wrote:
>> On Wed, Jan 31, 2024 at 11:49:28PM +0800, Anand Jain wrote:
>>> This cleanup patch reuses the main file descriptor (fd1) in open_ctree(),
>>> and with this change both the test cases (with partition and without
>>> partition) now runs fine.
>>>
>>> I've done an initial tests only, not validated with the multi-device mkfs.
>>> More cleanup is possible but pending feedback;  marking this patch as an RFC.
>>
>> I'd like to see the cleaned up version of this patch, but I have a few comments.
>>
>> 1) I think re-using the fd is reasonable, tho could this just be reworked to
>>     create the temp-sb and write this to all the devs, close the file
>>     descriptors, and then call open_ctree?


Multiple sequential open-close maybe fine.

> This would trigger the udev unnecessarily and could let other processes
> to eg. try to mount the device (like systemd did or maybe still does).

> This could fail the second open_ctree.

I think you are missing the point that when we write the temp-sb, there 
is no valid BTRFS magic. As a result, the kernel won't get the scan, and 
systemd won't attempt to mount.

However, there will be a udev event (due to the write-close), and in 
this udev event, we shall have:

ID_FS_UUID=<blank>

This action cleans the /dev/disk/by-uuid directory, which is fine.

> That it's all done with one fd
> open reduces possible interactions that could be problematic.

Hmm reduced possible interactions is problematic? I didn't get this.

Here is the sequence of events to help discuss:

Consider a single device, for now:

  fd1 writes zero and temp-sb.
  fd2 writes good-sb.
  fd2 closes.
  fd1 closes.

Please note that the fd which wrote zero/temp-sb (fd1) closes last.

Per udevadm monitor the change event OR the UUID is missing (depending 
on the systemd version).

ID_FS_UUID=<blank> instead of ID_FS_UUID=<UUID>

>> 2) I hate adding another thing into a core file that we'll have to figure out
>>     how to undo later as we sync more code from the kernel into btrfs-progs, I'm
>>     not sure if there's a way around this, but thinking harder about adding
>>     something to disk-io.c that is for userspace only would be good.
> 
> Yeah the open_ctree functions in progs are misplaced in disk-io.c and
> are there for historical reasons. We'd need separate file (or maybe
> whole compat directory) but for now let it be there, the cleanup is
> going to be big.


Thanks, Anand
diff mbox series

Patch

diff --git a/kernel-shared/disk-io.c b/kernel-shared/disk-io.c
index c053319200cb..b4951d229647 100644
--- a/kernel-shared/disk-io.c
+++ b/kernel-shared/disk-io.c
@@ -1660,6 +1660,36 @@  out:
 	return NULL;
 }
 
+struct btrfs_fs_info *open_ctree_fs_info_mkfs(int fd, struct open_ctree_args *oca)
+{
+	int ret;
+	struct stat st;
+	//int oflags = O_RDWR;
+	struct btrfs_fs_info *fs_info;
+
+	ret = stat(oca->filename, &st);
+	if (ret < 0) {
+		error("cannot stat '%s': %m", oca->filename);
+		return NULL;
+	}
+
+	if (!(((st.st_mode & S_IFMT) == S_IFREG) || ((st.st_mode & S_IFMT) == S_IFBLK))) {
+		error("not a regular file or block device: %s", oca->filename);
+		return NULL;
+	}
+
+	/*
+	if (!(oca->flags & OPEN_CTREE_WRITES))
+		oflags = O_RDONLY;
+
+	if ((oflags & O_RDWR) && zoned_model(oca->filename) == ZONED_HOST_MANAGED)
+		oflags |= O_DIRECT;
+	*/
+
+	fs_info = __open_ctree_fd(fd, oca);
+	return fs_info;
+}
+
 struct btrfs_fs_info *open_ctree_fs_info(struct open_ctree_args *oca)
 {
 	int fp;
diff --git a/kernel-shared/disk-io.h b/kernel-shared/disk-io.h
index 68cdf5b08d52..0f2aa24a50a1 100644
--- a/kernel-shared/disk-io.h
+++ b/kernel-shared/disk-io.h
@@ -199,6 +199,7 @@  struct open_ctree_args {
 };
 
 struct btrfs_fs_info *open_ctree_fs_info(struct open_ctree_args *oca);
+struct btrfs_fs_info *open_ctree_fs_info_mkfs(int fd, struct open_ctree_args *oca);
 int close_ctree_fs_info(struct btrfs_fs_info *fs_info);
 static inline int close_ctree(struct btrfs_root *root)
 {
diff --git a/mkfs/main.c b/mkfs/main.c
index b50b78b5665a..2526fb2317e5 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -1800,7 +1800,7 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 
 	oca.filename = file;
 	oca.flags = OPEN_CTREE_WRITES | OPEN_CTREE_TEMPORARY_SUPER;
-	fs_info = open_ctree_fs_info(&oca);
+	fs_info = open_ctree_fs_info_mkfs(prepare_ctx[0].fd, &oca);
 	if (!fs_info) {
 		error("open ctree failed");
 		goto error;