diff mbox series

ocfs2: Fix wrong search logic in __ocfs2_resv_find_window

Message ID 20230406154052.9863-1-heming.zhao@suse.com (mailing list archive)
State New, archived
Headers show
Series ocfs2: Fix wrong search logic in __ocfs2_resv_find_window | expand

Commit Message

heming.zhao@suse.com April 6, 2023, 3:40 p.m. UTC
** problem **

Current code triggers a defragfs bug [1]:

```
tw-tst:~ # defragfs.ocfs2 /mnt/test_from_dd1
defragfs.ocfs2 1.8.7
[ERROR]Move extent failed:"/mnt/test_from_dd1" - No space left on device
[ERROR]"/mnt/test_from_dd1":No space left on device
```

I added some debug messages in relevant functions. When above error
messages appeared, the la-window still had enough continuous bitmap
to use, the -ENOSPC is wrong.

** analysis **

__ocfs2_resv_find_window should use resv node from rb-tree to do linear
search. But current code logic uses un-managered area between two resvs.
The searching logic deviates from this func job, which should only focus
on reservation areas (when rb_root is non NULL). Another issue of
__ocfs2_resv_find_window is more inclined to generate inner fragment.
It doesn't search & consume existing reservations but be apt to create
new reservation item.

This patch pulls this func (__ocfs2_resv_find_window) to do what it
should do: search & consume resev. if this func fails to get suitable
bitmap area, caller ocfs2_resv_find_window fallbacks to use oldest
resv from LRU to do the final search.

[1]: https://bugzilla.suse.com/show_bug.cgi?id=1131931

Signed-off-by: Heming Zhao <heming.zhao@suse.com>
---
 fs/ocfs2/reservations.c | 48 +++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 14 deletions(-)

Comments

heming.zhao@suse.com April 7, 2023, 3:42 p.m. UTC | #1
Hello Joseph,

On Thu, Apr 06, 2023 at 11:40:52PM +0800, Heming Zhao wrote:
> ** problem **
> 
> Current code triggers a defragfs bug [1]:
> 
> ```
> tw-tst:~ # defragfs.ocfs2 /mnt/test_from_dd1
> defragfs.ocfs2 1.8.7
> [ERROR]Move extent failed:"/mnt/test_from_dd1" - No space left on device
> [ERROR]"/mnt/test_from_dd1":No space left on device
> ```
> 
> I added some debug messages in relevant functions. When above error
> messages appeared, the la-window still had enough continuous bitmap
> to use, the -ENOSPC is wrong.
> 

There is another problem with la-window alloc flow, and also should have a patch.
When reservatioins are disabled, ocfs2_local_alloc_find_clear_bits starts linear
search, and returns 'bitoff:-1' if 'numfound < *numbits'. In my view, this code
logic is wrong, in this case, it should return max suitable numfound.

The pseudo code:

```
diff -Nupr a/localalloc.c b/localalloc.c
--- a/localalloc.c	2023-04-07 23:33:42.077083405 +0800
+++ b/localalloc.c	2023-04-07 23:35:10.756344502 +0800
@@ -835,6 +835,7 @@ static int ocfs2_local_alloc_find_clear_
 				     struct ocfs2_alloc_reservation *resv)
 {
 	int numfound = 0, bitoff, left, startoff;
+	unsigned int best_start, best_len = 0;
 	int local_resv = 0;
 	struct ocfs2_alloc_reservation r;
 	void *bitmap = NULL;
@@ -940,6 +941,10 @@ static int ocfs2_local_alloc_find_clear_
 			numfound = 1;
 			startoff = bitoff+1;
 		}
+		if (numfound > best_len) {
+			best_len = numfound;
+			best_start = startoff - numfound;
+		}
 		/* we got everything we needed */
 		if (numfound == *numbits) {
 			/* mlog(0, "Found it all!\n"); */
@@ -949,10 +954,12 @@ static int ocfs2_local_alloc_find_clear_
 
 	trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
 
-	if (numfound == *numbits)
-		bitoff = startoff - numfound;
-	else
+	if (best_len = 0) {
 		bitoff = -1;
+	} else {
+		bitoff = best_start;
+		*numbits = best_len;
+	}
 
 bail:
 	if (local_resv)
```

If you agree with my thinking, I will test & file a formal patch.

Thanks,
Heming
Joseph Qi April 18, 2023, 9:43 a.m. UTC | #2
Sorry for the late replay since I was busy with other projects these
days. Will take a look at this patch this weekend.

Thanks,
Joseph

On 4/6/23 11:40 PM, Heming Zhao wrote:
> ** problem **
> 
> Current code triggers a defragfs bug [1]:
> 
> ```
> tw-tst:~ # defragfs.ocfs2 /mnt/test_from_dd1
> defragfs.ocfs2 1.8.7
> [ERROR]Move extent failed:"/mnt/test_from_dd1" - No space left on device
> [ERROR]"/mnt/test_from_dd1":No space left on device
> ```
> 
> I added some debug messages in relevant functions. When above error
> messages appeared, the la-window still had enough continuous bitmap
> to use, the -ENOSPC is wrong.
> 
> ** analysis **
> 
> __ocfs2_resv_find_window should use resv node from rb-tree to do linear
> search. But current code logic uses un-managered area between two resvs.
> The searching logic deviates from this func job, which should only focus
> on reservation areas (when rb_root is non NULL). Another issue of
> __ocfs2_resv_find_window is more inclined to generate inner fragment.
> It doesn't search & consume existing reservations but be apt to create
> new reservation item.
> 
> This patch pulls this func (__ocfs2_resv_find_window) to do what it
> should do: search & consume resev. if this func fails to get suitable
> bitmap area, caller ocfs2_resv_find_window fallbacks to use oldest
> resv from LRU to do the final search.
> 
> [1]: https://bugzilla.suse.com/show_bug.cgi?id=1131931
> 
> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> ---
>  fs/ocfs2/reservations.c | 48 +++++++++++++++++++++++++++++------------
>  1 file changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
> index a9d1296d736d..eda672622d1d 100644
> --- a/fs/ocfs2/reservations.c
> +++ b/fs/ocfs2/reservations.c
> @@ -458,10 +458,11 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  {
>  	struct rb_root *root = &resmap->m_reservations;
>  	unsigned int gap_start, gap_end, gap_len;
> -	struct ocfs2_alloc_reservation *prev_resv, *next_resv;
> +	struct ocfs2_alloc_reservation *prev_resv, *next_resv, *best_resv;
>  	struct rb_node *prev, *next;
>  	unsigned int cstart, clen;
>  	unsigned int best_start = 0, best_len = 0;
> +	int create_new = 0;
>  
>  	/*
>  	 * Nasty cases to consider:
> @@ -540,8 +541,9 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  		if (clen) {
>  			best_len = clen;
>  			best_start = cstart;
> +			create_new = 1;
>  			if (best_len == wanted)
> -				goto out_insert;
> +				goto out_create;
>  		}
>  
>  		prev_resv = next_resv;
> @@ -557,13 +559,9 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  	while (1) {
>  		next = rb_next(prev);
>  		if (next) {
> -			next_resv = rb_entry(next,
> -					     struct ocfs2_alloc_reservation,
> -					     r_node);
> -
> -			gap_start = ocfs2_resv_end(prev_resv) + 1;
> -			gap_end = next_resv->r_start - 1;
> -			gap_len = gap_end - gap_start + 1;
> +			gap_start = prev_resv->r_start;
> +			gap_end = prev_resv->r_start + prev_resv->r_len - 1;
> +			gap_len = prev_resv->r_len;
>  		} else {
>  			/*
>  			 * We're at the rightmost edge of the
> @@ -575,8 +573,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  			gap_end = resmap->m_bitmap_len - 1;
>  		}
>  
> -		trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
> -					next ? ocfs2_resv_end(next_resv) : -1);
> +		trace_ocfs2_resv_find_window_next(next ? prev_resv->r_start : -1,
> +					next ? ocfs2_resv_end(prev_resv) : -1);
>  		/*
>  		 * No need to check this gap if we have already found
>  		 * a larger region of free bits.
> @@ -589,10 +587,16 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  		if (clen == wanted) {
>  			best_len = clen;
>  			best_start = cstart;
> -			goto out_insert;
> +			best_resv = prev_resv;
> +			if (!next)
> +				goto out_create;
> +			else
> +				goto out_insert;
>  		} else if (clen > best_len) {
>  			best_len = clen;
>  			best_start = cstart;
> +			best_resv = prev_resv;
> +			create_new = 0;
>  		}
>  
>  next_resv:
> @@ -604,12 +608,28 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  				     r_node);
>  	}
>  
> -out_insert:
> -	if (best_len) {
> +	if (!best_len)
> +		return;
> +
> +	if (create_new) {
> +out_create:
>  		resv->r_start = best_start;
>  		resv->r_len = best_len;
>  		ocfs2_resv_insert(resmap, resv);
> +		return;
>  	}
> +
> +out_insert:
> +	if (best_resv->r_len <= wanted) {
> +		resv->r_start = best_start;
> +		resv->r_len = best_len;
> +		__ocfs2_resv_discard(resmap, best_resv);
> +	} else {
> +		best_resv->r_len -= best_len;
> +		resv->r_start = ocfs2_resv_end(best_resv) + 1;
> +		resv->r_len = best_len;
> +	}
> +	ocfs2_resv_insert(resmap, resv);
>  }
>  
>  static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
Joseph Qi April 21, 2023, 7:35 a.m. UTC | #3
Hi,
Could you please share a reproducer?

Thanks,
Joseph

On 4/6/23 11:40 PM, Heming Zhao wrote:
> ** problem **
> 
> Current code triggers a defragfs bug [1]:
> 
> ```
> tw-tst:~ # defragfs.ocfs2 /mnt/test_from_dd1
> defragfs.ocfs2 1.8.7
> [ERROR]Move extent failed:"/mnt/test_from_dd1" - No space left on device
> [ERROR]"/mnt/test_from_dd1":No space left on device
> ```
> 
> I added some debug messages in relevant functions. When above error
> messages appeared, the la-window still had enough continuous bitmap
> to use, the -ENOSPC is wrong.
> 
> ** analysis **
> 
> __ocfs2_resv_find_window should use resv node from rb-tree to do linear
> search. But current code logic uses un-managered area between two resvs.
> The searching logic deviates from this func job, which should only focus
> on reservation areas (when rb_root is non NULL). Another issue of
> __ocfs2_resv_find_window is more inclined to generate inner fragment.
> It doesn't search & consume existing reservations but be apt to create
> new reservation item.
> 
> This patch pulls this func (__ocfs2_resv_find_window) to do what it
> should do: search & consume resev. if this func fails to get suitable
> bitmap area, caller ocfs2_resv_find_window fallbacks to use oldest
> resv from LRU to do the final search.
> 
> [1]: https://bugzilla.suse.com/show_bug.cgi?id=1131931
> 
> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> ---
>  fs/ocfs2/reservations.c | 48 +++++++++++++++++++++++++++++------------
>  1 file changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
> index a9d1296d736d..eda672622d1d 100644
> --- a/fs/ocfs2/reservations.c
> +++ b/fs/ocfs2/reservations.c
> @@ -458,10 +458,11 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  {
>  	struct rb_root *root = &resmap->m_reservations;
>  	unsigned int gap_start, gap_end, gap_len;
> -	struct ocfs2_alloc_reservation *prev_resv, *next_resv;
> +	struct ocfs2_alloc_reservation *prev_resv, *next_resv, *best_resv;
>  	struct rb_node *prev, *next;
>  	unsigned int cstart, clen;
>  	unsigned int best_start = 0, best_len = 0;
> +	int create_new = 0;
>  
>  	/*
>  	 * Nasty cases to consider:
> @@ -540,8 +541,9 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  		if (clen) {
>  			best_len = clen;
>  			best_start = cstart;
> +			create_new = 1;
>  			if (best_len == wanted)
> -				goto out_insert;
> +				goto out_create;
>  		}
>  
>  		prev_resv = next_resv;
> @@ -557,13 +559,9 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  	while (1) {
>  		next = rb_next(prev);
>  		if (next) {
> -			next_resv = rb_entry(next,
> -					     struct ocfs2_alloc_reservation,
> -					     r_node);
> -
> -			gap_start = ocfs2_resv_end(prev_resv) + 1;
> -			gap_end = next_resv->r_start - 1;
> -			gap_len = gap_end - gap_start + 1;
> +			gap_start = prev_resv->r_start;
> +			gap_end = prev_resv->r_start + prev_resv->r_len - 1;
> +			gap_len = prev_resv->r_len;
>  		} else {
>  			/*
>  			 * We're at the rightmost edge of the
> @@ -575,8 +573,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  			gap_end = resmap->m_bitmap_len - 1;
>  		}
>  
> -		trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
> -					next ? ocfs2_resv_end(next_resv) : -1);
> +		trace_ocfs2_resv_find_window_next(next ? prev_resv->r_start : -1,
> +					next ? ocfs2_resv_end(prev_resv) : -1);
>  		/*
>  		 * No need to check this gap if we have already found
>  		 * a larger region of free bits.
> @@ -589,10 +587,16 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  		if (clen == wanted) {
>  			best_len = clen;
>  			best_start = cstart;
> -			goto out_insert;
> +			best_resv = prev_resv;
> +			if (!next)
> +				goto out_create;
> +			else
> +				goto out_insert;
>  		} else if (clen > best_len) {
>  			best_len = clen;
>  			best_start = cstart;
> +			best_resv = prev_resv;
> +			create_new = 0;
>  		}
>  
>  next_resv:
> @@ -604,12 +608,28 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>  				     r_node);
>  	}
>  
> -out_insert:
> -	if (best_len) {
> +	if (!best_len)
> +		return;
> +
> +	if (create_new) {
> +out_create:
>  		resv->r_start = best_start;
>  		resv->r_len = best_len;
>  		ocfs2_resv_insert(resmap, resv);
> +		return;
>  	}
> +
> +out_insert:
> +	if (best_resv->r_len <= wanted) {
> +		resv->r_start = best_start;
> +		resv->r_len = best_len;
> +		__ocfs2_resv_discard(resmap, best_resv);
> +	} else {
> +		best_resv->r_len -= best_len;
> +		resv->r_start = ocfs2_resv_end(best_resv) + 1;
> +		resv->r_len = best_len;
> +	}
> +	ocfs2_resv_insert(resmap, resv);
>  }
>  
>  static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
heming.zhao@suse.com April 21, 2023, 8:01 a.m. UTC | #4
On Fri, Apr 21, 2023 at 03:35:01PM +0800, Joseph Qi wrote:
> Hi,
> Could you please share a reproducer?
> 

Anyone could access & download the URL [1] (I wrote it in patch commit log)
without register SUSE account.

Please check attachment file, which I downloaded from [1] and modified under
the BZ comment 1. The trigger method is also in comment 1, I copy here:
./defragfs_test.sh -d /dev/vdf -m /mnt -n ocfstst -s pcmk -o /tmp -b 1024 -c 8192

note: you may modify '-n' (cluster name) & -s' (cluster stack).

[1]: https://bugzilla.suse.com/show_bug.cgi?id=1131931

Thanks,
Heming

> 
> On 4/6/23 11:40 PM, Heming Zhao wrote:
> > ** problem **
> > 
> > Current code triggers a defragfs bug [1]:
> > 
> > ```
> > tw-tst:~ # defragfs.ocfs2 /mnt/test_from_dd1
> > defragfs.ocfs2 1.8.7
> > [ERROR]Move extent failed:"/mnt/test_from_dd1" - No space left on device
> > [ERROR]"/mnt/test_from_dd1":No space left on device
> > ```
> > 
> > I added some debug messages in relevant functions. When above error
> > messages appeared, the la-window still had enough continuous bitmap
> > to use, the -ENOSPC is wrong.
> > 
> > ** analysis **
> > 
> > __ocfs2_resv_find_window should use resv node from rb-tree to do linear
> > search. But current code logic uses un-managered area between two resvs.
> > The searching logic deviates from this func job, which should only focus
> > on reservation areas (when rb_root is non NULL). Another issue of
> > __ocfs2_resv_find_window is more inclined to generate inner fragment.
> > It doesn't search & consume existing reservations but be apt to create
> > new reservation item.
> > 
> > This patch pulls this func (__ocfs2_resv_find_window) to do what it
> > should do: search & consume resev. if this func fails to get suitable
> > bitmap area, caller ocfs2_resv_find_window fallbacks to use oldest
> > resv from LRU to do the final search.
> > 
> > [1]: https://bugzilla.suse.com/show_bug.cgi?id=1131931
> > 
> > Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> > ---
> >  fs/ocfs2/reservations.c | 48 +++++++++++++++++++++++++++++------------
> >  1 file changed, 34 insertions(+), 14 deletions(-)
heming.zhao@suse.com April 21, 2023, 8:43 a.m. UTC | #5
On Fri, Apr 21, 2023 at 04:01:29PM +0800, Heming Zhao via Ocfs2-devel wrote:
> On Fri, Apr 21, 2023 at 03:35:01PM +0800, Joseph Qi wrote:
> > Hi,
> > Could you please share a reproducer?
> > 
> 
> Anyone could access & download the URL [1] (I wrote it in patch commit log)
> without register SUSE account.
> 
> Please check attachment file, which I downloaded from [1] and modified under
> the BZ comment 1. The trigger method is also in comment 1, I copy here:
> ./defragfs_test.sh -d /dev/vdf -m /mnt -n ocfstst -s pcmk -o /tmp -b 1024 -c 8192
> 
> note: you may modify '-n' (cluster name) & -s' (cluster stack).
> 
> [1]: https://bugzilla.suse.com/show_bug.cgi?id=1131931
> 

It looks the attachment file was removed by oracle mail server.
I copy & past it here:

```
#!/bin/sh
#
# mkfs_test -o <outdir> -d <device> -b <blocksize> -c <clustersize>
#
usage() {
    echo "usage: ${DEFRAGFS_TEST} -o <outdir> -d <device> -m <mountpoint> -b <blocksize> -c <clustersize> -s <cluster-stack> -n <cluster-name>"
    echo "       -o output directory for the logs"
    echo "       -d device"
    echo "       -m mountpoint"
    echo "       -b blocksize"
    echo "       -c clustersize"
    echo "       -s cluster stack"
    echo "       -n cluster name"
    exit 1
}

get_partsz() {
	if [ -h "${device}" ]; then
		device=`readlink -f "${device}"`
	fi

    dev=`echo ${device} | sed 's/\/dev\///'`
    num=`cat /proc/partitions | ${AWK} -v DEV=${dev} '
		BEGIN{dev=DEV} // {split($0, a); if (a[4] == dev) {printf("%u\n", $3); exit 0;} }'`
    if [ ${num} -eq 0 ]; then
        echo "error: unable to find size of device"  |tee -a ${LOGFILE}
        exit 1
    fi

    partsz=$[${num}*1024]
    return 0
}

do_fsck() {
    if [ "$#" -lt "1" ]; then
        echo "do_fsck(): <out>" |tee -a ${LOGFILE}
        exit 1
    fi
    out=$1
    echo ${FSCK} -fn ${device} >>${LOGFILE}
    echo -n "fsck ..... " |tee -a ${LOGFILE}
    echo ${FSCK} -fn ${device} >>${out}
    ${FSCK} -fn ${device} >>${out} 2>&1
    grep "All passes succeeded" ${LOGFILE} >/dev/null 2>&1
    grep "All passes succeeded" ${out} >/dev/null 2>&1
    if [ $? -ne 0 ] ; then
        echo "FAILED. Errors in ${out}" |tee -a ${LOGFILE}
        exit 1
    else
        echo "OK" |tee -a ${LOGFILE}
    fi
    echo "" >> ${out}
    return 0
}

do_mkfs() {
    if [ "$#" -lt "6" ] ; then
        echo "do_mkfs(): blocksize clustersize device volsize out cluster-stack cluster-name"  |tee -a ${LOGFILE}
        exit 1
    fi

    B=$1
    C=$2
    D=$3
    #V=$4
    O=$4
    S=$5
    N=$6

    echo ${MKFS} -b ${B} -C ${C} ${D} --cluster-stack=${S} --cluster-name=${N} >> ${LOGFILE}
    echo -n "mkfs ..... " |tee -a ${LOGFILE}
    echo ${MKFS} -b ${B} -C ${C} ${D} --cluster-stack=${S} --cluster-name=${N} >> ${O}
    ${MKFS} -x -F -b ${B} -C ${C} -N 1 -J size=4M ${D} --cluster-stack=${S} $numblks --cluster-name=${N} >> ${O} 2>&1
    echo "OK" |tee -a ${LOGFILE}
    echo "" >> ${O}
}

do_mount() {
	# mount the device on mntdir
	echo -n "mount ${device} ${mntdir} " |tee -a ${LOGFILE}
	${MOUNT} -t ocfs2 ${device} ${mntdir} 2>/dev/null
	if [ $? -ne 0 ]
	then
		echo -n "FAILED. Check dmesg for errors." 2>&1  |tee -a ${LOGFILE}
		exit 1
	else
		echo "OK"  |tee -a ${LOGFILE}
	fi
}


do_defragfs() {
	$DEFRAGFS $mntdir
}

do_genfrag_old() {
	# 应该使用一个新的方法来产生空洞文件,类似lseek的方法,但是仔细想过以后,发现lseek好像并不适用
	# 另一种方法就是直接切断inode的部分索引
    	if [ "$#" -lt "1" ] ; then
        	echo "do_genfrag(): blocksize"  |tee -a ${LOGFILE}
        	exit 1
    	fi
	in_dev="/dev/random"
	blk_sz=$1
        max_num=$[$partsz/($blk_sz*5000)]
        if [ "$max_num" -gt "100" ]
	then
		max_num=1
	fi
	for i in `seq 1 $max_num`
	do
		tmp_file=${mntdir%*/}/test_from_dd$i
		dd if=/dev/urandom of=$tmp_file bs=$blk_sz count=5000
		echo "dd if=/dev/urandom of=$tmp_file bs=$blk_sz count=5000"
		cd $mntdir
		split -b $blk_sz $tmp_file "zheshishenmedongxi"
		# exit 1
		rm -rf zheshishenmedongxi*x
		rm -rf $tmp_file
		cat zheshishenmedongxi* > $tmp_file 
		rm -rf zheshishenmedongxi*
		md5sum $tmp_file > $tmp_file"_md5"
		cd -
	done
	#echo "exit..."
	#exit 100
}

do_genfrag() {
    	if [ "$#" -lt "1" ] ; then
        	echo "do_genfrag(): blocksize"  |tee -a ${LOGFILE}
        	exit 1
    	fi
	in_dev="/dev/random"
	cluster_sz=$1
        max_num=$[$partsz/($cluster_sz*100)]
        if [ "$max_num" -gt "100" ]
	then
		max_num=100
	fi
	cd $mntdir
	for i in `seq 1 100` 
	do
		for j in `seq 1 $max_num`
		do
			tmp_file=${mntdir%*/}/test_from_dd$j
			dd if=/dev/urandom of=tmp_file bs=$cluster_sz count=1 > /dev/null 2>&1
			echo "dd if=/dev/urandom of=tmp_file bs=$cluster_sz count=1"
			cat tmp_file >> $tmp_file 
		done
	done
	for i in `seq 1 $max_num`
	do
		tmp_file=${mntdir%*/}/test_from_dd$i
		md5sum $tmp_file > $tmp_file"_md5"
	done
	cd -
	#echo "exit..."
	#exit 100
}

check_md5() {
	cd $mntdir
	md5sum --check *_md5
	cd -
	if [ $? -ne 0 ] 
        then
                echo "FAILED. after defragfs, the file md5 code not match." 2>&1  |tee -a ${LOGFILE}
                exit 1
        else
                echo "md5 check OK"  |tee -a ${LOGFILE}
        fi

}

do_umount() {
	# umount the volume
	echo -n "umount ${mntdir} "  |tee -a ${LOGFILE}
	${UMOUNT} ${mntdir} 2>/dev/null
	if [ $? -ne 0 ]
	then
		echo "FAILED. Check dmesg for errors." 2>&1  |tee -a ${LOGFILE}
		exit 1
	else
		echo "OK"  |tee -a ${LOGFILE}
	fi
}


DEFRAGFS="`which sudo` -u root `which defragfs.ocfs2`"
MKFS="`which sudo` -u root `which mkfs.ocfs2`"
FSCK="`which sudo` -u root `which fsck.ocfs2`"
DEBUGFS="`which sudo` -u root `which debugfs.ocfs2`"
MOUNT="`which sudo` -u root `which mount.ocfs2`"
UMOUNT="`which sudo` -u root `which umount`"
MKDIR="`which sudo` -u root `which mkdir`"
RM="`which sudo` -u root `which rm`"
GREP=`which grep`
DATE=`which date`
AWK=`which awk`
DD=`which dd`

DEFRAGFS_TEST=`basename $0`

bindir=`basename ${0}`
outdir=`basename ${bindir}`
device=
mntdir=
blocksize="NONE"
clustersize="NONE"
OPTIND=1

# TODO: add persent
while getopts "d:i:o:m:c:b:s:n:" args
do
  case "$args" in
    o) outdir="$OPTARG";;
    d) device="$OPTARG";;
    m) mntdir="$OPTARG";;
    b) blocksize="$OPTARG";;
    c) clustersize="$OPTARG";;
    s) cluster_stack="$OPTARG";;
    n) cluster_name="$OPTARG";;
  esac
done
LOGFILE=${outdir}/defragfs-test.log
if [ -f ${LOGFILE} ]; then
	mv ${LOGFILE} `dirname ${LOGFILE}`/`date +%F-%H-%M-%S`-`basename ${LOGFILE}`
fi;

if [ -z "${outdir}" ]; then
    echo "invalid output directory: ${outdir}"
    usage ;
fi

if [ ! -b "${device}" ]; then
    echo "invalid device: ${device}" #|tee -a ${LOGFILE}
    usage ;
fi

if [ -z "${mntdir}" ]; then
    echo "invalid mount point: ${mntdir}" #|tee -a ${LOGFILE}
    usage ;
fi

if [ -z "${cluster_stack}" ]; then
    echo "invalid cluster stack: ${cluster_stack}" #|tee -a ${LOGFILE}
    usage ;
fi

if [ -z "${cluster_name}" ]; then
    echo "invalid cluster name: ${cluster_name}" #|tee -a ${LOGFILE}
    usage ;
fi

echo "create logdir ${outdir}" #|tee -a ${LOGFILE}
mkdir -p ${outdir}

#get partition size
partsz=0
get_partsz


#numblks=10485760

testnum=1

if [ "$blocksize" != "NONE" ];then
    bslist="$blocksize"
else
    bslist="512 1024 2048 4096"
fi

if [ "$clustersize" != "NONE" ];then
    cslist="$clustersize"
else
    cslist="4096 8192 16384 32768 65536 131072 262144 524288 1048576"
fi

echo $bslist
echo $cslist
### Test all combinations of blocksizes and clustersizes
for blks in $(echo "$bslist")
do
    for clusts in $(echo "$cslist")
    do
        TAG=defragfs_test_${testnum}
        OUT=${outdir}/${TAG}.log
	if [ -f ${OUT} ]; then
		rm -f ${OUT};
	fi;

        echo "Test ${testnum}: -b ${blks} -C ${clusts}" #|tee -a ${LOGFILE}
        do_mkfs ${blks} ${clusts} ${device} ${OUT} ${cluster_stack} ${cluster_name}
        do_fsck ${OUT}
	do_mount
	do_genfrag ${clusts}
	#exit 100
	do_defragfs
	do_umount
    done
done
exit 0


### Test option '-T mail'
TAG=defragfs_test_${testnum}
OUT=${outdir}/${TAG}.log
if [ -f ${OUT} ]; then
	rm -f ${OUT};
fi;
echo "Test ${testnum}: -T mail" #|tee -a ${LOGFILE}
echo -n "mkfs ..... " #|tee -a ${LOGFILE}
${MKFS} -x -F -b 4K -C 4K -N 2 -T mail ${device} 262144 >${OUT} 2>&1
echo "OK" #|tee -a ${LOGFILE}
echo -n "verify ..... " #|tee -a ${LOGFILE}
${DEBUGFS} -R "ls -l //" ${device} >>${OUT} 2>&1
num=`${AWK} '/journal:0000/ {print $6;}' ${OUT}`
if [ $num -ne 134217728 ]; then
    echo "ERROR: Journal size too small for type mail" >> ${OUT}
    echo "" >> ${OUT}
    echo "FAILED. Errors in ${OUT}" #|tee -a ${LOGFILE}
else
    echo "OK" #|tee -a ${LOGFILE}
fi
do_fsck ${OUT}
```

Thanks
heming.zhao@suse.com April 29, 2023, 6:37 a.m. UTC | #6
ping...

On 4/21/23 3:35 PM, Joseph Qi wrote:
> Hi,
> Could you please share a reproducer?
> 
> Thanks,
> Joseph
> 
> On 4/6/23 11:40 PM, Heming Zhao wrote:
>> ** problem **
>>
>> Current code triggers a defragfs bug [1]:
>>
>> ```
>> tw-tst:~ # defragfs.ocfs2 /mnt/test_from_dd1
>> defragfs.ocfs2 1.8.7
>> [ERROR]Move extent failed:"/mnt/test_from_dd1" - No space left on device
>> [ERROR]"/mnt/test_from_dd1":No space left on device
>> ```
>>
>> I added some debug messages in relevant functions. When above error
>> messages appeared, the la-window still had enough continuous bitmap
>> to use, the -ENOSPC is wrong.
>>
>> ** analysis **
>>
>> __ocfs2_resv_find_window should use resv node from rb-tree to do linear
>> search. But current code logic uses un-managered area between two resvs.
>> The searching logic deviates from this func job, which should only focus
>> on reservation areas (when rb_root is non NULL). Another issue of
>> __ocfs2_resv_find_window is more inclined to generate inner fragment.
>> It doesn't search & consume existing reservations but be apt to create
>> new reservation item.
>>
>> This patch pulls this func (__ocfs2_resv_find_window) to do what it
>> should do: search & consume resev. if this func fails to get suitable
>> bitmap area, caller ocfs2_resv_find_window fallbacks to use oldest
>> resv from LRU to do the final search.
>>
>> [1]: https://bugzilla.suse.com/show_bug.cgi?id=1131931
>>
>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>> ---
>>   fs/ocfs2/reservations.c | 48 +++++++++++++++++++++++++++++------------
>>   1 file changed, 34 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
>> index a9d1296d736d..eda672622d1d 100644
>> --- a/fs/ocfs2/reservations.c
>> +++ b/fs/ocfs2/reservations.c
>> @@ -458,10 +458,11 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>   {
>>   	struct rb_root *root = &resmap->m_reservations;
>>   	unsigned int gap_start, gap_end, gap_len;
>> -	struct ocfs2_alloc_reservation *prev_resv, *next_resv;
>> +	struct ocfs2_alloc_reservation *prev_resv, *next_resv, *best_resv;
>>   	struct rb_node *prev, *next;
>>   	unsigned int cstart, clen;
>>   	unsigned int best_start = 0, best_len = 0;
>> +	int create_new = 0;
>>   
>>   	/*
>>   	 * Nasty cases to consider:
>> @@ -540,8 +541,9 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>   		if (clen) {
>>   			best_len = clen;
>>   			best_start = cstart;
>> +			create_new = 1;
>>   			if (best_len == wanted)
>> -				goto out_insert;
>> +				goto out_create;
>>   		}
>>   
>>   		prev_resv = next_resv;
>> @@ -557,13 +559,9 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>   	while (1) {
>>   		next = rb_next(prev);
>>   		if (next) {
>> -			next_resv = rb_entry(next,
>> -					     struct ocfs2_alloc_reservation,
>> -					     r_node);
>> -
>> -			gap_start = ocfs2_resv_end(prev_resv) + 1;
>> -			gap_end = next_resv->r_start - 1;
>> -			gap_len = gap_end - gap_start + 1;
>> +			gap_start = prev_resv->r_start;
>> +			gap_end = prev_resv->r_start + prev_resv->r_len - 1;
>> +			gap_len = prev_resv->r_len;
>>   		} else {
>>   			/*
>>   			 * We're at the rightmost edge of the
>> @@ -575,8 +573,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>   			gap_end = resmap->m_bitmap_len - 1;
>>   		}
>>   
>> -		trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
>> -					next ? ocfs2_resv_end(next_resv) : -1);
>> +		trace_ocfs2_resv_find_window_next(next ? prev_resv->r_start : -1,
>> +					next ? ocfs2_resv_end(prev_resv) : -1);
>>   		/*
>>   		 * No need to check this gap if we have already found
>>   		 * a larger region of free bits.
>> @@ -589,10 +587,16 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>   		if (clen == wanted) {
>>   			best_len = clen;
>>   			best_start = cstart;
>> -			goto out_insert;
>> +			best_resv = prev_resv;
>> +			if (!next)
>> +				goto out_create;
>> +			else
>> +				goto out_insert;
>>   		} else if (clen > best_len) {
>>   			best_len = clen;
>>   			best_start = cstart;
>> +			best_resv = prev_resv;
>> +			create_new = 0;
>>   		}
>>   
>>   next_resv:
>> @@ -604,12 +608,28 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>   				     r_node);
>>   	}
>>   
>> -out_insert:
>> -	if (best_len) {
>> +	if (!best_len)
>> +		return;
>> +
>> +	if (create_new) {
>> +out_create:
>>   		resv->r_start = best_start;
>>   		resv->r_len = best_len;
>>   		ocfs2_resv_insert(resmap, resv);
>> +		return;
>>   	}
>> +
>> +out_insert:
>> +	if (best_resv->r_len <= wanted) {
>> +		resv->r_start = best_start;
>> +		resv->r_len = best_len;
>> +		__ocfs2_resv_discard(resmap, best_resv);
>> +	} else {
>> +		best_resv->r_len -= best_len;
>> +		resv->r_start = ocfs2_resv_end(best_resv) + 1;
>> +		resv->r_len = best_len;
>> +	}
>> +	ocfs2_resv_insert(resmap, resv);
>>   }
>>   
>>   static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
heming.zhao@suse.com June 28, 2023, 3:09 a.m. UTC | #7
ping ...

On 4/29/23 2:37 PM, Heming Zhao via Ocfs2-devel wrote:
> ping...
> 
> On 4/21/23 3:35 PM, Joseph Qi wrote:
>> Hi,
>> Could you please share a reproducer?
>>
>> Thanks,
>> Joseph
>>
>> On 4/6/23 11:40 PM, Heming Zhao wrote:
>>> ** problem **
>>>
>>> Current code triggers a defragfs bug [1]:
>>>
>>> ```
>>> tw-tst:~ # defragfs.ocfs2 /mnt/test_from_dd1
>>> defragfs.ocfs2 1.8.7
>>> [ERROR]Move extent failed:"/mnt/test_from_dd1" - No space left on device
>>> [ERROR]"/mnt/test_from_dd1":No space left on device
>>> ```
>>>
>>> I added some debug messages in relevant functions. When above error
>>> messages appeared, the la-window still had enough continuous bitmap
>>> to use, the -ENOSPC is wrong.
>>>
>>> ** analysis **
>>>
>>> __ocfs2_resv_find_window should use resv node from rb-tree to do linear
>>> search. But current code logic uses un-managered area between two resvs.
>>> The searching logic deviates from this func job, which should only focus
>>> on reservation areas (when rb_root is non NULL). Another issue of
>>> __ocfs2_resv_find_window is more inclined to generate inner fragment.
>>> It doesn't search & consume existing reservations but be apt to create
>>> new reservation item.
>>>
>>> This patch pulls this func (__ocfs2_resv_find_window) to do what it
>>> should do: search & consume resev. if this func fails to get suitable
>>> bitmap area, caller ocfs2_resv_find_window fallbacks to use oldest
>>> resv from LRU to do the final search.
>>>
>>> [1]: https://bugzilla.suse.com/show_bug.cgi?id=1131931
>>>
>>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>>> ---
>>>   fs/ocfs2/reservations.c | 48 +++++++++++++++++++++++++++++------------
>>>   1 file changed, 34 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
>>> index a9d1296d736d..eda672622d1d 100644
>>> --- a/fs/ocfs2/reservations.c
>>> +++ b/fs/ocfs2/reservations.c
>>> @@ -458,10 +458,11 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>>   {
>>>       struct rb_root *root = &resmap->m_reservations;
>>>       unsigned int gap_start, gap_end, gap_len;
>>> -    struct ocfs2_alloc_reservation *prev_resv, *next_resv;
>>> +    struct ocfs2_alloc_reservation *prev_resv, *next_resv, *best_resv;
>>>       struct rb_node *prev, *next;
>>>       unsigned int cstart, clen;
>>>       unsigned int best_start = 0, best_len = 0;
>>> +    int create_new = 0;
>>>       /*
>>>        * Nasty cases to consider:
>>> @@ -540,8 +541,9 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>>           if (clen) {
>>>               best_len = clen;
>>>               best_start = cstart;
>>> +            create_new = 1;
>>>               if (best_len == wanted)
>>> -                goto out_insert;
>>> +                goto out_create;
>>>           }
>>>           prev_resv = next_resv;
>>> @@ -557,13 +559,9 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>>       while (1) {
>>>           next = rb_next(prev);
>>>           if (next) {
>>> -            next_resv = rb_entry(next,
>>> -                         struct ocfs2_alloc_reservation,
>>> -                         r_node);
>>> -
>>> -            gap_start = ocfs2_resv_end(prev_resv) + 1;
>>> -            gap_end = next_resv->r_start - 1;
>>> -            gap_len = gap_end - gap_start + 1;
>>> +            gap_start = prev_resv->r_start;
>>> +            gap_end = prev_resv->r_start + prev_resv->r_len - 1;
>>> +            gap_len = prev_resv->r_len;
>>>           } else {
>>>               /*
>>>                * We're at the rightmost edge of the
>>> @@ -575,8 +573,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>>               gap_end = resmap->m_bitmap_len - 1;
>>>           }
>>> -        trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
>>> -                    next ? ocfs2_resv_end(next_resv) : -1);
>>> +        trace_ocfs2_resv_find_window_next(next ? prev_resv->r_start : -1,
>>> +                    next ? ocfs2_resv_end(prev_resv) : -1);
>>>           /*
>>>            * No need to check this gap if we have already found
>>>            * a larger region of free bits.
>>> @@ -589,10 +587,16 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>>           if (clen == wanted) {
>>>               best_len = clen;
>>>               best_start = cstart;
>>> -            goto out_insert;
>>> +            best_resv = prev_resv;
>>> +            if (!next)
>>> +                goto out_create;
>>> +            else
>>> +                goto out_insert;
>>>           } else if (clen > best_len) {
>>>               best_len = clen;
>>>               best_start = cstart;
>>> +            best_resv = prev_resv;
>>> +            create_new = 0;
>>>           }
>>>   next_resv:
>>> @@ -604,12 +608,28 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
>>>                        r_node);
>>>       }
>>> -out_insert:
>>> -    if (best_len) {
>>> +    if (!best_len)
>>> +        return;
>>> +
>>> +    if (create_new) {
>>> +out_create:
>>>           resv->r_start = best_start;
>>>           resv->r_len = best_len;
>>>           ocfs2_resv_insert(resmap, resv);
>>> +        return;
>>>       }
>>> +
>>> +out_insert:
>>> +    if (best_resv->r_len <= wanted) {
>>> +        resv->r_start = best_start;
>>> +        resv->r_len = best_len;
>>> +        __ocfs2_resv_discard(resmap, best_resv);
>>> +    } else {
>>> +        best_resv->r_len -= best_len;
>>> +        resv->r_start = ocfs2_resv_end(best_resv) + 1;
>>> +        resv->r_len = best_len;
>>> +    }
>>> +    ocfs2_resv_insert(resmap, resv);
>>>   }
>>>   static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
> 
> 
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
diff mbox series

Patch

diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index a9d1296d736d..eda672622d1d 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -458,10 +458,11 @@  static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 {
 	struct rb_root *root = &resmap->m_reservations;
 	unsigned int gap_start, gap_end, gap_len;
-	struct ocfs2_alloc_reservation *prev_resv, *next_resv;
+	struct ocfs2_alloc_reservation *prev_resv, *next_resv, *best_resv;
 	struct rb_node *prev, *next;
 	unsigned int cstart, clen;
 	unsigned int best_start = 0, best_len = 0;
+	int create_new = 0;
 
 	/*
 	 * Nasty cases to consider:
@@ -540,8 +541,9 @@  static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 		if (clen) {
 			best_len = clen;
 			best_start = cstart;
+			create_new = 1;
 			if (best_len == wanted)
-				goto out_insert;
+				goto out_create;
 		}
 
 		prev_resv = next_resv;
@@ -557,13 +559,9 @@  static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 	while (1) {
 		next = rb_next(prev);
 		if (next) {
-			next_resv = rb_entry(next,
-					     struct ocfs2_alloc_reservation,
-					     r_node);
-
-			gap_start = ocfs2_resv_end(prev_resv) + 1;
-			gap_end = next_resv->r_start - 1;
-			gap_len = gap_end - gap_start + 1;
+			gap_start = prev_resv->r_start;
+			gap_end = prev_resv->r_start + prev_resv->r_len - 1;
+			gap_len = prev_resv->r_len;
 		} else {
 			/*
 			 * We're at the rightmost edge of the
@@ -575,8 +573,8 @@  static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 			gap_end = resmap->m_bitmap_len - 1;
 		}
 
-		trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
-					next ? ocfs2_resv_end(next_resv) : -1);
+		trace_ocfs2_resv_find_window_next(next ? prev_resv->r_start : -1,
+					next ? ocfs2_resv_end(prev_resv) : -1);
 		/*
 		 * No need to check this gap if we have already found
 		 * a larger region of free bits.
@@ -589,10 +587,16 @@  static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 		if (clen == wanted) {
 			best_len = clen;
 			best_start = cstart;
-			goto out_insert;
+			best_resv = prev_resv;
+			if (!next)
+				goto out_create;
+			else
+				goto out_insert;
 		} else if (clen > best_len) {
 			best_len = clen;
 			best_start = cstart;
+			best_resv = prev_resv;
+			create_new = 0;
 		}
 
 next_resv:
@@ -604,12 +608,28 @@  static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 				     r_node);
 	}
 
-out_insert:
-	if (best_len) {
+	if (!best_len)
+		return;
+
+	if (create_new) {
+out_create:
 		resv->r_start = best_start;
 		resv->r_len = best_len;
 		ocfs2_resv_insert(resmap, resv);
+		return;
 	}
+
+out_insert:
+	if (best_resv->r_len <= wanted) {
+		resv->r_start = best_start;
+		resv->r_len = best_len;
+		__ocfs2_resv_discard(resmap, best_resv);
+	} else {
+		best_resv->r_len -= best_len;
+		resv->r_start = ocfs2_resv_end(best_resv) + 1;
+		resv->r_len = best_len;
+	}
+	ocfs2_resv_insert(resmap, resv);
 }
 
 static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,