diff mbox series

[RFC,v3,11/21] xfs: Unmap blocks according to forcealign

Message ID 20240429174746.2132161-12-john.g.garry@oracle.com (mailing list archive)
State New, archived
Headers show
Series block atomic writes for XFS | expand

Commit Message

John Garry April 29, 2024, 5:47 p.m. UTC
For when forcealign is enabled, blocks in an inode need to be unmapped
according to extent alignment, like what is already done for rtvol.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 39 +++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_inode.h       |  5 +++++
 2 files changed, 38 insertions(+), 6 deletions(-)

Comments

Dave Chinner May 1, 2024, 12:10 a.m. UTC | #1
On Mon, Apr 29, 2024 at 05:47:36PM +0000, John Garry wrote:
> For when forcealign is enabled, blocks in an inode need to be unmapped
> according to extent alignment, like what is already done for rtvol.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/libxfs/xfs_bmap.c | 39 +++++++++++++++++++++++++++++++++------
>  fs/xfs/xfs_inode.h       |  5 +++++
>  2 files changed, 38 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 4f39a43d78a7..4a78ab193753 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -5339,6 +5339,15 @@ xfs_bmap_del_extent_real(
>  	return 0;
>  }
>  
> +/* Return the offset of an block number within an extent for forcealign. */
> +static xfs_extlen_t
> +xfs_forcealign_extent_offset(
> +	struct xfs_inode	*ip,
> +	xfs_fsblock_t		bno)
> +{
> +	return bno & (ip->i_extsize - 1);
> +}
> +
>  /*
>   * Unmap (remove) blocks from a file.
>   * If nexts is nonzero then the number of extents to remove is limited to
> @@ -5361,6 +5370,7 @@ __xfs_bunmapi(
>  	struct xfs_bmbt_irec	got;		/* current extent record */
>  	struct xfs_ifork	*ifp;		/* inode fork pointer */
>  	int			isrt;		/* freeing in rt area */
> +	int			isforcealign;	/* freeing for file inode with forcealign */
>  	int			logflags;	/* transaction logging flags */
>  	xfs_extlen_t		mod;		/* rt extent offset */
>  	struct xfs_mount	*mp = ip->i_mount;
> @@ -5397,7 +5407,10 @@ __xfs_bunmapi(
>  		return 0;
>  	}
>  	XFS_STATS_INC(mp, xs_blk_unmap);
> -	isrt = xfs_ifork_is_realtime(ip, whichfork);
> +	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);

Why did you change this check? What's wrong with
xfs_ifork_is_realtime(), and if there is something wrong, why
shouldn't xfs_ifork_is_relatime() get fixed?

> +	isforcealign = (whichfork == XFS_DATA_FORK) &&
> +			xfs_inode_has_forcealign(ip) &&
> +			xfs_inode_has_extsize(ip) && ip->i_extsize > 1;

This is one of the reasons why I said xfs_inode_has_forcealign()
should be checking that extent size hints should be checked in that
helper....

>  	end = start + len;
>  
>  	if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
> @@ -5459,11 +5472,15 @@ __xfs_bunmapi(
>  		if (del.br_startoff + del.br_blockcount > end + 1)
>  			del.br_blockcount = end + 1 - del.br_startoff;
>  
> -		if (!isrt || (flags & XFS_BMAPI_REMAP))
> +		if ((!isrt && !isforcealign) || (flags & XFS_BMAPI_REMAP))
>  			goto delete;
>  
> -		mod = xfs_rtb_to_rtxoff(mp,
> -				del.br_startblock + del.br_blockcount);
> +		if (isrt)
> +			mod = xfs_rtb_to_rtxoff(mp,
> +					del.br_startblock + del.br_blockcount);
> +		else if (isforcealign)
> +			mod = xfs_forcealign_extent_offset(ip,
> +					del.br_startblock + del.br_blockcount);

There's got to be a cleaner way to do this.

We already know that either isrt or isforcealign must be set here,
so there's no need for the "else if" construct.

Also, forcealign should take precedence over realtime, so that
forcealign will work on realtime devices as well. I'd change this
code to call a wrapper like:

		mod = xfs_bunmapi_align(ip, del.br_startblock + del.br_blockcount);

static xfs_extlen_t
xfs_bunmapi_align(
	struct xfs_inode	*ip,
	xfs_fsblock_t		bno)
{
	if (!XFS_INODE_IS_REALTIME(ip)) {
		ASSERT(xfs_inode_has_forcealign(ip))
		if (is_power_of_2(ip->i_extsize))
			return bno & (ip->i_extsize - 1);
		return do_div(bno, ip->i_extsize);
	}
	return xfs_rtb_to_rtxoff(ip->i_mount, bno);
}



>  		if (mod) {
>  			/*
>  			 * Realtime extent not lined up at the end.
> @@ -5511,9 +5528,19 @@ __xfs_bunmapi(
>  			goto nodelete;
>  		}
>  
> -		mod = xfs_rtb_to_rtxoff(mp, del.br_startblock);
> +		if (isrt)
> +			mod = xfs_rtb_to_rtxoff(mp, del.br_startblock);
> +		else if (isforcealign)
> +			mod = xfs_forcealign_extent_offset(ip,
> +					del.br_startblock);
> +
		mod = xfs_bunmapi_align(ip, del.br_startblock);

>  		if (mod) {
> -			xfs_extlen_t off = mp->m_sb.sb_rextsize - mod;
> +			xfs_extlen_t off;
> +
> +			if (isrt)
> +				off = mp->m_sb.sb_rextsize - mod;
> +			else if (isforcealign)
> +				off = ip->i_extsize - mod;

			if (forcealign)
				off = ip->i_extsize - mod;
			else
				off = mp->m_sb.sb_rextsize - mod;

-Dave.
John Garry May 1, 2024, 10:54 a.m. UTC | #2
On 01/05/2024 01:10, Dave Chinner wrote:
> On Mon, Apr 29, 2024 at 05:47:36PM +0000, John Garry wrote:
>> For when forcealign is enabled, blocks in an inode need to be unmapped
>> according to extent alignment, like what is already done for rtvol.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/libxfs/xfs_bmap.c | 39 +++++++++++++++++++++++++++++++++------
>>   fs/xfs/xfs_inode.h       |  5 +++++
>>   2 files changed, 38 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
>> index 4f39a43d78a7..4a78ab193753 100644
>> --- a/fs/xfs/libxfs/xfs_bmap.c
>> +++ b/fs/xfs/libxfs/xfs_bmap.c
>> @@ -5339,6 +5339,15 @@ xfs_bmap_del_extent_real(
>>   	return 0;
>>   }
>>   
>> +/* Return the offset of an block number within an extent for forcealign. */
>> +static xfs_extlen_t
>> +xfs_forcealign_extent_offset(
>> +	struct xfs_inode	*ip,
>> +	xfs_fsblock_t		bno)
>> +{
>> +	return bno & (ip->i_extsize - 1);
>> +}
>> +
>>   /*
>>    * Unmap (remove) blocks from a file.
>>    * If nexts is nonzero then the number of extents to remove is limited to
>> @@ -5361,6 +5370,7 @@ __xfs_bunmapi(
>>   	struct xfs_bmbt_irec	got;		/* current extent record */
>>   	struct xfs_ifork	*ifp;		/* inode fork pointer */
>>   	int			isrt;		/* freeing in rt area */
>> +	int			isforcealign;	/* freeing for file inode with forcealign */
>>   	int			logflags;	/* transaction logging flags */
>>   	xfs_extlen_t		mod;		/* rt extent offset */
>>   	struct xfs_mount	*mp = ip->i_mount;
>> @@ -5397,7 +5407,10 @@ __xfs_bunmapi(
>>   		return 0;
>>   	}
>>   	XFS_STATS_INC(mp, xs_blk_unmap);
>> -	isrt = xfs_ifork_is_realtime(ip, whichfork);
>> +	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
> 
> Why did you change this check? What's wrong with
> xfs_ifork_is_realtime(), and if there is something wrong, why
> shouldn't xfs_ifork_is_relatime() get fixed?

oops, I should have not made that change. I must have changed it when 
debugging and not reverted it.

> 
>> +	isforcealign = (whichfork == XFS_DATA_FORK) &&
>> +			xfs_inode_has_forcealign(ip) &&
>> +			xfs_inode_has_extsize(ip) && ip->i_extsize > 1;
> 
> This is one of the reasons why I said xfs_inode_has_forcealign()
> should be checking that extent size hints should be checked in that
> helper....

Right. In this particular case, I found that directories may be 
considered as well if we don't check for xfs_inode_has_extsize() (which 
we don't want).

> 
>>   	end = start + len;
>>   
>>   	if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
>> @@ -5459,11 +5472,15 @@ __xfs_bunmapi(
>>   		if (del.br_startoff + del.br_blockcount > end + 1)
>>   			del.br_blockcount = end + 1 - del.br_startoff;
>>   
>> -		if (!isrt || (flags & XFS_BMAPI_REMAP))
>> +		if ((!isrt && !isforcealign) || (flags & XFS_BMAPI_REMAP))
>>   			goto delete;
>>   
>> -		mod = xfs_rtb_to_rtxoff(mp,
>> -				del.br_startblock + del.br_blockcount);
>> +		if (isrt)
>> +			mod = xfs_rtb_to_rtxoff(mp,
>> +					del.br_startblock + del.br_blockcount);
>> +		else if (isforcealign)
>> +			mod = xfs_forcealign_extent_offset(ip,
>> +					del.br_startblock + del.br_blockcount);
> 
> There's got to be a cleaner way to do this.
> 
> We already know that either isrt or isforcealign must be set here,
> so there's no need for the "else if" construct.

right

> 
> Also, forcealign should take precedence over realtime, so that
> forcealign will work on realtime devices as well. I'd change this
> code to call a wrapper like:
> 
> 		mod = xfs_bunmapi_align(ip, del.br_startblock + del.br_blockcount);
> 
> static xfs_extlen_t
> xfs_bunmapi_align(
> 	struct xfs_inode	*ip,
> 	xfs_fsblock_t		bno)
> {
> 	if (!XFS_INODE_IS_REALTIME(ip)) {
> 		ASSERT(xfs_inode_has_forcealign(ip))
> 		if (is_power_of_2(ip->i_extsize))
> 			return bno & (ip->i_extsize - 1);
> 		return do_div(bno, ip->i_extsize);
> 	}
> 	return xfs_rtb_to_rtxoff(ip->i_mount, bno);
> }

ok, that's neater

Thanks,
John
John Garry June 6, 2024, 9:50 a.m. UTC | #3
Hi Dave,

>>   
>>   	if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
>> @@ -5459,11 +5472,15 @@ __xfs_bunmapi(
>>   		if (del.br_startoff + del.br_blockcount > end + 1)
>>   			del.br_blockcount = end + 1 - del.br_startoff;
>>   
>> -		if (!isrt || (flags & XFS_BMAPI_REMAP))
>> +		if ((!isrt && !isforcealign) || (flags & XFS_BMAPI_REMAP))
>>   			goto delete;
>>   
>> -		mod = xfs_rtb_to_rtxoff(mp,
>> -				del.br_startblock + del.br_blockcount);
>> +		if (isrt)
>> +			mod = xfs_rtb_to_rtxoff(mp,
>> +					del.br_startblock + del.br_blockcount);
>> +		else if (isforcealign)
>> +			mod = xfs_forcealign_extent_offset(ip,
>> +					del.br_startblock + del.br_blockcount);
> There's got to be a cleaner way to do this.
> 
> We already know that either isrt or isforcealign must be set here,
> so there's no need for the "else if" construct.
> 
> Also, forcealign should take precedence over realtime, so that
> forcealign will work on realtime devices as well. I'd change this
> code to call a wrapper like:
> 
> 		mod = xfs_bunmapi_align(ip, del.br_startblock + del.br_blockcount);
> 
> static xfs_extlen_t
> xfs_bunmapi_align(
> 	struct xfs_inode	*ip,
> 	xfs_fsblock_t		bno)
> {
> 	if (!XFS_INODE_IS_REALTIME(ip)) {
> 		ASSERT(xfs_inode_has_forcealign(ip))
> 		if (is_power_of_2(ip->i_extsize))
> 			return bno & (ip->i_extsize - 1);
> 		return do_div(bno, ip->i_extsize);
> 	}
> 	return xfs_rtb_to_rtxoff(ip->i_mount, bno);
> }

I have made that change according to your suggestion.

However, now that the forcealign power-of-2 extsize restriction has been 
lifted, I am finding another bug.

I will just mention this now, as I want to go back to that other issue 
https://lore.kernel.org/linux-xfs/20240429174746.2132161-1-john.g.garry@oracle.com/T/#mebd7e97dfd0f12219bf92f289c41f62bf2abcff5

However this new one is pretty simple to reproduce.

We create a file:

ext:  logical_offset:        physical_offset: length:   expected: flags:
0:    0..    1775:      40200..     41975:   1776:            last,eof
/root/mnt2/file_22: 1 extent found

The forcealign extsize is 98304, i.e. 24 blks.

And then try to delete it, and get this:

[   17.604237] XFS: Assertion failed: tp->t_blk_res > 0, file: 
fs/xfs/libxfs/xfs_bmap.c, line: 5599
[   17.605908] ------------[ cut here ]------------
[   17.606884] kernel BUG at fs/xfs/xfs_message.c:102!
[   17.607917] Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[   17.609134] CPU: 3 PID: 240 Comm: kworker/3:2 Not tainted 
6.10.0-rc1-00096-g759a4497daa7-dirty #2553
[   17.610606] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
[   17.612619] Workqueue: xfs-inodegc/sda xfs_inodegc_worker
[   17.613682] RIP: 0010:assfail+0x36/0x40
[   17.614134] Code: c2 18 1e 1d 9d 48 89 f1 48 89 fe 48 c7 c7 43 f2 12 
9d e8 7d fd ff ff 80 3d ae 86 8d 01 00 75 09 90 0f 0b 90 c3 cc cc cc cc 
90 <0f> 0b 0f 1f 84 00 00 00 00 00 90 90 90 90 900
[   17.616478] RSP: 0018:ff4887cac0973c28 EFLAGS: 00010202
[   17.617080] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 
000000007fffffff
[   17.617899] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 
ffffffff9d12f243
[   17.618717] RBP: ff4887cac0973d40 R08: 0000000000000000 R09: 
000000000000000a
[   17.619548] R10: 000000000000000a R11: f000000000000000 R12: 
ff360ff881d88000[   17.620360] R13: 0000000000000000 R14: 
ff360ff8efa32040 R15: 000ffffffffe0000
[   17.620367] FS:  0000000000000000(0000) GS:ff360ffa75cc0000(0000) 
knlGS:0000000000000000
[   17.620369] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   17.620371] CR2: 00007f213b4ef008 CR3: 000000011cfca003 CR4: 
0000000000771ef0
[   17.620372] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[   17.620374] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400
[   17.620375] PKRU: 55555554
[   17.620376] Call Trace:
[   17.620384]  <TASK>
[   17.624278]  ? die+0x32/0x90
[   17.624284]  ? do_trap+0xd8/0x100
[   17.624286]  ? assfail+0x36/0x40
[   17.624288]  ? do_error_trap+0x60/0x80
[   17.624289]  ? assfail+0x36/0x40
[   17.624292]  ? exc_invalid_op+0x53/0x70
[   17.628287]  ? assfail+0x36/0x40
[   17.628291]  ? asm_exc_invalid_op+0x1a/0x20
[   17.628295]  ? assfail+0x36/0x40
[   17.628297]  ? assfail+0x23/0x40
[   17.628299]  __xfs_bunmapi+0xb87/0xeb0
[   17.628304]  ? xfs_log_reserve+0x18f/0x210
[   17.629447]  xfs_bunmapi_range+0x62/0xd0
[   17.631699]  xfs_itruncate_extents_flags+0x1c4/0x410
[   17.631703]  xfs_inactive_truncate+0xba/0x140
[   17.631705]  xfs_inactive+0x331/0x3d0
[   17.631707]  xfs_inodegc_worker+0xb8/0x190
[   17.631709]  process_one_work+0x157/0x380
[   17.633949]  worker_thread+0x2ba/0x3e0
[   17.633953]  ? __pfx_worker_thread+0x10/0x10
[   17.633954]  kthread+0xce/0x100
[   17.633958]  ? __pfx_kthread+0x10/0x10
[   17.633960]  ret_from_fork+0x2c/0x50
[   17.633963]  ? __pfx_kthread+0x10/0x10
[   17.633965]  ret_from_fork_asm+0x1a/0x30
[   17.636314]  </TASK>
[   17.640377] Modules linked in:
[   17.642375] ---[ end trace 0000000000000000 ]---

Maybe something is going wrong with the AG bno vs fsbno indexing.

That extent allocated has fsbno=50552 (% 24 != 0). The agsize is 22416 fsb.

That 50552 comes from xfs_alloc_vextent_finish() with args->fsbno=50552 
= XFS_AGB_TO_FSB(mp, agno=1, agbno=17784) = 32K 
(=roundup_power_of_2(22416)) + 17784

So the agbno is aligned, but the fsbno is not.

In __xfs_bunmapi(), at this point:

	mod = xfs_bunmapi_align(ip, del.br_startblock);

	if (mod) {
		xfs_extlen_t off;

		if (isforcealign) {
			off = ip->i_extsize - mod;
		} else {
			ASSERT(isrt);
			off = mp->m_sb.sb_rextsize - mod;
		}

		/*
		 * Realtime extent is lined up at the end but not
		 * at the front.  We'll get rid of full extents if
		 * we can.
		 */

mod=8 del.br_startblock(48776) + del.br_blockcount(1776)=50552

Since this code was originally only used for rt, we may be missing 
setting some struct members which were being set for rt. For example, 
xfs_trans_alloc() accepts rtextents value, and maybe we should be doing 
similar for forcealign. Or xfs_fsb_to_db() has special RT handling, but 
I doubt this is the problem.

I have no such issue in using a power-of-2 extsize.

I do realise that I need to share the full code, but I'm reluctant to 
post with known bugs.

Please let me know if you have an idea. I'll look at this further when I 
get a chance.

Thanks,
John
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4f39a43d78a7..4a78ab193753 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5339,6 +5339,15 @@  xfs_bmap_del_extent_real(
 	return 0;
 }
 
+/* Return the offset of an block number within an extent for forcealign. */
+static xfs_extlen_t
+xfs_forcealign_extent_offset(
+	struct xfs_inode	*ip,
+	xfs_fsblock_t		bno)
+{
+	return bno & (ip->i_extsize - 1);
+}
+
 /*
  * Unmap (remove) blocks from a file.
  * If nexts is nonzero then the number of extents to remove is limited to
@@ -5361,6 +5370,7 @@  __xfs_bunmapi(
 	struct xfs_bmbt_irec	got;		/* current extent record */
 	struct xfs_ifork	*ifp;		/* inode fork pointer */
 	int			isrt;		/* freeing in rt area */
+	int			isforcealign;	/* freeing for file inode with forcealign */
 	int			logflags;	/* transaction logging flags */
 	xfs_extlen_t		mod;		/* rt extent offset */
 	struct xfs_mount	*mp = ip->i_mount;
@@ -5397,7 +5407,10 @@  __xfs_bunmapi(
 		return 0;
 	}
 	XFS_STATS_INC(mp, xs_blk_unmap);
-	isrt = xfs_ifork_is_realtime(ip, whichfork);
+	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+	isforcealign = (whichfork == XFS_DATA_FORK) &&
+			xfs_inode_has_forcealign(ip) &&
+			xfs_inode_has_extsize(ip) && ip->i_extsize > 1;
 	end = start + len;
 
 	if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
@@ -5459,11 +5472,15 @@  __xfs_bunmapi(
 		if (del.br_startoff + del.br_blockcount > end + 1)
 			del.br_blockcount = end + 1 - del.br_startoff;
 
-		if (!isrt || (flags & XFS_BMAPI_REMAP))
+		if ((!isrt && !isforcealign) || (flags & XFS_BMAPI_REMAP))
 			goto delete;
 
-		mod = xfs_rtb_to_rtxoff(mp,
-				del.br_startblock + del.br_blockcount);
+		if (isrt)
+			mod = xfs_rtb_to_rtxoff(mp,
+					del.br_startblock + del.br_blockcount);
+		else if (isforcealign)
+			mod = xfs_forcealign_extent_offset(ip,
+					del.br_startblock + del.br_blockcount);
 		if (mod) {
 			/*
 			 * Realtime extent not lined up at the end.
@@ -5511,9 +5528,19 @@  __xfs_bunmapi(
 			goto nodelete;
 		}
 
-		mod = xfs_rtb_to_rtxoff(mp, del.br_startblock);
+		if (isrt)
+			mod = xfs_rtb_to_rtxoff(mp, del.br_startblock);
+		else if (isforcealign)
+			mod = xfs_forcealign_extent_offset(ip,
+					del.br_startblock);
+
 		if (mod) {
-			xfs_extlen_t off = mp->m_sb.sb_rextsize - mod;
+			xfs_extlen_t off;
+
+			if (isrt)
+				off = mp->m_sb.sb_rextsize - mod;
+			else if (isforcealign)
+				off = ip->i_extsize - mod;
 
 			/*
 			 * Realtime extent is lined up at the end but not
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 065028789473..3f13943ab3a3 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -316,6 +316,11 @@  static inline bool xfs_inode_has_forcealign(struct xfs_inode *ip)
 	return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN;
 }
 
+static inline bool xfs_inode_has_extsize(struct xfs_inode *ip)
+{
+	return ip->i_diflags & XFS_DIFLAG_EXTSIZE;
+}
+
 /*
  * Return the buftarg used for data allocations on a given inode.
  */