diff mbox series

xfs: fix hung when transaction commit fail in xfs_inactive_ifree

Message ID 20221209110519.GA3741914@ceph-admin (mailing list archive)
State Superseded, archived
Headers show
Series xfs: fix hung when transaction commit fail in xfs_inactive_ifree | expand

Commit Message

Long Li Dec. 9, 2022, 11:05 a.m. UTC
After running unplug disk test and unmount filesystem, the umount thread
hung all the time.

 crash> dmesg
 sd 0:0:0:0: rejecting I/O to offline device
 XFS (sda): log I/O error -5
 XFS (sda): Corruption of in-memory data (0x8) detected at xfs_defer_finish_noroll+0x12e0/0x1cf0
	(fs/xfs/libxfs/xfs_defer.c:504).  Shutting down filesystem.
 XFS (sda): Please unmount the filesystem and rectify the problem(s)
 XFS (sda): xfs_inactive_ifree: xfs_trans_commit returned error -5
 XFS (sda): Unmounting Filesystem

 crash> bt 3368
 PID: 3368   TASK: ffff88801bcd8040  CPU: 3   COMMAND: "umount"
  #0 [ffffc900086a7ae0] __schedule at ffffffff83d3fd25
  #1 [ffffc900086a7be8] schedule at ffffffff83d414dd
  #2 [ffffc900086a7c10] xfs_ail_push_all_sync at ffffffff8256db24
  #3 [ffffc900086a7d18] xfs_unmount_flush_inodes at ffffffff824ee7e2
  #4 [ffffc900086a7d28] xfs_unmountfs at ffffffff824f2eff
  #5 [ffffc900086a7da8] xfs_fs_put_super at ffffffff82503e69
  #6 [ffffc900086a7de8] generic_shutdown_super at ffffffff81aeb8cd
  #7 [ffffc900086a7e10] kill_block_super at ffffffff81aefcfa
  #8 [ffffc900086a7e30] deactivate_locked_super at ffffffff81aeb2da
  #9 [ffffc900086a7e48] deactivate_super at ffffffff81aeb639
 #10 [ffffc900086a7e68] cleanup_mnt at ffffffff81b6ddd5
 #11 [ffffc900086a7ea0] __cleanup_mnt at ffffffff81b6dfdf
 #12 [ffffc900086a7eb0] task_work_run at ffffffff8126e5cf
 #13 [ffffc900086a7ef8] exit_to_user_mode_prepare at ffffffff813fa136
 #14 [ffffc900086a7f28] syscall_exit_to_user_mode at ffffffff83d25dbb
 #15 [ffffc900086a7f40] do_syscall_64 at ffffffff83d1f8d9
 #16 [ffffc900086a7f50] entry_SYSCALL_64_after_hwframe at ffffffff83e00085

When we free a cluster buffer from xfs_ifree_cluster, all the inodes in
cache are marked XFS_ISTALE. On journal commit dirty stale inodes as are
handled by both buffer and inode log items, inodes marked as XFS_ISTALE
in AIL will be removed from the AIL because the buffer log item will clean
it. If the transaction commit fails in the xfs_inactive_ifree(), inodes
marked as XFS_ISTALE will be left in AIL due to buf log item is not
committed, this will cause the unmount thread above to be blocked all the
time. Error handling in xfs_inactive_ifree() is not enough, the above
exception needs to be considered.

Signed-off-by: Long Li <leo.lilong@huawei.com>
---
 fs/xfs/xfs_inode.c | 114 +++++++++++++++++++++++++++++++++++++++++----
 fs/xfs/xfs_inode.h |   1 -
 2 files changed, 105 insertions(+), 10 deletions(-)

Comments

Darrick J. Wong Feb. 1, 2023, 3:21 a.m. UTC | #1
On Fri, Dec 09, 2022 at 07:05:19PM +0800, Long Li wrote:
> After running unplug disk test and unmount filesystem, the umount thread
> hung all the time.
> 
>  crash> dmesg
>  sd 0:0:0:0: rejecting I/O to offline device
>  XFS (sda): log I/O error -5
>  XFS (sda): Corruption of in-memory data (0x8) detected at xfs_defer_finish_noroll+0x12e0/0x1cf0
> 	(fs/xfs/libxfs/xfs_defer.c:504).  Shutting down filesystem.
>  XFS (sda): Please unmount the filesystem and rectify the problem(s)
>  XFS (sda): xfs_inactive_ifree: xfs_trans_commit returned error -5
>  XFS (sda): Unmounting Filesystem
> 
>  crash> bt 3368
>  PID: 3368   TASK: ffff88801bcd8040  CPU: 3   COMMAND: "umount"
>   #0 [ffffc900086a7ae0] __schedule at ffffffff83d3fd25
>   #1 [ffffc900086a7be8] schedule at ffffffff83d414dd
>   #2 [ffffc900086a7c10] xfs_ail_push_all_sync at ffffffff8256db24
>   #3 [ffffc900086a7d18] xfs_unmount_flush_inodes at ffffffff824ee7e2
>   #4 [ffffc900086a7d28] xfs_unmountfs at ffffffff824f2eff
>   #5 [ffffc900086a7da8] xfs_fs_put_super at ffffffff82503e69
>   #6 [ffffc900086a7de8] generic_shutdown_super at ffffffff81aeb8cd
>   #7 [ffffc900086a7e10] kill_block_super at ffffffff81aefcfa
>   #8 [ffffc900086a7e30] deactivate_locked_super at ffffffff81aeb2da
>   #9 [ffffc900086a7e48] deactivate_super at ffffffff81aeb639
>  #10 [ffffc900086a7e68] cleanup_mnt at ffffffff81b6ddd5
>  #11 [ffffc900086a7ea0] __cleanup_mnt at ffffffff81b6dfdf
>  #12 [ffffc900086a7eb0] task_work_run at ffffffff8126e5cf
>  #13 [ffffc900086a7ef8] exit_to_user_mode_prepare at ffffffff813fa136
>  #14 [ffffc900086a7f28] syscall_exit_to_user_mode at ffffffff83d25dbb
>  #15 [ffffc900086a7f40] do_syscall_64 at ffffffff83d1f8d9
>  #16 [ffffc900086a7f50] entry_SYSCALL_64_after_hwframe at ffffffff83e00085
> 
> When we free a cluster buffer from xfs_ifree_cluster, all the inodes in
> cache are marked XFS_ISTALE. On journal commit dirty stale inodes as are
> handled by both buffer and inode log items, inodes marked as XFS_ISTALE
> in AIL will be removed from the AIL because the buffer log item will clean
> it. If the transaction commit fails in the xfs_inactive_ifree(), inodes
> marked as XFS_ISTALE will be left in AIL due to buf log item is not
> committed, this will cause the unmount thread above to be blocked all the
> time. Error handling in xfs_inactive_ifree() is not enough, the above
> exception needs to be considered.
> 
> Signed-off-by: Long Li <leo.lilong@huawei.com>
> ---
>  fs/xfs/xfs_inode.c | 114 +++++++++++++++++++++++++++++++++++++++++----
>  fs/xfs/xfs_inode.h |   1 -
>  2 files changed, 105 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index d354ea2b74f9..b6808c0a2868 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -49,6 +49,9 @@ struct kmem_cache *xfs_inode_cache;
>  STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
>  STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
>  	struct xfs_inode *);
> +STATIC int xfs_ifree(struct xfs_trans *tp, struct xfs_inode *ip,
> +		struct xfs_icluster *xic);
> +STATIC void xfs_ifree_abort(struct xfs_inode *ip, struct xfs_icluster *xic);
>  
>  /*
>   * helper function to extract extent size hint from inode
> @@ -1544,6 +1547,7 @@ xfs_inactive_ifree(
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_trans	*tp;
> +	struct xfs_icluster     xic = { 0 };
>  	int			error;
>  
>  	/*
> @@ -1598,7 +1602,7 @@ xfs_inactive_ifree(
>  	xfs_ilock(ip, XFS_ILOCK_EXCL);
>  	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
>  
> -	error = xfs_ifree(tp, ip);
> +	error = xfs_ifree(tp, ip, &xic);
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
>  	if (error) {
>  		/*
> @@ -1612,7 +1616,7 @@ xfs_inactive_ifree(
>  			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
>  		}
>  		xfs_trans_cancel(tp);
> -		return error;
> +		goto out_error;
>  	}
>  
>  	/*
> @@ -1625,11 +1629,19 @@ xfs_inactive_ifree(
>  	 * to try to keep going. Make sure it's not a silent error.
>  	 */
>  	error = xfs_trans_commit(tp);
> -	if (error)
> +	if (error) {
>  		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
>  			__func__, error);
> +		goto out_error;
> +	}
>  
>  	return 0;
> +
> +out_error:
> +	if (xic.deleted)
> +		xfs_ifree_abort(ip, &xic);
> +
> +	return error;
>  }
>  
>  /*
> @@ -2259,14 +2271,14 @@ xfs_ifree_cluster(
>   * inodes in the AGI. We need to remove the inode from that list atomically with
>   * respect to freeing it here.
>   */
> -int
> +STATIC int
>  xfs_ifree(
>  	struct xfs_trans	*tp,
> -	struct xfs_inode	*ip)
> +	struct xfs_inode	*ip,
> +	struct xfs_icluster     *xic)
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_perag	*pag;
> -	struct xfs_icluster	xic = { 0 };
>  	struct xfs_inode_log_item *iip = ip->i_itemp;
>  	int			error;
>  
> @@ -2284,7 +2296,7 @@ xfs_ifree(
>  	 * makes the AGI lock -> unlinked list modification order the same as
>  	 * used in O_TMPFILE creation.
>  	 */
> -	error = xfs_difree(tp, pag, ip->i_ino, &xic);
> +	error = xfs_difree(tp, pag, ip->i_ino, xic);
>  	if (error)
>  		goto out;
>  
> @@ -2323,13 +2335,97 @@ xfs_ifree(
>  	VFS_I(ip)->i_generation++;
>  	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
>  
> -	if (xic.deleted)
> -		error = xfs_ifree_cluster(tp, pag, ip, &xic);
> +	if (xic->deleted)
> +		error = xfs_ifree_cluster(tp, pag, ip, xic);
>  out:
>  	xfs_perag_put(pag);
>  	return error;
>  }
>  
> +static void
> +xfs_ifree_abort_inode_stale(
> +	struct xfs_perag	*pag,
> +	xfs_ino_t		inum)
> +{
> +	struct xfs_mount        *mp = pag->pag_mount;
> +	struct xfs_inode_log_item *iip;
> +	struct xfs_inode	*ip;
> +
> +retry:
> +	rcu_read_lock();
> +	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
> +
> +	/* Inode not in memory, nothing to do */
> +	if (!ip) {
> +		rcu_read_unlock();
> +		return;
> +	}
> +
> +	/* Skip invalid or not stale inode */
> +	if (ip->i_ino != inum || !xfs_iflags_test(ip, XFS_ISTALE)) {
> +		rcu_read_unlock();
> +		return;
> +	}
> +
> +	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
> +		rcu_read_unlock();
> +		delay(1);
> +		goto retry;
> +	}
> +
> +	iip = ip->i_itemp;
> +	if (!iip || list_empty(&iip->ili_item.li_bio_list))
> +		goto out_iunlock;
> +
> +	if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags))
> +		xfs_iflush_abort(ip);
> +	else
> +		xfs_iflags_clear(ip, XFS_IFLUSHING);

Er... why is the ifree code tearing into the inode log item state ?

Shouldn't this be getting done from the buffer log item when we release
it and find that it's aborted?

--D

> +
> +out_iunlock:
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	rcu_read_unlock();
> +}
> +
> +/*
> + * This is called to clean up inodes marked as stale in xfs_ifree
> + */
> +STATIC void
> +xfs_ifree_abort(
> +	struct xfs_inode	*ip,
> +	struct xfs_icluster	*xic)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_perag        *pag;
> +	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> +	xfs_ino_t		inum = xic->first_ino;
> +	int			nbufs;
> +	int			i, j;
> +	int			ioffset;
> +
> +	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
> +
> +	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
> +
> +	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
> +		/*
> +		 * The allocation bitmap tells us which inodes of the chunk were
> +		 * physically allocated. Skip the cluster if an inode falls into
> +		 * a sparse region.
> +		 */
> +		ioffset = inum - xic->first_ino;
> +		if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
> +			ASSERT(ioffset % igeo->inodes_per_cluster == 0);
> +			continue;
> +		}
> +
> +		for (i = 0; i < igeo->inodes_per_cluster; i++)
> +			xfs_ifree_abort_inode_stale(pag, inum + i);
> +
> +	}
> +	xfs_perag_put(pag);
> +}
> +
>  /*
>   * This is called to unpin an inode.  The caller must have the inode locked
>   * in at least shared mode so that the buffer cannot be subsequently pinned
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index fa780f08dc89..423542bf6af1 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -499,7 +499,6 @@ uint		xfs_ilock_data_map_shared(struct xfs_inode *);
>  uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
>  
>  uint		xfs_ip2xflags(struct xfs_inode *);
> -int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
>  int		xfs_itruncate_extents_flags(struct xfs_trans **,
>  				struct xfs_inode *, int, xfs_fsize_t, int);
>  void		xfs_iext_realloc(xfs_inode_t *, int, int);
> -- 
> 2.31.1
>
Long Li Feb. 11, 2023, 11:46 a.m. UTC | #2
On Tue, Jan 31, 2023 at 07:21:39PM -0800, Darrick J. Wong wrote:
> On Fri, Dec 09, 2022 at 07:05:19PM +0800, Long Li wrote:
> > After running unplug disk test and unmount filesystem, the umount thread
> > hung all the time.
> > 
> >  crash> dmesg
> >  sd 0:0:0:0: rejecting I/O to offline device
> >  XFS (sda): log I/O error -5
> >  XFS (sda): Corruption of in-memory data (0x8) detected at xfs_defer_finish_noroll+0x12e0/0x1cf0
> > 	(fs/xfs/libxfs/xfs_defer.c:504).  Shutting down filesystem.
> >  XFS (sda): Please unmount the filesystem and rectify the problem(s)
> >  XFS (sda): xfs_inactive_ifree: xfs_trans_commit returned error -5
> >  XFS (sda): Unmounting Filesystem
> > 
> >  crash> bt 3368
> >  PID: 3368   TASK: ffff88801bcd8040  CPU: 3   COMMAND: "umount"
> >   #0 [ffffc900086a7ae0] __schedule at ffffffff83d3fd25
> >   #1 [ffffc900086a7be8] schedule at ffffffff83d414dd
> >   #2 [ffffc900086a7c10] xfs_ail_push_all_sync at ffffffff8256db24
> >   #3 [ffffc900086a7d18] xfs_unmount_flush_inodes at ffffffff824ee7e2
> >   #4 [ffffc900086a7d28] xfs_unmountfs at ffffffff824f2eff
> >   #5 [ffffc900086a7da8] xfs_fs_put_super at ffffffff82503e69
> >   #6 [ffffc900086a7de8] generic_shutdown_super at ffffffff81aeb8cd
> >   #7 [ffffc900086a7e10] kill_block_super at ffffffff81aefcfa
> >   #8 [ffffc900086a7e30] deactivate_locked_super at ffffffff81aeb2da
> >   #9 [ffffc900086a7e48] deactivate_super at ffffffff81aeb639
> >  #10 [ffffc900086a7e68] cleanup_mnt at ffffffff81b6ddd5
> >  #11 [ffffc900086a7ea0] __cleanup_mnt at ffffffff81b6dfdf
> >  #12 [ffffc900086a7eb0] task_work_run at ffffffff8126e5cf
> >  #13 [ffffc900086a7ef8] exit_to_user_mode_prepare at ffffffff813fa136
> >  #14 [ffffc900086a7f28] syscall_exit_to_user_mode at ffffffff83d25dbb
> >  #15 [ffffc900086a7f40] do_syscall_64 at ffffffff83d1f8d9
> >  #16 [ffffc900086a7f50] entry_SYSCALL_64_after_hwframe at ffffffff83e00085
> > 
> > When we free a cluster buffer from xfs_ifree_cluster, all the inodes in
> > cache are marked XFS_ISTALE. On journal commit dirty stale inodes as are
> > handled by both buffer and inode log items, inodes marked as XFS_ISTALE
> > in AIL will be removed from the AIL because the buffer log item will clean
> > it. If the transaction commit fails in the xfs_inactive_ifree(), inodes
> > marked as XFS_ISTALE will be left in AIL due to buf log item is not
> > committed, this will cause the unmount thread above to be blocked all the
> > time. Error handling in xfs_inactive_ifree() is not enough, the above
> > exception needs to be considered.
> > 
> > Signed-off-by: Long Li <leo.lilong@huawei.com>
> > ---
> >  fs/xfs/xfs_inode.c | 114 +++++++++++++++++++++++++++++++++++++++++----
> >  fs/xfs/xfs_inode.h |   1 -
> >  2 files changed, 105 insertions(+), 10 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index d354ea2b74f9..b6808c0a2868 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -49,6 +49,9 @@ struct kmem_cache *xfs_inode_cache;
> >  STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
> >  STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
> >  	struct xfs_inode *);
> > +STATIC int xfs_ifree(struct xfs_trans *tp, struct xfs_inode *ip,
> > +		struct xfs_icluster *xic);
> > +STATIC void xfs_ifree_abort(struct xfs_inode *ip, struct xfs_icluster *xic);
> >  
> >  /*
> >   * helper function to extract extent size hint from inode
> > @@ -1544,6 +1547,7 @@ xfs_inactive_ifree(
> >  {
> >  	struct xfs_mount	*mp = ip->i_mount;
> >  	struct xfs_trans	*tp;
> > +	struct xfs_icluster     xic = { 0 };
> >  	int			error;
> >  
> >  	/*
> > @@ -1598,7 +1602,7 @@ xfs_inactive_ifree(
> >  	xfs_ilock(ip, XFS_ILOCK_EXCL);
> >  	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> >  
> > -	error = xfs_ifree(tp, ip);
> > +	error = xfs_ifree(tp, ip, &xic);
> >  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> >  	if (error) {
> >  		/*
> > @@ -1612,7 +1616,7 @@ xfs_inactive_ifree(
> >  			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
> >  		}
> >  		xfs_trans_cancel(tp);
> > -		return error;
> > +		goto out_error;
> >  	}
> >  
> >  	/*
> > @@ -1625,11 +1629,19 @@ xfs_inactive_ifree(
> >  	 * to try to keep going. Make sure it's not a silent error.
> >  	 */
> >  	error = xfs_trans_commit(tp);
> > -	if (error)
> > +	if (error) {
> >  		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
> >  			__func__, error);
> > +		goto out_error;
> > +	}
> >  
> >  	return 0;
> > +
> > +out_error:
> > +	if (xic.deleted)
> > +		xfs_ifree_abort(ip, &xic);
> > +
> > +	return error;
> >  }
> >  
> >  /*
> > @@ -2259,14 +2271,14 @@ xfs_ifree_cluster(
> >   * inodes in the AGI. We need to remove the inode from that list atomically with
> >   * respect to freeing it here.
> >   */
> > -int
> > +STATIC int
> >  xfs_ifree(
> >  	struct xfs_trans	*tp,
> > -	struct xfs_inode	*ip)
> > +	struct xfs_inode	*ip,
> > +	struct xfs_icluster     *xic)
> >  {
> >  	struct xfs_mount	*mp = ip->i_mount;
> >  	struct xfs_perag	*pag;
> > -	struct xfs_icluster	xic = { 0 };
> >  	struct xfs_inode_log_item *iip = ip->i_itemp;
> >  	int			error;
> >  
> > @@ -2284,7 +2296,7 @@ xfs_ifree(
> >  	 * makes the AGI lock -> unlinked list modification order the same as
> >  	 * used in O_TMPFILE creation.
> >  	 */
> > -	error = xfs_difree(tp, pag, ip->i_ino, &xic);
> > +	error = xfs_difree(tp, pag, ip->i_ino, xic);
> >  	if (error)
> >  		goto out;
> >  
> > @@ -2323,13 +2335,97 @@ xfs_ifree(
> >  	VFS_I(ip)->i_generation++;
> >  	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> >  
> > -	if (xic.deleted)
> > -		error = xfs_ifree_cluster(tp, pag, ip, &xic);
> > +	if (xic->deleted)
> > +		error = xfs_ifree_cluster(tp, pag, ip, xic);
> >  out:
> >  	xfs_perag_put(pag);
> >  	return error;
> >  }
> >  
> > +static void
> > +xfs_ifree_abort_inode_stale(
> > +	struct xfs_perag	*pag,
> > +	xfs_ino_t		inum)
> > +{
> > +	struct xfs_mount        *mp = pag->pag_mount;
> > +	struct xfs_inode_log_item *iip;
> > +	struct xfs_inode	*ip;
> > +
> > +retry:
> > +	rcu_read_lock();
> > +	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
> > +
> > +	/* Inode not in memory, nothing to do */
> > +	if (!ip) {
> > +		rcu_read_unlock();
> > +		return;
> > +	}
> > +
> > +	/* Skip invalid or not stale inode */
> > +	if (ip->i_ino != inum || !xfs_iflags_test(ip, XFS_ISTALE)) {
> > +		rcu_read_unlock();
> > +		return;
> > +	}
> > +
> > +	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
> > +		rcu_read_unlock();
> > +		delay(1);
> > +		goto retry;
> > +	}
> > +
> > +	iip = ip->i_itemp;
> > +	if (!iip || list_empty(&iip->ili_item.li_bio_list))
> > +		goto out_iunlock;
> > +
> > +	if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags))
> > +		xfs_iflush_abort(ip);
> > +	else
> > +		xfs_iflags_clear(ip, XFS_IFLUSHING);
> 
> Er... why is the ifree code tearing into the inode log item state ?
> 
> Shouldn't this be getting done from the buffer log item when we release
> it and find that it's aborted?
> 
> --D

Yes, it doesn't looks good here, traverse buffer's b_li_list and abort xfs_inode
marked as XFS_ISTALE could be better.
> 
> > +
> > +out_iunlock:
> > +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > +	rcu_read_unlock();
> > +}
> > +
> > +/*
> > + * This is called to clean up inodes marked as stale in xfs_ifree
> > + */
> > +STATIC void
> > +xfs_ifree_abort(
> > +	struct xfs_inode	*ip,
> > +	struct xfs_icluster	*xic)
> > +{
> > +	struct xfs_mount	*mp = ip->i_mount;
> > +	struct xfs_perag        *pag;
> > +	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> > +	xfs_ino_t		inum = xic->first_ino;
> > +	int			nbufs;
> > +	int			i, j;
> > +	int			ioffset;
> > +
> > +	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
> > +
> > +	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
> > +
> > +	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
> > +		/*
> > +		 * The allocation bitmap tells us which inodes of the chunk were
> > +		 * physically allocated. Skip the cluster if an inode falls into
> > +		 * a sparse region.
> > +		 */
> > +		ioffset = inum - xic->first_ino;
> > +		if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
> > +			ASSERT(ioffset % igeo->inodes_per_cluster == 0);
> > +			continue;
> > +		}
> > +
> > +		for (i = 0; i < igeo->inodes_per_cluster; i++)
> > +			xfs_ifree_abort_inode_stale(pag, inum + i);
> > +
> > +	}
> > +	xfs_perag_put(pag);
> > +}
> > +
> >  /*
> >   * This is called to unpin an inode.  The caller must have the inode locked
> >   * in at least shared mode so that the buffer cannot be subsequently pinned
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index fa780f08dc89..423542bf6af1 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -499,7 +499,6 @@ uint		xfs_ilock_data_map_shared(struct xfs_inode *);
> >  uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
> >  
> >  uint		xfs_ip2xflags(struct xfs_inode *);
> > -int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
> >  int		xfs_itruncate_extents_flags(struct xfs_trans **,
> >  				struct xfs_inode *, int, xfs_fsize_t, int);
> >  void		xfs_iext_realloc(xfs_inode_t *, int, int);
> > -- 
> > 2.31.1
> >
diff mbox series

Patch

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d354ea2b74f9..b6808c0a2868 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -49,6 +49,9 @@  struct kmem_cache *xfs_inode_cache;
 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
 STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
 	struct xfs_inode *);
+STATIC int xfs_ifree(struct xfs_trans *tp, struct xfs_inode *ip,
+		struct xfs_icluster *xic);
+STATIC void xfs_ifree_abort(struct xfs_inode *ip, struct xfs_icluster *xic);
 
 /*
  * helper function to extract extent size hint from inode
@@ -1544,6 +1547,7 @@  xfs_inactive_ifree(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
+	struct xfs_icluster     xic = { 0 };
 	int			error;
 
 	/*
@@ -1598,7 +1602,7 @@  xfs_inactive_ifree(
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
-	error = xfs_ifree(tp, ip);
+	error = xfs_ifree(tp, ip, &xic);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	if (error) {
 		/*
@@ -1612,7 +1616,7 @@  xfs_inactive_ifree(
 			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 		}
 		xfs_trans_cancel(tp);
-		return error;
+		goto out_error;
 	}
 
 	/*
@@ -1625,11 +1629,19 @@  xfs_inactive_ifree(
 	 * to try to keep going. Make sure it's not a silent error.
 	 */
 	error = xfs_trans_commit(tp);
-	if (error)
+	if (error) {
 		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
 			__func__, error);
+		goto out_error;
+	}
 
 	return 0;
+
+out_error:
+	if (xic.deleted)
+		xfs_ifree_abort(ip, &xic);
+
+	return error;
 }
 
 /*
@@ -2259,14 +2271,14 @@  xfs_ifree_cluster(
  * inodes in the AGI. We need to remove the inode from that list atomically with
  * respect to freeing it here.
  */
-int
+STATIC int
 xfs_ifree(
 	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	struct xfs_icluster     *xic)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_perag	*pag;
-	struct xfs_icluster	xic = { 0 };
 	struct xfs_inode_log_item *iip = ip->i_itemp;
 	int			error;
 
@@ -2284,7 +2296,7 @@  xfs_ifree(
 	 * makes the AGI lock -> unlinked list modification order the same as
 	 * used in O_TMPFILE creation.
 	 */
-	error = xfs_difree(tp, pag, ip->i_ino, &xic);
+	error = xfs_difree(tp, pag, ip->i_ino, xic);
 	if (error)
 		goto out;
 
@@ -2323,13 +2335,97 @@  xfs_ifree(
 	VFS_I(ip)->i_generation++;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	if (xic.deleted)
-		error = xfs_ifree_cluster(tp, pag, ip, &xic);
+	if (xic->deleted)
+		error = xfs_ifree_cluster(tp, pag, ip, xic);
 out:
 	xfs_perag_put(pag);
 	return error;
 }
 
+static void
+xfs_ifree_abort_inode_stale(
+	struct xfs_perag	*pag,
+	xfs_ino_t		inum)
+{
+	struct xfs_mount        *mp = pag->pag_mount;
+	struct xfs_inode_log_item *iip;
+	struct xfs_inode	*ip;
+
+retry:
+	rcu_read_lock();
+	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+	/* Inode not in memory, nothing to do */
+	if (!ip) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* Skip invalid or not stale inode */
+	if (ip->i_ino != inum || !xfs_iflags_test(ip, XFS_ISTALE)) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+		rcu_read_unlock();
+		delay(1);
+		goto retry;
+	}
+
+	iip = ip->i_itemp;
+	if (!iip || list_empty(&iip->ili_item.li_bio_list))
+		goto out_iunlock;
+
+	if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags))
+		xfs_iflush_abort(ip);
+	else
+		xfs_iflags_clear(ip, XFS_IFLUSHING);
+
+out_iunlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	rcu_read_unlock();
+}
+
+/*
+ * This is called to clean up inodes marked as stale in xfs_ifree
+ */
+STATIC void
+xfs_ifree_abort(
+	struct xfs_inode	*ip,
+	struct xfs_icluster	*xic)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag        *pag;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	xfs_ino_t		inum = xic->first_ino;
+	int			nbufs;
+	int			i, j;
+	int			ioffset;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
+
+	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
+		/*
+		 * The allocation bitmap tells us which inodes of the chunk were
+		 * physically allocated. Skip the cluster if an inode falls into
+		 * a sparse region.
+		 */
+		ioffset = inum - xic->first_ino;
+		if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+			ASSERT(ioffset % igeo->inodes_per_cluster == 0);
+			continue;
+		}
+
+		for (i = 0; i < igeo->inodes_per_cluster; i++)
+			xfs_ifree_abort_inode_stale(pag, inum + i);
+
+	}
+	xfs_perag_put(pag);
+}
+
 /*
  * This is called to unpin an inode.  The caller must have the inode locked
  * in at least shared mode so that the buffer cannot be subsequently pinned
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fa780f08dc89..423542bf6af1 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -499,7 +499,6 @@  uint		xfs_ilock_data_map_shared(struct xfs_inode *);
 uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
 
 uint		xfs_ip2xflags(struct xfs_inode *);
-int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
 int		xfs_itruncate_extents_flags(struct xfs_trans **,
 				struct xfs_inode *, int, xfs_fsize_t, int);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);