diff mbox series

[RFC,4/4] xfs: skip busy inodes on finobt inode allocation

Message ID 20220217172518.3842951-5-bfoster@redhat.com (mailing list archive)
State Deferred, archived
Headers show
Series xfs: track and skip realloc of busy inodes | expand

Commit Message

Brian Foster Feb. 17, 2022, 5:25 p.m. UTC
Experimental algorithm to skip busy inodes on finobt based inode
allocation. This is a first pass implementation intended primarily
for functional and performance experimentation. The logic is
intentionally kept simple to minimize the scope of changes required
to demonstrate functionality.

The existing finobt inode allocation algorithms are updated to
filter out records that are covered by at least one still-busy
inode[1] and continue scanning as appropriate based on the
allocation mode. For example, near allocation mode continues
scanning left and right until a usable record is found. newino mode
checks the target record and then scans from the first record in the
tree until a usable record is found. If the associated algorithm
cannot find a usable record, it falls back to new chunk allocation
to add non-busy inodes to the selection pool and restarts.

[1] As I write this, it occurs to me this logic could be further
improved to compare the first busy inode against the first free
inode in the record without disrupting the subsequent inode
selection logic.

Not-Signed-off-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_ialloc.c | 81 +++++++++++++++++++++++++++++++++++---
 1 file changed, 76 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 3eb41228e210..c79c85327cf4 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1248,12 +1248,53 @@  xfs_dialloc_ag_inobt(
 	return error;
 }
 
+#define XFS_LOOKUP_BATCH	XFS_INODES_PER_CHUNK
+#define XFS_ICI_BUSY_TAG	2
+STATIC bool
+xfs_dialloc_ag_inobt_rec_busy(
+	struct xfs_perag		*pag,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	struct xfs_inode		*batch[XFS_LOOKUP_BATCH];
+	struct xfs_inode		*ip;
+	int				found, i;
+	xfs_agino_t			startino = rec->ir_startino;
+	bool				busy = false;
+	unsigned long			destroy_gp;
+
+	rcu_read_lock();
+	found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, (void **) batch,
+					   startino, XFS_LOOKUP_BATCH,
+					   XFS_ICI_BUSY_TAG);
+	for (i = 0; i < found; i++) {
+		ip = batch[i];
+		spin_lock(&ip->i_flags_lock);
+		if (ip->i_ino >= startino + XFS_INODES_PER_CHUNK) {
+			spin_unlock(&ip->i_flags_lock);
+			break;
+		}
+		destroy_gp = ip->i_destroy_gp;
+		spin_unlock(&ip->i_flags_lock);
+
+		if (!poll_state_synchronize_rcu(destroy_gp)) {
+			busy = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	trace_printk("%d: agno %d startino 0x%x found %d busy %d caller %pS\n",
+		     __LINE__, pag->pag_agno, startino, found, busy,
+		     (void *) _RET_IP_);
+	return busy;
+}
+
 /*
  * Use the free inode btree to allocate an inode based on distance from the
  * parent. Note that the provided cursor may be deleted and replaced.
  */
 STATIC int
 xfs_dialloc_ag_finobt_near(
+	struct xfs_perag		*pag,
 	xfs_agino_t			pagino,
 	struct xfs_btree_cur		**ocur,
 	struct xfs_inobt_rec_incore	*rec)
@@ -1281,8 +1322,10 @@  xfs_dialloc_ag_finobt_near(
 		 * existence is enough.
 		 */
 		if (pagino >= rec->ir_startino &&
-		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
-			return 0;
+		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) {
+			if (!xfs_dialloc_ag_inobt_rec_busy(pag, rec))
+				return 0;
+		}
 	}
 
 	error = xfs_btree_dup_cursor(lcur, &rcur);
@@ -1306,6 +1349,21 @@  xfs_dialloc_ag_finobt_near(
 		error = -EFSCORRUPTED;
 		goto error_rcur;
 	}
+
+	while (i == 1 && xfs_dialloc_ag_inobt_rec_busy(pag, rec)) {
+		error = xfs_ialloc_next_rec(lcur, rec, &i, 1);
+		if (error)
+			goto error_rcur;
+		i = !i;
+	}
+
+	while (j == 1 && xfs_dialloc_ag_inobt_rec_busy(pag, &rrec)) {
+		error = xfs_ialloc_next_rec(rcur, &rrec, &j, 0);
+		if (error)
+			goto error_rcur;
+		j = !j;
+	}
+
 	if (i == 1 && j == 1) {
 		/*
 		 * Both the left and right records are valid. Choose the closer
@@ -1327,6 +1385,9 @@  xfs_dialloc_ag_finobt_near(
 	} else if (i == 1) {
 		/* only the left record is valid */
 		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+	} else {
+		error = -EAGAIN;
+		goto error_rcur;
 	}
 
 	return 0;
@@ -1342,6 +1403,7 @@  xfs_dialloc_ag_finobt_near(
  */
 STATIC int
 xfs_dialloc_ag_finobt_newino(
+	struct xfs_perag		*pag,
 	struct xfs_agi			*agi,
 	struct xfs_btree_cur		*cur,
 	struct xfs_inobt_rec_incore	*rec)
@@ -1360,7 +1422,8 @@  xfs_dialloc_ag_finobt_newino(
 				return error;
 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
 				return -EFSCORRUPTED;
-			return 0;
+			if (!xfs_dialloc_ag_inobt_rec_busy(pag, rec))
+				return 0;
 		}
 	}
 
@@ -1379,6 +1442,14 @@  xfs_dialloc_ag_finobt_newino(
 	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
 		return -EFSCORRUPTED;
 
+	while (xfs_dialloc_ag_inobt_rec_busy(pag, rec)) {
+		error = xfs_ialloc_next_rec(cur, rec, &i, 0);
+		if (error)
+			return error;
+		if (i == 1)
+			return -EAGAIN;
+	}
+
 	return 0;
 }
 
@@ -1470,9 +1541,9 @@  xfs_dialloc_ag(
 	 * not, consider the agi hint or find the first free inode in the AG.
 	 */
 	if (pag->pag_agno == pagno)
-		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+		error = xfs_dialloc_ag_finobt_near(pag, pagino, &cur, &rec);
 	else
-		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+		error = xfs_dialloc_ag_finobt_newino(pag, agi, cur, &rec);
 	if (error)
 		goto error_cur;