diff mbox series

[RFC,3/3] Preload submodule state in refresh_index

Message ID 20240305012112.1598053-5-atneya@google.com (mailing list archive)
State New, archived
Headers show
Series Parallel submodule status | expand

Commit Message

Atneya Nair March 5, 2024, 1:21 a.m. UTC
refresh_index currently parallelizes updating cache_entries for regular
files based on lstat. Expand the preload to also parallelize exploring
the checked-out state of submodules, which is substantially more
expensive.

Cache the state of the submodule in memory, to avoid unnecessary
re-computation (similar to regular files).

This speeds up git status and other operations that examine the index
once it has been read, especially in repositories with many submodules.

Signed-off-by: Atneya Nair <atneya@google.com>
---

Notes:
    For now, I added a new field to store the submodule state.
    
    Open questions:
    - Where can we efficiently store the submodule state? I assume we can
    re-use some of the ce_flags which aren't used for submodules?
    
    - Why is the thread count capped at 64? Can we make this limit
    user-configurable?

 preload-index.c | 25 ++++++++++++++++++++++---
 read-cache-ll.h |  1 +
 read-cache.c    |  3 +++
 3 files changed, 26 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/preload-index.c b/preload-index.c
index 63fd35d64b..091b22fa4c 100644
--- a/preload-index.c
+++ b/preload-index.c
@@ -22,7 +22,7 @@ 
  * to have at least 500 lstat's per thread for it to
  * be worth starting a thread.
  */
-#define MAX_PARALLEL (20)
+#define MAX_PARALLEL (60)
 #define THREAD_COST (500)
 
 struct progress_data {
@@ -59,8 +59,21 @@  static void *preload_thread(void *_data)
 
 		if (ce_stage(ce))
 			continue;
-		if (S_ISGITLINK(ce->ce_mode))
+		if (S_ISGITLINK(ce->ce_mode)) {
+			// This call evaluates the submodule HEAD for GITLINK, which really does determine
+			// if there is a change (for index purposes). We can't use the traditional path of
+			// marking as VALID, because valid can't be used for submodules due to other code
+			// paths in which valid may skip investigation of the worktree in the submodule.
+			// Gitlinks also aren't statable, or fsmonitorable, so caching doesn't have the same
+			// semantics.
+			// Use a special entry to mark the ref change state and its validity. Future calls
+			// to ce_compare_gitlink will leverage this.
+			if (lstat(ce->name, &st))
+				continue;
+			ce->sub_ref_state = (!!(ie_match_stat(index, ce, &st,
+					CE_MATCH_RACY_IS_DIRTY|CE_MATCH_IGNORE_FSMONITOR) & DATA_CHANGED) << 1) | 0x1;
 			continue;
+		}
 		if (ce_uptodate(ce))
 			continue;
 		if (ce_skip_worktree(ce))
@@ -107,11 +120,17 @@  void preload_index(struct index_state *index,
 	struct thread_data data[MAX_PARALLEL];
 	struct progress_data pd;
 	int t2_sum_lstat = 0;
+	int link_count = 0;
 
 	if (!HAVE_THREADS || !core_preload_index)
 		return;
 
-	threads = index->cache_nr / THREAD_COST;
+	for (i = 0; i < index->cache_nr; i++) {
+		link_count += (S_ISGITLINK(index->cache[i]->ce_mode));
+	}
+	// Exploring gitlinks are much more expensive than lstat, so modify the cost
+	threads = (index->cache_nr / THREAD_COST) + (link_count / 25);
+
 	if ((index->cache_nr > 1) && (threads < 2) && git_env_bool("GIT_TEST_PRELOAD_INDEX", 0))
 		threads = 2;
 	if (threads < 2)
diff --git a/read-cache-ll.h b/read-cache-ll.h
index 2a50a784f0..5555bb0ae9 100644
--- a/read-cache-ll.h
+++ b/read-cache-ll.h
@@ -27,6 +27,7 @@  struct cache_entry {
 	unsigned int mem_pool_allocated;
 	unsigned int ce_namelen;
 	unsigned int index;	/* for link extension */
+	unsigned int sub_ref_state; /* TODO pack somewhere. Lowest bit valid, second lowest dirty. */
 	struct object_id oid;
 	char name[FLEX_ARRAY]; /* more */
 };
diff --git a/read-cache.c b/read-cache.c
index f546cf7875..541d40ca30 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -271,6 +271,9 @@  static int ce_compare_gitlink(const struct cache_entry *ce)
 	 *
 	 * If so, we consider it always to match.
 	 */
+	if (ce->sub_ref_state & 0x1)
+		/* Check the cached value */
+		return ce->sub_ref_state >> 1;
 	if (resolve_gitlink_ref(ce->name, "HEAD", &oid) < 0)
 		return 0;
 	return !oideq(&oid, &ce->oid);