diff mbox series

[2/2] bundle: fix non-linear performance scaling with refs

Message ID 20250401-488-generating-bundles-with-many-references-has-non-linear-performance-v1-2-6d23b2d96557@gmail.com (mailing list archive)
State New
Headers show
Series bundle: fix non-linear performance scaling with refs | expand

Commit Message

Karthik Nayak April 1, 2025, 5 p.m. UTC
The 'git bundle create' command has non-linear performance with the
number of refs in the repository. Benchmarking the command shows that
a large portion of the time (~75%) is spent in the
`object_array_remove_duplicates()` function.

The `object_array_remove_duplicates()` function was added in
b2a6d1c686 (bundle: allow the same ref to be given more than once,
2009-01-17) to skip duplicate refs provided by the user from being
written to the bundle. Since this is an O(N^2) algorithm, in repos with
large number of references, this can take up a large amount of time.

Let's instead use a 'strset' to skip duplicates inside
`write_bundle_refs()`. This improves the performance by around 6 times
when tested against in repository with 100000 refs:

Benchmark 1: bundle (refcount = 100000, revision = master)
  Time (mean ± σ):     14.653 s ±  0.203 s    [User: 13.940 s, System: 0.762 s]
  Range (min … max):   14.237 s … 14.920 s    10 runs

Benchmark 2: bundle (refcount = 100000, revision = HEAD)
  Time (mean ± σ):      2.394 s ±  0.023 s    [User: 1.684 s, System: 0.798 s]
  Range (min … max):    2.364 s …  2.425 s    10 runs

Summary
  bundle (refcount = 100000, revision = HEAD) ran
    6.12 ± 0.10 times faster than bundle (refcount = 100000, revision = master)

Previously, `object_array_remove_duplicates()` ensured that both the
refname and the object it pointed to were checked for duplicates. The
new approach, implemented within `write_bundle_refs()`, eliminates
duplicate refnames without comparing the objects they reference. This
works because, for bundle creation, we only need to prevent duplicate
refs from being written to the bundle header. The `revs->pending` array
can contain duplicates of multiple types.

First, references which resolve to the same refname. For e.g. "git
bundle create out.bdl master master" or "git bundle create out.bdl
refs/heads/master refs/heads/master" or "git bundle create out.bdl
master refs/heads/master". In these scenarios we want to prevent writing
"refs/heads/master" twice to the bundle header. Since both the refnames
here would point to the same object (unless there is a race), we do not
need to check equality of the object.

Second, refnames which are duplicates but do not point to the same
object. This can happen when we use an exclusion criteria. For e.g. "git
bundle create out.bdl master master^!", Here `revs->pending` would
contain two elements, both with refname set to "master". However, each
of them would be pointing to an INTERESTING and UNINTERESTING object
respectively. Since we only write refnames with INTERESTING objects to
the bundle header, we perform our duplicate checks only on such objects.

Signed-off-by: Karthik Nayak <karthik.188@gmail.com>
---
 bundle.c               | 10 +++++++++-
 object.c               | 33 ---------------------------------
 object.h               |  6 ------
 t/t6020-bundle-misc.sh |  4 ----
 4 files changed, 9 insertions(+), 44 deletions(-)
diff mbox series

Patch

diff --git a/bundle.c b/bundle.c
index d7ad690843..30cfba0be2 100644
--- a/bundle.c
+++ b/bundle.c
@@ -384,6 +384,9 @@  static int write_bundle_refs(int bundle_fd, struct rev_info *revs)
 {
 	int i;
 	int ref_count = 0;
+	struct strset objects;
+
+	strset_init(&objects);
 
 	for (i = 0; i < revs->pending.nr; i++) {
 		struct object_array_entry *e = revs->pending.objects + i;
@@ -401,6 +404,9 @@  static int write_bundle_refs(int bundle_fd, struct rev_info *revs)
 			flag = 0;
 		display_ref = (flag & REF_ISSYMREF) ? e->name : ref;
 
+		if (strset_contains(&objects, display_ref))
+			goto skip_write_ref;
+
 		if (e->item->type == OBJ_TAG &&
 				!is_tag_in_date_range(e->item, revs)) {
 			e->item->flags |= UNINTERESTING;
@@ -423,6 +429,7 @@  static int write_bundle_refs(int bundle_fd, struct rev_info *revs)
 		}
 
 		ref_count++;
+		strset_add(&objects, display_ref);
 		write_or_die(bundle_fd, oid_to_hex(&e->item->oid), the_hash_algo->hexsz);
 		write_or_die(bundle_fd, " ", 1);
 		write_or_die(bundle_fd, display_ref, strlen(display_ref));
@@ -431,6 +438,8 @@  static int write_bundle_refs(int bundle_fd, struct rev_info *revs)
 		free(ref);
 	}
 
+	strset_clear(&objects);
+
 	/* end header */
 	write_or_die(bundle_fd, "\n", 1);
 	return ref_count;
@@ -566,7 +575,6 @@  int create_bundle(struct repository *r, const char *path,
 	 */
 	revs.blob_objects = revs.tree_objects = 0;
 	traverse_commit_list(&revs, write_bundle_prerequisites, NULL, &bpi);
-	object_array_remove_duplicates(&revs_copy.pending);
 
 	/* write bundle refs */
 	ref_count = write_bundle_refs(bundle_fd, &revs_copy);
diff --git a/object.c b/object.c
index 100bf9b8d1..a2c5986178 100644
--- a/object.c
+++ b/object.c
@@ -491,39 +491,6 @@  void object_array_clear(struct object_array *array)
 	array->nr = array->alloc = 0;
 }
 
-/*
- * Return true if array already contains an entry.
- */
-static int contains_object(struct object_array *array,
-			   const struct object *item, const char *name)
-{
-	unsigned nr = array->nr, i;
-	struct object_array_entry *object = array->objects;
-
-	for (i = 0; i < nr; i++, object++)
-		if (item == object->item && !strcmp(object->name, name))
-			return 1;
-	return 0;
-}
-
-void object_array_remove_duplicates(struct object_array *array)
-{
-	unsigned nr = array->nr, src;
-	struct object_array_entry *objects = array->objects;
-
-	array->nr = 0;
-	for (src = 0; src < nr; src++) {
-		if (!contains_object(array, objects[src].item,
-				     objects[src].name)) {
-			if (src != array->nr)
-				objects[array->nr] = objects[src];
-			array->nr++;
-		} else {
-			object_array_release_entry(&objects[src]);
-		}
-	}
-}
-
 void clear_object_flags(unsigned flags)
 {
 	int i;
diff --git a/object.h b/object.h
index 17f32f1103..0e12c75922 100644
--- a/object.h
+++ b/object.h
@@ -324,12 +324,6 @@  typedef int (*object_array_each_func_t)(struct object_array_entry *, void *);
 void object_array_filter(struct object_array *array,
 			 object_array_each_func_t want, void *cb_data);
 
-/*
- * Remove from array all but the first entry with a given name.
- * Warning: this function uses an O(N^2) algorithm.
- */
-void object_array_remove_duplicates(struct object_array *array);
-
 /*
  * Remove any objects from the array, freeing all used memory; afterwards
  * the array is ready to store more objects with add_object_array().
diff --git a/t/t6020-bundle-misc.sh b/t/t6020-bundle-misc.sh
index dd09df1287..500c81b8a1 100755
--- a/t/t6020-bundle-misc.sh
+++ b/t/t6020-bundle-misc.sh
@@ -684,7 +684,6 @@  test_expect_success 'create bundle with duplicate refnames' '
 	test_cmp expect actual
 '
 
-# This exhibits a bug, since the same refname is now added to the bundle twice.
 test_expect_success 'create bundle with duplicate refnames and --all' '
 	git bundle create out.bdl --all "main" "main" &&
 
@@ -701,7 +700,6 @@  test_expect_success 'create bundle with duplicate refnames and --all' '
 	<TAG-2> refs/tags/v2
 	<TAG-3> refs/tags/v3
 	<COMMIT-P> HEAD
-	<COMMIT-P> refs/heads/main
 	EOF
 	test_cmp expect actual
 '
@@ -717,7 +715,6 @@  test_expect_success 'create bundle with duplicate exlusion refnames' '
 	test_cmp expect actual
 '
 
-# This exhibits a bug, since the same refname is now added to the bundle twice.
 test_expect_success 'create bundle with duplicate refname short-form' '
 	git bundle create out.bdl "main" "main" "refs/heads/main" "refs/heads/main" &&
 
@@ -725,7 +722,6 @@  test_expect_success 'create bundle with duplicate refname short-form' '
 		make_user_friendly_and_stable_output >actual &&
 	cat >expect <<-\EOF &&
 	<COMMIT-P> refs/heads/main
-	<COMMIT-P> refs/heads/main
 	EOF
 	test_cmp expect actual
 '