diff mbox series

[v3,1/3] index-pack --promisor: dedup before checking links

Message ID 7ae21c921fe367d4b15cd4a299196009c15205d9.1733262662.git.jonathantanmy@google.com (mailing list archive)
State New
Headers show
Series Performance improvements for repacking non-promisor objects | expand

Commit Message

Jonathan Tan Dec. 3, 2024, 9:52 p.m. UTC
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) fixed a bug with what was believed to be a negligible
decrease in performance [1] [2]. But at $DAYJOB, with at least one repo,
it was found that the decrease in performance was very significant.

Looking at the patch, whenever we parse an object in the packfile to
be indexed, we check the targets of all its outgoing links for its
existence. However, this could be optimized by first collecting all such
targets into an oidset (thus deduplicating them) before checking. Teach
Git to do that.

On a certain fetch from the aforementioned repo, this improved
performance from approximately 7 hours to 24m47.815s. This number will
be further reduced in a subsequent patch.

[1] https://lore.kernel.org/git/CAG1j3zGiNMbri8rZNaF0w+yP+6OdMz0T8+8_Wgd1R_p1HzVasg@mail.gmail.com/
[2] https://lore.kernel.org/git/20241105212849.3759572-1-jonathantanmy@google.com/

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 builtin/index-pack.c | 73 +++++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 32 deletions(-)

Comments

Junio C Hamano Dec. 4, 2024, 4:36 a.m. UTC | #1
Jonathan Tan <jonathantanmy@google.com> writes:

> @@ -1781,26 +1775,41 @@ static void repack_local_links(void)
>  	struct object_id *oid;
>  	char *base_name;

We may want to give a meaningless NULL initialization to this
variable, due to false positive from a compliler.

> -	if (!oidset_size(&local_links))
> +	if (!oidset_size(&outgoing_links))
>  		return;
>  
> -	base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));

It used to be that it was really obvious that base_name is always
initialized.  But now due to micro-optimization ...

> +	oidset_iter_init(&outgoing_links, &iter);
> +	while ((oid = oidset_iter_next(&iter))) {
> +		struct object_info info = OBJECT_INFO_INIT;
> +		if (oid_object_info_extended(the_repository, oid, &info, 0))
> +			/* Missing; assume it is a promisor object */
> +			continue;
> +		if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
> +			continue;
> ...
> +		if (!cmd.args.nr) {
> +			base_name = mkpathdup(
> +				"%s/pack/pack",
> +				repo_get_object_directory(the_repository));

... we lazily allocate only after we know we will run a command.

> +			strvec_push(&cmd.args, "pack-objects");
> +			strvec_push(&cmd.args,
> +				    "--exclude-promisor-objects-best-effort");
> +			strvec_push(&cmd.args, base_name);
> +			cmd.git_cmd = 1;
> +			cmd.in = -1;
> +			cmd.out = -1;
> +			if (start_command(&cmd))
> +				die(_("could not start pack-objects to repack local links"));
> +		}

We know outgoing_links is not empty, so we know we will enter the
while() loop at least once, but it may be possible that all the
objects in the outgoing_links oidset end up to be missing or packed
in a promisor pack, hitting continue and never running the command.

> -	oidset_iter_init(&local_links, &iter);
> -	while ((oid = oidset_iter_next(&iter))) {
>  		if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
>  		    write_in_full(cmd.in, "\n", 1) < 0)
>  			die(_("failed to feed local object to pack-objects"));
>  	}
> +
> +	if (!cmd.args.nr)
> +		return;

But then we have this early return, so from human-reader's point of
view, we will never hit free(base_name) at the end of this function.

But GCC used in the macOS build does not seem to realize it.

https://github.com/git/git/actions/runs/12152173257/job/33888089229#step:4:380

It may be safer to give a meaningless NULL as the initial value of
the variable.

Thanks.
diff mbox series

Patch

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 95babdc5ea..d1c777a6af 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -155,11 +155,11 @@  static int input_fd, output_fd;
 static const char *curr_pack;
 
 /*
- * local_links is guarded by read_mutex, and record_local_links is read-only in
- * a thread.
+ * outgoing_links is guarded by read_mutex, and record_outgoing_links is
+ * read-only in a thread.
  */
-static struct oidset local_links = OIDSET_INIT;
-static int record_local_links;
+static struct oidset outgoing_links = OIDSET_INIT;
+static int record_outgoing_links;
 
 static struct thread_local_data *thread_data;
 static int nr_dispatched;
@@ -812,18 +812,12 @@  static int check_collison(struct object_entry *entry)
 	return 0;
 }
 
-static void record_if_local_object(const struct object_id *oid)
+static void record_outgoing_link(const struct object_id *oid)
 {
-	struct object_info info = OBJECT_INFO_INIT;
-	if (oid_object_info_extended(the_repository, oid, &info, 0))
-		/* Missing; assume it is a promisor object */
-		return;
-	if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
-		return;
-	oidset_insert(&local_links, oid);
+	oidset_insert(&outgoing_links, oid);
 }
 
-static void do_record_local_links(struct object *obj)
+static void do_record_outgoing_links(struct object *obj)
 {
 	if (obj->type == OBJ_TREE) {
 		struct tree *tree = (struct tree *)obj;
@@ -837,16 +831,16 @@  static void do_record_local_links(struct object *obj)
 			 */
 			return;
 		while (tree_entry_gently(&desc, &entry))
-			record_if_local_object(&entry.oid);
+			record_outgoing_link(&entry.oid);
 	} else if (obj->type == OBJ_COMMIT) {
 		struct commit *commit = (struct commit *) obj;
 		struct commit_list *parents = commit->parents;
 
 		for (; parents; parents = parents->next)
-			record_if_local_object(&parents->item->object.oid);
+			record_outgoing_link(&parents->item->object.oid);
 	} else if (obj->type == OBJ_TAG) {
 		struct tag *tag = (struct tag *) obj;
-		record_if_local_object(get_tagged_oid(tag));
+		record_outgoing_link(get_tagged_oid(tag));
 	}
 }
 
@@ -896,7 +890,7 @@  static void sha1_object(const void *data, struct object_entry *obj_entry,
 		free(has_data);
 	}
 
-	if (strict || do_fsck_object || record_local_links) {
+	if (strict || do_fsck_object || record_outgoing_links) {
 		read_lock();
 		if (type == OBJ_BLOB) {
 			struct blob *blob = lookup_blob(the_repository, oid);
@@ -928,8 +922,8 @@  static void sha1_object(const void *data, struct object_entry *obj_entry,
 				die(_("fsck error in packed object"));
 			if (strict && fsck_walk(obj, NULL, &fsck_options))
 				die(_("Not all child objects of %s are reachable"), oid_to_hex(&obj->oid));
-			if (record_local_links)
-				do_record_local_links(obj);
+			if (record_outgoing_links)
+				do_record_outgoing_links(obj);
 
 			if (obj->type == OBJ_TREE) {
 				struct tree *item = (struct tree *) obj;
@@ -1781,26 +1775,41 @@  static void repack_local_links(void)
 	struct object_id *oid;
 	char *base_name;
 
-	if (!oidset_size(&local_links))
+	if (!oidset_size(&outgoing_links))
 		return;
 
-	base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
+	oidset_iter_init(&outgoing_links, &iter);
+	while ((oid = oidset_iter_next(&iter))) {
+		struct object_info info = OBJECT_INFO_INIT;
+		if (oid_object_info_extended(the_repository, oid, &info, 0))
+			/* Missing; assume it is a promisor object */
+			continue;
+		if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
+			continue;
 
-	strvec_push(&cmd.args, "pack-objects");
-	strvec_push(&cmd.args, "--exclude-promisor-objects-best-effort");
-	strvec_push(&cmd.args, base_name);
-	cmd.git_cmd = 1;
-	cmd.in = -1;
-	cmd.out = -1;
-	if (start_command(&cmd))
-		die(_("could not start pack-objects to repack local links"));
+		if (!cmd.args.nr) {
+			base_name = mkpathdup(
+				"%s/pack/pack",
+				repo_get_object_directory(the_repository));
+			strvec_push(&cmd.args, "pack-objects");
+			strvec_push(&cmd.args,
+				    "--exclude-promisor-objects-best-effort");
+			strvec_push(&cmd.args, base_name);
+			cmd.git_cmd = 1;
+			cmd.in = -1;
+			cmd.out = -1;
+			if (start_command(&cmd))
+				die(_("could not start pack-objects to repack local links"));
+		}
 
-	oidset_iter_init(&local_links, &iter);
-	while ((oid = oidset_iter_next(&iter))) {
 		if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
 		    write_in_full(cmd.in, "\n", 1) < 0)
 			die(_("failed to feed local object to pack-objects"));
 	}
+
+	if (!cmd.args.nr)
+		return;
+
 	close(cmd.in);
 
 	out = xfdopen(cmd.out, "r");
@@ -1899,7 +1908,7 @@  int cmd_index_pack(int argc,
 			} else if (skip_to_optional_arg(arg, "--keep", &keep_msg)) {
 				; /* nothing to do */
 			} else if (skip_to_optional_arg(arg, "--promisor", &promisor_msg)) {
-				record_local_links = 1;
+				record_outgoing_links = 1;
 			} else if (starts_with(arg, "--threads=")) {
 				char *end;
 				nr_threads = strtoul(arg+10, &end, 0);