diff mbox series

[v3,4/4] revision: avoid hitting packfiles when commits are in commit-graph

Message ID f6fc2a5e6d94befa915fb59b6296ce3153820c13.1627896460.git.ps@pks.im (mailing list archive)
State Superseded
Headers show
Series Speed up connectivity checks | expand

Commit Message

Patrick Steinhardt Aug. 2, 2021, 9:38 a.m. UTC
When queueing references in git-rev-list(1), we try to either reuse an
already parsed object or alternatively we load the object header from
disk in order to determine its type. This is inefficient for commits
though in cases where we have a commit graph available: instead of
hitting the real object on disk to determine its type, we may instead
search the object graph for the object ID. In case it's found, we can
directly fill in the commit object, otherwise we can still hit the disk
to determine the object's type.

Expose a new function `find_object_in_graph()`, which fills in an object
of unknown type in case we find its object ID in the graph. This
provides a big performance win in cases where there is a commit-graph
available in the repository in case we load lots of references. The
following has been executed in a real-world repository with about 2.2
million refs:

    Benchmark #1: HEAD~: rev-list --unsorted-input --objects --quiet --not --all --not $newrev
      Time (mean ± σ):      4.465 s ±  0.037 s    [User: 4.144 s, System: 0.320 s]
      Range (min … max):    4.411 s …  4.514 s    10 runs

    Benchmark #2: HEAD: rev-list --unsorted-input --objects --quiet --not --all --not $newrev
      Time (mean ± σ):      2.886 s ±  0.032 s    [User: 2.570 s, System: 0.316 s]
      Range (min … max):    2.826 s …  2.933 s    10 runs

    Summary
      'HEAD: rev-list --unsorted-input --objects --quiet --not --all --not $newrev' ran
        1.55 ± 0.02 times faster than 'HEAD~: rev-list --unsorted-input --objects --quiet --not --all --not $newrev'

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 commit-graph.c | 55 +++++++++++++++++++++++++++++++++++++++-----------
 commit-graph.h |  2 ++
 revision.c     | 10 +++++----
 3 files changed, 51 insertions(+), 16 deletions(-)

Comments

Junio C Hamano Aug. 2, 2021, 8:01 p.m. UTC | #1
Patrick Steinhardt <ps@pks.im> writes:

> diff --git a/commit-graph.c b/commit-graph.c
> index 3860a0d847..a81d5cebc0 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -864,6 +864,48 @@ static int fill_commit_in_graph(struct repository *r,
>  	return 1;
>  }


Describe return-value here.  0 for not-found, !0 for found?

> +static int find_object_id_in_graph(const struct object_id *id, struct commit_graph *g, uint32_t *pos)
> +{
> +	struct commit_graph *cur_g = g;
> +	uint32_t lex_index;
> +
> +	while (cur_g && !bsearch_graph(cur_g, (struct object_id *)id, &lex_index))
> +		cur_g = cur_g->base_graph;
> +
> +	if (cur_g) {
> +		*pos = lex_index + cur_g->num_commits_in_base;
> +		return 1;
> +	}
> +
> +	return 0;
> +}

Likewise, or as this is public, perhaps in commit-graph.h next to
its declaration.

> +int find_object_in_graph(struct repository *repo, struct object *object)
> +{
> +	struct commit *commit;
> +	uint32_t pos;
> +
> +	if (object->parsed) {
> +		if (object->type != OBJ_COMMIT)
> +			return -1;
> +		return 0;

This is puzzling---at least it is not consistent with what the
function name says ("please say if you find _this_ object in the
commit-graph file"---if that is not what this function does, it
needs a comment before the implementation).

The caller had object and we has already been parsed.  If the
function were "with help from commit-graph, please tell me if you
can positively say this is a commit", the above is understandable.
If we know positively that it is not commit, we say "no, it is not a
commit" (which may be suboptimal---if the caller falls back to
another codepath, the object will still not be a commit) and if we
know it is a commit, we can say "yes, it definitely is a commit" and
the caller can stop there.

I guess my only problem with this function is that its name and what
it does does not align.  If the caller uses it for the real purpose
of the function I guessed, then the logic itself may be OK.

> +	}
> +
> +	if (!repo->objects->commit_graph)
> +		return -1;

There is no commit-graph, then we decline to make a decision, which
makes sense.

> +	if (!find_object_id_in_graph(&object->oid, repo->objects->commit_graph, &pos))
> +		return -1;

If it does not exist in the graph, we cannot tell, either.

> +	commit = object_as_type(object, OBJ_COMMIT, 1);
> +	if (!commit)
> +		return -1;
> +	if (!fill_commit_in_graph(repo, commit, repo->objects->commit_graph, pos))
> +		return -1;
> +
> +	return 0;
> +}
> +
>  static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uint32_t *pos)
>  {
>  	uint32_t graph_pos = commit_graph_position(item);
> @@ -871,18 +913,7 @@ static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uin
>  		*pos = graph_pos;
>  		return 1;
>  	} else {
> -		struct commit_graph *cur_g = g;
> -		uint32_t lex_index;
> -
> -		while (cur_g && !bsearch_graph(cur_g, &(item->object.oid), &lex_index))
> -			cur_g = cur_g->base_graph;
> -
> -		if (cur_g) {
> -			*pos = lex_index + cur_g->num_commits_in_base;
> -			return 1;
> -		}
> -
> -		return 0;
> +		return find_object_id_in_graph(&item->object.oid, g, pos);

And I think this one is a op-op refactoring that does not change the
behaviour of find_commit_in_graph()?  It might be easier if done in
a separate preparatory step, but it is small enough.

> diff --git a/commit-graph.h b/commit-graph.h
> index 96c24fb577..f373fab4c0 100644
> --- a/commit-graph.h
> +++ b/commit-graph.h
> @@ -139,6 +139,8 @@ int write_commit_graph(struct object_directory *odb,
>  		       enum commit_graph_write_flags flags,
>  		       const struct commit_graph_opts *opts);
>  
> +int find_object_in_graph(struct repository *repo, struct object *object);
> +
>  #define COMMIT_GRAPH_VERIFY_SHALLOW	(1 << 0)
>  
>  int verify_commit_graph(struct repository *r, struct commit_graph *g, int flags);
> diff --git a/revision.c b/revision.c
> index 671b6d6513..c3f9cf2998 100644
> --- a/revision.c
> +++ b/revision.c
> @@ -362,10 +362,12 @@ static struct object *get_reference(struct rev_info *revs, const char *name,
>  	struct object *object = lookup_unknown_object(revs->repo, oid);
>  
>  	if (object->type == OBJ_NONE) {
> -		int type = oid_object_info(revs->repo, oid, NULL);
> -		if (type < 0 || !object_as_type(object, type, 1)) {
> -			object = NULL;
> -			goto out;
> +		if (find_object_in_graph(revs->repo, object) < 0) {
> +			int type = oid_object_info(revs->repo, oid, NULL);
> +			if (type < 0 || !object_as_type(object, type, 1)) {
> +				object = NULL;
> +				goto out;
> +			}
>  		}
>  	}
Patrick Steinhardt Aug. 3, 2021, 9:16 a.m. UTC | #2
On Mon, Aug 02, 2021 at 01:01:03PM -0700, Junio C Hamano wrote:
> Patrick Steinhardt <ps@pks.im> writes:
[snip]
> > +int find_object_in_graph(struct repository *repo, struct object *object)
> > +{
> > +	struct commit *commit;
> > +	uint32_t pos;
> > +
> > +	if (object->parsed) {
> > +		if (object->type != OBJ_COMMIT)
> > +			return -1;
> > +		return 0;
> 
> This is puzzling---at least it is not consistent with what the
> function name says ("please say if you find _this_ object in the
> commit-graph file"---if that is not what this function does, it
> needs a comment before the implementation).
> 
> The caller had object and we has already been parsed.  If the
> function were "with help from commit-graph, please tell me if you
> can positively say this is a commit", the above is understandable.
> If we know positively that it is not commit, we say "no, it is not a
> commit" (which may be suboptimal---if the caller falls back to
> another codepath, the object will still not be a commit) and if we
> know it is a commit, we can say "yes, it definitely is a commit" and
> the caller can stop there.
> 
> I guess my only problem with this function is that its name and what
> it does does not align.  If the caller uses it for the real purpose
> of the function I guessed, then the logic itself may be OK.

Fair point. The only caller for now only calls the function if the
object's type is unknown, so it really is "Resolve the commit if it is
one". I'll adjust the function's name.

> >  static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uint32_t *pos)
> >  {
> >  	uint32_t graph_pos = commit_graph_position(item);
> > @@ -871,18 +913,7 @@ static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uin
> >  		*pos = graph_pos;
> >  		return 1;
> >  	} else {
> > -		struct commit_graph *cur_g = g;
> > -		uint32_t lex_index;
> > -
> > -		while (cur_g && !bsearch_graph(cur_g, &(item->object.oid), &lex_index))
> > -			cur_g = cur_g->base_graph;
> > -
> > -		if (cur_g) {
> > -			*pos = lex_index + cur_g->num_commits_in_base;
> > -			return 1;
> > -		}
> > -
> > -		return 0;
> > +		return find_object_id_in_graph(&item->object.oid, g, pos);
> 
> And I think this one is a op-op refactoring that does not change the
> behaviour of find_commit_in_graph()?  It might be easier if done in
> a separate preparatory step, but it is small enough.

Will do.

One thing that occurred to me this morning after waking up is that this
commit changes semantics: if we're able to look up the commit via the
commit-graph, then we'll happily consider it to exist in the repository.
But given that we don't hit the object database at all anymore, it may
be that the commit-graph was out of date while the commit got
unreachable and thus pruned. So it may not even exist anymore in the
repository.

I wonder what our stance on this is. I can definitely understand the
angle that this would be a deal breaker given that we now claim commits
exist which don't anymore. On the other hand, we update commit-graphs
via git-gc(1), which makes this scenario a lot less likely nowadays. Is
there any precedent in our codebase where we treat commits part of the
commit-graph as existing? If not, do we want to make that assumption?

Patrick
Junio C Hamano Aug. 3, 2021, 9:56 p.m. UTC | #3
Patrick Steinhardt <ps@pks.im> writes:

> I wonder what our stance on this is. I can definitely understand the
> angle that this would be a deal breaker given that we now claim commits
> exist which don't anymore.

An optimization that produces a wrong result very fast is a useless
optimization that has no place in our codebase.  But don't we have
some clue recorded in the commit graph file that tells us with what
packfile the graph is to be used (iow, if the named packfile still
exists there, the objects recorded in the graph file are to be found
there) or something?
Ævar Arnfjörð Bjarmason Aug. 4, 2021, 10:51 a.m. UTC | #4
On Tue, Aug 03 2021, Patrick Steinhardt wrote:

> I wonder what our stance on this is. I can definitely understand the
> angle that this would be a deal breaker given that we now claim commits
> exist which don't anymore. On the other hand, we update commit-graphs
> via git-gc(1), which makes this scenario a lot less likely nowadays. Is
> there any precedent in our codebase where we treat commits part of the
> commit-graph as existing? If not, do we want to make that assumption?

I don't think there is, but don't see why given the performance benefits
it should not at least be exposed in this form for those that think they
know what they're doing.

But right now the way we write the commit graph seems guaranteed to
produce races in this area, i.e. we expire objects first, and then we
write a new graph (see cmd_gc()). I.e. something like:

 1. (Re)pack reachable objects
 2. Prune unrechable and old
 3. Write a new commit graph using discovered tips

I don't see a good reason other than just that this needs some
refactoring to not instead do:

 1. Discover reachable refs
 2. (Re)pack reachable objects from those refs
 3. Write a new commit graph using those ref tips, i.e. don't re-run
    for_each_ref() in write_commit_graph_reachable() but call
    write_commit_graph() with those OIDs directly.
 4. Prune unreachable and old

Which I think would nicely get around this particular race condition,
which also doesn't exist if you call "git commit-graph write", it's just
if we write a new graph and then expire objects without being aware that
the graph needs updating.
Patrick Steinhardt Aug. 5, 2021, 11:01 a.m. UTC | #5
On Tue, Aug 03, 2021 at 02:56:49PM -0700, Junio C Hamano wrote:
> Patrick Steinhardt <ps@pks.im> writes:
> 
> > I wonder what our stance on this is. I can definitely understand the
> > angle that this would be a deal breaker given that we now claim commits
> > exist which don't anymore.
> 
> An optimization that produces a wrong result very fast is a useless
> optimization that has no place in our codebase.  But don't we have
> some clue recorded in the commit graph file that tells us with what
> packfile the graph is to be used (iow, if the named packfile still
> exists there, the objects recorded in the graph file are to be found
> there) or something?

Unfortunately, no. For bitmaps we have this information given that a
bitmap is tied to a specific pack anyway. But for commit-graphs, the
story is different given that they don't really care about the packs per
se, but only about the commits.

I was briefly wondering whether we can somehow use generation numbers to
cut off parsing some commits: given we have already observed a commit
with generation number N and we have determined that this commit's
object exists, and we now see a commit with generation number M with
M<N, then we can skip the object lookup because M is reachable by N and
thus its object must exist. But generation numbers cannot determine
reachability, but only unreachability, so I fear this is not possible.

We can do the following on top though:

diff --git a/revision.c b/revision.c
index 3527ef3f65..9e62de20ab 100644
--- a/revision.c
+++ b/revision.c
@@ -368,6 +368,8 @@ static struct object *get_reference(struct rev_info *revs, const char *name,
 				object = NULL;
 				goto out;
 			}
+		} else if (!repo_has_object_file(revs->repo, oid)) {
+			die("bad object %s", name);
 		}
 	}

We assert that the object exists, but `repo_has_object_file()` won't try
to unpack the object header given that we request no info about the
object. And because the object ID has been part of the commit-graph, we
know that it's a commit. It's a bit slower compared to the version where
we don't assert object existence, but still a lot faster compared to
looking up the object type via the ODB:

    Benchmark #1: HEAD~: rev-list --unsorted-input --objects --quiet --not --all --not $(cat newrev)
      Time (mean ± σ):      4.512 s ±  0.057 s    [User: 4.131 s, System: 0.381 s]
      Range (min … max):    4.435 s …  4.632 s    10 runs

    Benchmark #2: without-existence rev-list --unsorted-input --objects --quiet --not --all --not $(cat newrev)
      Time (mean ± σ):      2.903 s ±  0.022 s    [User: 2.533 s, System: 0.369 s]
      Range (min … max):    2.878 s …  2.954 s    10 runs

    Benchmark #3: with-existence: rev-list --unsorted-input --objects --quiet --not --all --not $(cat newrev)
      Time (mean ± σ):      3.071 s ±  0.014 s    [User: 2.712 s, System: 0.358 s]
      Range (min … max):    3.050 s …  3.088 s    10 runs

    Summary
      'without-existence: rev-list --unsorted-input --objects --quiet --not --all --not $(cat newrev)' ran
        1.06 ± 0.01 times faster than 'with-existance: rev-list --unsorted-input --objects --quiet --not --all --not $(cat newrev)'
        1.55 ± 0.02 times faster than 'HEAD~: rev-list --unsorted-input --objects --quiet --not --all --not $(cat newrev)'

Patrick
Junio C Hamano Aug. 5, 2021, 4:16 p.m. UTC | #6
Patrick Steinhardt <ps@pks.im> writes:

> On Tue, Aug 03, 2021 at 02:56:49PM -0700, Junio C Hamano wrote:
>> Patrick Steinhardt <ps@pks.im> writes:
>> 
>> > I wonder what our stance on this is. I can definitely understand the
>> > angle that this would be a deal breaker given that we now claim commits
>> > exist which don't anymore.
>> 
>> An optimization that produces a wrong result very fast is a useless
>> optimization that has no place in our codebase.  But don't we have
>> some clue recorded in the commit graph file that tells us with what
>> packfile the graph is to be used (iow, if the named packfile still
>> exists there, the objects recorded in the graph file are to be found
>> there) or something?
>
> Unfortunately, no. For bitmaps we have this information given that a
> bitmap is tied to a specific pack anyway. But for commit-graphs, the
> story is different given that they don't really care about the packs per
> se, but only about the commits.

[jc: refreshed Cc: list to limit to those in "shortlog commit-graph.[ch]"]

On this subject, I'd ask those who have worked on the commit-graph
for ideas.  It would be a glaring flaw _if_ the data structure that
is designed to be a "cache of precomputed summary that would help
runtime performance" has no way to detect out-of-date cache and/or
to invalidate when it goes stale, but I somehow doubt that is the
case, given the caliber of folks who have worked in it.  To me, it
feels a lot more likely that we may be missing an existing mechanism
to do so.  It could be that ...

> We can do the following on top though:
>
> diff --git a/revision.c b/revision.c
> index 3527ef3f65..9e62de20ab 100644
> --- a/revision.c
> +++ b/revision.c
> @@ -368,6 +368,8 @@ static struct object *get_reference(struct rev_info *revs, const char *name,
>  				object = NULL;
>  				goto out;
>  			}
> +		} else if (!repo_has_object_file(revs->repo, oid)) {
> +			die("bad object %s", name);
>  		}
>  	}
>
> We assert that the object exists, but `repo_has_object_file()` won't try
> to unpack the object header given that we request no info about the
> object. And because the object ID has been part of the commit-graph, we
> know that it's a commit. It's a bit slower compared to the version where
> we don't assert object existence, but still a lot faster compared to
> looking up the object type via the ODB.

... the above is the designed way to correctly use the commit-graph
data?  That is, you find an object in the commit-graph, and you make
sure the object exists in the object store in some other means
(because there is no mechanism for commit-graph to prevent a gc from
pruning an object recorded in it away) before you consider you can
use the object.

Thoughts and help?

Thanks.
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index 3860a0d847..a81d5cebc0 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -864,6 +864,48 @@  static int fill_commit_in_graph(struct repository *r,
 	return 1;
 }
 
+static int find_object_id_in_graph(const struct object_id *id, struct commit_graph *g, uint32_t *pos)
+{
+	struct commit_graph *cur_g = g;
+	uint32_t lex_index;
+
+	while (cur_g && !bsearch_graph(cur_g, (struct object_id *)id, &lex_index))
+		cur_g = cur_g->base_graph;
+
+	if (cur_g) {
+		*pos = lex_index + cur_g->num_commits_in_base;
+		return 1;
+	}
+
+	return 0;
+}
+
+int find_object_in_graph(struct repository *repo, struct object *object)
+{
+	struct commit *commit;
+	uint32_t pos;
+
+	if (object->parsed) {
+		if (object->type != OBJ_COMMIT)
+			return -1;
+		return 0;
+	}
+
+	if (!repo->objects->commit_graph)
+		return -1;
+
+	if (!find_object_id_in_graph(&object->oid, repo->objects->commit_graph, &pos))
+		return -1;
+
+	commit = object_as_type(object, OBJ_COMMIT, 1);
+	if (!commit)
+		return -1;
+	if (!fill_commit_in_graph(repo, commit, repo->objects->commit_graph, pos))
+		return -1;
+
+	return 0;
+}
+
 static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uint32_t *pos)
 {
 	uint32_t graph_pos = commit_graph_position(item);
@@ -871,18 +913,7 @@  static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uin
 		*pos = graph_pos;
 		return 1;
 	} else {
-		struct commit_graph *cur_g = g;
-		uint32_t lex_index;
-
-		while (cur_g && !bsearch_graph(cur_g, &(item->object.oid), &lex_index))
-			cur_g = cur_g->base_graph;
-
-		if (cur_g) {
-			*pos = lex_index + cur_g->num_commits_in_base;
-			return 1;
-		}
-
-		return 0;
+		return find_object_id_in_graph(&item->object.oid, g, pos);
 	}
 }
 
diff --git a/commit-graph.h b/commit-graph.h
index 96c24fb577..f373fab4c0 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -139,6 +139,8 @@  int write_commit_graph(struct object_directory *odb,
 		       enum commit_graph_write_flags flags,
 		       const struct commit_graph_opts *opts);
 
+int find_object_in_graph(struct repository *repo, struct object *object);
+
 #define COMMIT_GRAPH_VERIFY_SHALLOW	(1 << 0)
 
 int verify_commit_graph(struct repository *r, struct commit_graph *g, int flags);
diff --git a/revision.c b/revision.c
index 671b6d6513..c3f9cf2998 100644
--- a/revision.c
+++ b/revision.c
@@ -362,10 +362,12 @@  static struct object *get_reference(struct rev_info *revs, const char *name,
 	struct object *object = lookup_unknown_object(revs->repo, oid);
 
 	if (object->type == OBJ_NONE) {
-		int type = oid_object_info(revs->repo, oid, NULL);
-		if (type < 0 || !object_as_type(object, type, 1)) {
-			object = NULL;
-			goto out;
+		if (find_object_in_graph(revs->repo, object) < 0) {
+			int type = oid_object_info(revs->repo, oid, NULL);
+			if (type < 0 || !object_as_type(object, type, 1)) {
+				object = NULL;
+				goto out;
+			}
 		}
 	}