diff mbox series

[7/9] object-store: provide helpers for loose_objects_cache

Message ID 20181112145056.GG7400@sigill.intra.peff.net (mailing list archive)
State New, archived
Headers show
Series caching loose objects | expand

Commit Message

Jeff King Nov. 12, 2018, 2:50 p.m. UTC
Our object_directory struct has a loose objects cache that all users of
the struct can see. But the only one that knows how to load the cache is
find_short_object_filename(). Let's extract that logic into a reusable
function.

While we're at it, let's also reset the cache when we re-read the object
directories. This shouldn't have an impact on performance, as re-reads
are meant to be rare (and are already expensive, so we avoid them with
things like OBJECT_INFO_QUICK).

Since the cache is already meant to be an approximation, it's tempting
to skip even this bit of safety. But it's necessary to allow more code
to use it. For instance, fetch-pack explicitly re-reads the object
directory after performing its fetch, and would be confused if we didn't
clear the cache.

Signed-off-by: Jeff King <peff@peff.net>
---
 object-store.h | 18 +++++++++++++-----
 packfile.c     |  8 ++++++++
 sha1-file.c    | 26 ++++++++++++++++++++++++++
 sha1-name.c    | 21 +--------------------
 4 files changed, 48 insertions(+), 25 deletions(-)

Comments

René Scharfe Nov. 12, 2018, 7:24 p.m. UTC | #1
Am 12.11.2018 um 15:50 schrieb Jeff King:
> --- a/sha1-file.c
> +++ b/sha1-file.c
> @@ -2125,6 +2125,32 @@ int for_each_loose_object(each_loose_object_fn cb, void *data,
>  	return 0;
>  }
>  
> +static int append_loose_object(const struct object_id *oid, const char *path,
> +			       void *data)
> +{
> +	oid_array_append(data, oid);
> +	return 0;
> +}
> +
> +void odb_load_loose_cache(struct object_directory *odb, int subdir_nr)
> +{
> +	struct strbuf buf = STRBUF_INIT;
> +
> +	if (subdir_nr < 0 ||

Why not make subdir_nr unsigned (like in for_each_file_in_obj_subdir()), and
get rid of this first check?

> +	    subdir_nr >= ARRAY_SIZE(odb->loose_objects_subdir_seen))

Using unsigned char for subdir_nr would allow removing the second check as
well, but might hide invalid values in implicit conversions, I guess.

> +		BUG("subdir_nr out of range");

Showing the invalid value (like in for_each_file_in_obj_subdir()) would make
debugging easier in case the impossible actually happens.

> +
> +	if (odb->loose_objects_subdir_seen[subdir_nr])
> +		return;
> +
> +	strbuf_addstr(&buf, odb->path);
> +	for_each_file_in_obj_subdir(subdir_nr, &buf,
> +				    append_loose_object,
> +				    NULL, NULL,
> +				    &odb->loose_objects_cache);
> +	odb->loose_objects_subdir_seen[subdir_nr] = 1;

About here would be the ideal new home for ...

> +}
> +
>  static int check_stream_sha1(git_zstream *stream,
>  			     const char *hdr,
>  			     unsigned long size,
> diff --git a/sha1-name.c b/sha1-name.c
> index 358ca5e288..b24502811b 100644
> --- a/sha1-name.c
> +++ b/sha1-name.c
> @@ -83,36 +83,19 @@ static void update_candidates(struct disambiguate_state *ds, const struct object
>  	/* otherwise, current can be discarded and candidate is still good */
>  }
>  
> -static int append_loose_object(const struct object_id *oid, const char *path,
> -			       void *data)
> -{
> -	oid_array_append(data, oid);
> -	return 0;
> -}
> -
>  static int match_sha(unsigned, const unsigned char *, const unsigned char *);
>  
>  static void find_short_object_filename(struct disambiguate_state *ds)
>  {
>  	int subdir_nr = ds->bin_pfx.hash[0];
>  	struct object_directory *odb;
> -	struct strbuf buf = STRBUF_INIT;
>  
>  	for (odb = the_repository->objects->odb;
>  	     odb && !ds->ambiguous;
>  	     odb = odb->next) {
>  		int pos;
>  
> -		if (!odb->loose_objects_subdir_seen[subdir_nr]) {
> -			strbuf_reset(&buf);
> -			strbuf_addstr(&buf, odb->path);
> -			for_each_file_in_obj_subdir(subdir_nr, &buf,
> -						    append_loose_object,
> -						    NULL, NULL,
> -						    &odb->loose_objects_cache);
> -			odb->loose_objects_subdir_seen[subdir_nr] = 1;
> -		}
> -
> +		odb_load_loose_cache(odb, subdir_nr);
>  		pos = oid_array_lookup(&odb->loose_objects_cache, &ds->bin_pfx);
>  		if (pos < 0)
>  			pos = -1 - pos;
> @@ -125,8 +108,6 @@ static void find_short_object_filename(struct disambiguate_state *ds)
>  			pos++;
>  		}
>  	}
> -
> -	strbuf_release(&buf);

... this line.

>  }
>  
>  static int match_sha(unsigned len, const unsigned char *a, const unsigned char *b)
>
Jeff King Nov. 12, 2018, 8:16 p.m. UTC | #2
On Mon, Nov 12, 2018 at 08:24:59PM +0100, René Scharfe wrote:

> > +void odb_load_loose_cache(struct object_directory *odb, int subdir_nr)
> > +{
> > +	struct strbuf buf = STRBUF_INIT;
> > +
> > +	if (subdir_nr < 0 ||
> 
> Why not make subdir_nr unsigned (like in for_each_file_in_obj_subdir()), and
> get rid of this first check?

I stole the use of "int" from your code. ;)

More seriously, though, I wondered if callers might have sign issues
assigning from a "signed char". Usually we hold object ids in an
"unsigned char", but what happens if I do:

  signed char foo[] = { 1, 2, 3, 4 };
  odb_load_loose_cache(odb, foo[0]);

when the parameter is "unsigned"?

I'll admit I get lost in all of the integer promotion rules there, but
are we sure there's no way we can end up with a funky truncation?

If the answer is no, then I agree that your suggestion is a strict
improvement.

> > +	    subdir_nr >= ARRAY_SIZE(odb->loose_objects_subdir_seen))
> 
> Using unsigned char for subdir_nr would allow removing the second check as
> well, but might hide invalid values in implicit conversions, I guess.

Yeah, I know that one could be a dangerous truncation.

I also considered just taking an object_id, which would make the
function "load the cache such that this oid would be valid". And it's
not necessarily the caller's business how much we load.

That's OK for OBJECT_INFO_QUICK, but it's pretty darn subtle for the
abbrev code. That code doesn't care about just one object, but wants all
objects that share its prefix. That works now because we know that the
prefix is always at least 2 hex chars, so it's OK to load just that
subset.

> > +		BUG("subdir_nr out of range");
> 
> Showing the invalid value (like in for_each_file_in_obj_subdir()) would make
> debugging easier in case the impossible actually happens.

Good suggestion.

> > +	strbuf_addstr(&buf, odb->path);
> > +	for_each_file_in_obj_subdir(subdir_nr, &buf,
> > +				    append_loose_object,
> > +				    NULL, NULL,
> > +				    &odb->loose_objects_cache);
> > +	odb->loose_objects_subdir_seen[subdir_nr] = 1;
> 
> About here would be the ideal new home for ...
> [...]
> > -
> > -	strbuf_release(&buf);
> 
> ... this line.

Oops, thanks. I toyed with making the strbuf here static, which is why I
dropped the release. But since we only use it on a cache miss, I decided
it was better to avoid the hidden global (and then of course forgot to
re-add the release).

-Peff
diff mbox series

Patch

diff --git a/object-store.h b/object-store.h
index 30faf7b391..bf1e0cb761 100644
--- a/object-store.h
+++ b/object-store.h
@@ -11,11 +11,12 @@  struct object_directory {
 	struct object_directory *next;
 
 	/*
-	 * Used to store the results of readdir(3) calls when searching
-	 * for unique abbreviated hashes.  This cache is never
-	 * invalidated, thus it's racy and not necessarily accurate.
-	 * That's fine for its purpose; don't use it for tasks requiring
-	 * greater accuracy!
+	 * Used to store the results of readdir(3) calls when we are OK
+	 * sacrificing accuracy due to races for speed. That includes
+	 * our search for unique abbreviated hashes. Don't use it for tasks
+	 * requiring greater accuracy!
+	 *
+	 * Be sure to call odb_load_loose_cache() before using.
 	 */
 	char loose_objects_subdir_seen[256];
 	struct oid_array loose_objects_cache;
@@ -45,6 +46,13 @@  void add_to_alternates_file(const char *dir);
  */
 void add_to_alternates_memory(const char *dir);
 
+/*
+ * Populate an odb's loose object cache for one particular subdirectory (i.e.,
+ * the one that corresponds to the first byte of objects you're interested in,
+ * from 0 to 255 inclusive).
+ */
+void odb_load_loose_cache(struct object_directory *odb, int subdir_nr);
+
 struct packed_git {
 	struct packed_git *next;
 	struct list_head mru;
diff --git a/packfile.c b/packfile.c
index 1eda33247f..91fd40efb0 100644
--- a/packfile.c
+++ b/packfile.c
@@ -987,6 +987,14 @@  static void prepare_packed_git(struct repository *r)
 
 void reprepare_packed_git(struct repository *r)
 {
+	struct object_directory *odb;
+
+	for (odb = r->objects->odb; odb; odb = odb->next) {
+		oid_array_clear(&odb->loose_objects_cache);
+		memset(&odb->loose_objects_subdir_seen, 0,
+		       sizeof(odb->loose_objects_subdir_seen));
+	}
+
 	r->objects->approximate_object_count_valid = 0;
 	r->objects->packed_git_initialized = 0;
 	prepare_packed_git(r);
diff --git a/sha1-file.c b/sha1-file.c
index 503262edd2..4aae716a37 100644
--- a/sha1-file.c
+++ b/sha1-file.c
@@ -2125,6 +2125,32 @@  int for_each_loose_object(each_loose_object_fn cb, void *data,
 	return 0;
 }
 
+static int append_loose_object(const struct object_id *oid, const char *path,
+			       void *data)
+{
+	oid_array_append(data, oid);
+	return 0;
+}
+
+void odb_load_loose_cache(struct object_directory *odb, int subdir_nr)
+{
+	struct strbuf buf = STRBUF_INIT;
+
+	if (subdir_nr < 0 ||
+	    subdir_nr >= ARRAY_SIZE(odb->loose_objects_subdir_seen))
+		BUG("subdir_nr out of range");
+
+	if (odb->loose_objects_subdir_seen[subdir_nr])
+		return;
+
+	strbuf_addstr(&buf, odb->path);
+	for_each_file_in_obj_subdir(subdir_nr, &buf,
+				    append_loose_object,
+				    NULL, NULL,
+				    &odb->loose_objects_cache);
+	odb->loose_objects_subdir_seen[subdir_nr] = 1;
+}
+
 static int check_stream_sha1(git_zstream *stream,
 			     const char *hdr,
 			     unsigned long size,
diff --git a/sha1-name.c b/sha1-name.c
index 358ca5e288..b24502811b 100644
--- a/sha1-name.c
+++ b/sha1-name.c
@@ -83,36 +83,19 @@  static void update_candidates(struct disambiguate_state *ds, const struct object
 	/* otherwise, current can be discarded and candidate is still good */
 }
 
-static int append_loose_object(const struct object_id *oid, const char *path,
-			       void *data)
-{
-	oid_array_append(data, oid);
-	return 0;
-}
-
 static int match_sha(unsigned, const unsigned char *, const unsigned char *);
 
 static void find_short_object_filename(struct disambiguate_state *ds)
 {
 	int subdir_nr = ds->bin_pfx.hash[0];
 	struct object_directory *odb;
-	struct strbuf buf = STRBUF_INIT;
 
 	for (odb = the_repository->objects->odb;
 	     odb && !ds->ambiguous;
 	     odb = odb->next) {
 		int pos;
 
-		if (!odb->loose_objects_subdir_seen[subdir_nr]) {
-			strbuf_reset(&buf);
-			strbuf_addstr(&buf, odb->path);
-			for_each_file_in_obj_subdir(subdir_nr, &buf,
-						    append_loose_object,
-						    NULL, NULL,
-						    &odb->loose_objects_cache);
-			odb->loose_objects_subdir_seen[subdir_nr] = 1;
-		}
-
+		odb_load_loose_cache(odb, subdir_nr);
 		pos = oid_array_lookup(&odb->loose_objects_cache, &ds->bin_pfx);
 		if (pos < 0)
 			pos = -1 - pos;
@@ -125,8 +108,6 @@  static void find_short_object_filename(struct disambiguate_state *ds)
 			pos++;
 		}
 	}
-
-	strbuf_release(&buf);
 }
 
 static int match_sha(unsigned len, const unsigned char *a, const unsigned char *b)