
[3/3] packfile: use oidset for bad objects

Message ID 14d48124-d8bb-aa34-aad0-4203d699e17e@web.de (mailing list archive)
State Superseded
Series packfile: use oidset for bad objects

Commit Message

René Scharfe Sept. 11, 2021, 8:01 a.m. UTC
Store the object ID of broken pack entries in an oidset instead of
keeping only their hashes in an unsorted array.  The resulting code is
shorter and easier to read.  It also handles the (hopefully) very rare
case of having a high number of bad objects better.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 midx.c         | 13 ++++---------
 object-store.h |  4 ++--
 packfile.c     | 27 +++++----------------------
 3 files changed, 11 insertions(+), 33 deletions(-)

--
2.33.0
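
For readers new to the API: oidset is Git's khash-backed set of object IDs
(see oidset.h). A minimal, illustrative sketch of the calls this series
relies on — oid and handle_bad_object() are placeholders, not part of the
patch:

	struct oidset bad = OIDSET_INIT;

	oidset_insert(&bad, oid);	/* no-op if oid is already present */
	if (oidset_contains(&bad, oid))
		handle_bad_object();	/* hypothetical handler */
	oidset_clear(&bad);		/* frees the backing storage */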

Comments

Jeff King Sept. 11, 2021, 2:26 p.m. UTC | #1
On Sat, Sep 11, 2021 at 10:01:40AM +0200, René Scharfe wrote:

> Store the object ID of broken pack entries in an oidset instead of
> keeping only their hashes in an unsorted array.  The resulting code is
> shorter and easier to read.  It also handles the (hopefully) very rare
> case of having a high number of bad objects better.

Yay, I'm very happy to see this kind of cleanup replacing ad hoc data
structures with well-tested ones.

> @@ -303,15 +304,9 @@ static int nth_midxed_pack_entry(struct repository *r,
>  	if (!is_pack_valid(p))
>  		return 0;
> 
> -	if (p->num_bad_objects) {
> -		uint32_t i;
> -		struct object_id oid;
> -		nth_midxed_object_oid(&oid, m, pos);
> -		for (i = 0; i < p->num_bad_objects; i++)
> -			if (hasheq(oid.hash,
> -				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
> -				return 0;
> -	}
> +	nth_midxed_object_oid(&oid, m, pos);
> +	if (oidset_contains(&p->bad_objects, &oid))
> +		return 0;

Calling nth_midxed_object_oid() implies a memcpy() under the hood. In
the old code, we'd skip that in the common case that we had no corrupt
objects, but now we'll pay the cost regardless. memcpy() isn't _that_
expensive, but I'd expect this to be a relatively hot code path.

Is it worth sticking all of this inside:

  if (oidset_size(&p->bad_objects))

?
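
I.e., roughly (an untested sketch):

	if (oidset_size(&p->bad_objects)) {
		struct object_id oid;

		nth_midxed_object_oid(&oid, m, pos);
		if (oidset_contains(&p->bad_objects, &oid))
			return 0;
	}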

> diff --git a/packfile.c b/packfile.c
> index 04080a558b..8f6d1d6328 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -1163,29 +1163,17 @@ int unpack_object_header(struct packed_git *p,
> 
>  void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
>  {
> -	unsigned i;
> -	const unsigned hashsz = the_hash_algo->rawsz;
> -	for (i = 0; i < p->num_bad_objects; i++)
> -		if (hasheq(oid->hash, p->bad_object_sha1 + hashsz * i))
> -			return;

I cringed at the hasheq() and hashcpy() calls in the earlier patches.
Happy to see them go away now. :)

> -	p->bad_object_sha1 = xrealloc(p->bad_object_sha1,
> -				      st_mult(GIT_MAX_RAWSZ,
> -					      st_add(p->num_bad_objects, 1)));
> -	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, oid->hash);
> -	p->num_bad_objects++;
> +	oidset_insert(&p->bad_objects, oid);
>  }

So now marking a bad object is a one-liner. We _could_ just inline it
at the callers, but I like keeping the implementation abstract.

>  const struct packed_git *has_packed_and_bad(struct repository *r,
>  					    const struct object_id *oid)
>  {
>  	struct packed_git *p;
> -	unsigned i;
> 
>  	for (p = r->objects->packed_git; p; p = p->next)
> -		for (i = 0; i < p->num_bad_objects; i++)
> -			if (hasheq(oid->hash,
> -				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
> -				return p;
> +		if (oidset_contains(&p->bad_objects, oid))
> +			return p;
>  	return NULL;
>  }

Not related to your patch, but I noticed how terribly inefficient this
function could be in a repo with a lot of packs. But we only call it
once in the error case right before we die(), so a linear scan is no
problem.

> @@ -2016,13 +2004,8 @@ static int fill_pack_entry(const struct object_id *oid,
>  {
>  	off_t offset;
> 
> -	if (p->num_bad_objects) {
> -		unsigned i;
> -		for (i = 0; i < p->num_bad_objects; i++)
> -			if (hasheq(oid->hash,
> -				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
> -				return 0;
> -	}
> +	if (oidset_contains(&p->bad_objects, oid))
> +		return 0;

And this caller (like the previous one) already has the oid, so it doesn't
have to worry about optimizing the is-it-empty check first.

-Peff
René Scharfe Sept. 11, 2021, 4:08 p.m. UTC | #2
Am 11.09.21 um 16:26 schrieb Jeff King:
> On Sat, Sep 11, 2021 at 10:01:40AM +0200, René Scharfe wrote:
>
>> Store the object ID of broken pack entries in an oidset instead of
>> keeping only their hashes in an unsorted array.  The resulting code is
>> shorter and easier to read.  It also handles the (hopefully) very rare
>> case of having a high number of bad objects better.
>
> Yay, I'm very happy to see this kind of cleanup replacing ad hoc data
> structures with well-tested ones.
>
>> @@ -303,15 +304,9 @@ static int nth_midxed_pack_entry(struct repository *r,
>>  	if (!is_pack_valid(p))
>>  		return 0;
>>
>> -	if (p->num_bad_objects) {
>> -		uint32_t i;
>> -		struct object_id oid;
>> -		nth_midxed_object_oid(&oid, m, pos);
>> -		for (i = 0; i < p->num_bad_objects; i++)
>> -			if (hasheq(oid.hash,
>> -				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
>> -				return 0;
>> -	}
>> +	nth_midxed_object_oid(&oid, m, pos);
>> +	if (oidset_contains(&p->bad_objects, &oid))
>> +		return 0;
>
> Calling nth_midxed_object_oid() implies a memcpy() under the hood. In
> the old code, we'd skip that in the common case that we had no corrupt
> objects, but now we'll pay the cost regardless. memcpy() isn't _that_
> expensive, but I'd expect this to be a relatively hot code path.
>
> Is it worth sticking all of this inside:
>
>   if (oidset_size(&p->bad_objects))
>
> ?

Hard to say.  It would certainly match the old code more closely.  Is a
function call cheaper than copying 32 bytes?  Depends on the CPU and
whether the hash is cached, I guess.  And cached it probably is, because
the caller did a binary search for it.

We can pass on the original oid to avoid the nth_midxed_object_oid()
call, but inlining the whole thing might even be nicer.
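
(For illustration, the pass-the-oid variant could look roughly like this,
assuming a hypothetical oid parameter threaded through from the caller,
fill_midx_entry(), which already has it:

	if (oidset_contains(&p->bad_objects, oid))
		return 0;

which would replace the nth_midxed_object_oid() call and the local copy.)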

René
Jeff King Sept. 11, 2021, 5:03 p.m. UTC | #3
On Sat, Sep 11, 2021 at 06:08:38PM +0200, René Scharfe wrote:

> >> +	nth_midxed_object_oid(&oid, m, pos);
> >> +	if (oidset_contains(&p->bad_objects, &oid))
> >> +		return 0;
> >
> > Calling nth_midxed_object_oid() implies a memcpy() under the hood. In
> > the old code, we'd skip that in the common case that we had no corrupt
> > objects, but now we'll pay the cost regardless. memcpy() isn't _that_
> > expensive, but I'd expect this to be a relatively hot code path.
> >
> > Is it worth sticking all of this inside:
> >
> >   if (oidset_size(&p->bad_objects))
> >
> > ?
> 
> Hard to say.  It would certainly match the old code more closely.  Is a
> function call cheaper than copying 32 bytes?  Depends on the CPU and
> whether the hash is cached, I guess.  And cached it probably is, because
> the caller did a binary search for it.

You already have a function call for nth_midxed_object_oid(), so
checking oidset_size() would be a strict improvement.

> We can pass on the original oid to avoid the nth_midxed_object_oid()
> call, but inlining the whole thing might even be nicer.

Yeah, it occurs to me that oidset_size() would be a good candidate for
inlining, if that's what you mean.
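
(Sketch: since struct oidset is a thin wrapper around a khash set, an
inline definition in oidset.h could be as small as

	static inline int oidset_size(const struct oidset *set)
	{
		return kh_size(&set->set);
	}

assuming the khash-backed layout; kh_size() just reads the stored count.)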

-Peff
René Scharfe Sept. 11, 2021, 5:16 p.m. UTC | #4
Am 11.09.21 um 19:03 schrieb Jeff King:
> On Sat, Sep 11, 2021 at 06:08:38PM +0200, René Scharfe wrote:
>
>>>> +	nth_midxed_object_oid(&oid, m, pos);
>>>> +	if (oidset_contains(&p->bad_objects, &oid))
>>>> +		return 0;
>>>
>>> Calling nth_midxed_object_oid() implies a memcpy() under the hood. In
>>> the old code, we'd skip that in the common case that we had no corrupt
>>> objects, but now we'll pay the cost regardless. memcpy() isn't _that_
>>> expensive, but I'd expect this to be a relatively hot code path.
>>>
>>> Is it worth sticking all of this inside:
>>>
>>>   if (oidset_size(&p->bad_objects))
>>>
>>> ?
>>
>> Hard to say.  It would certainly match the old code more closely.  Is a
>> function call cheaper than copying 32 bytes?  Depends on the CPU and
>> whether the hash is cached, I guess.  And cached it probably is, because
>> the caller did a binary search for it.
>
> You already have a function call for nth_midxed_object_oid(), so
> checking oidset_size() would be a strict improvement.

If I read the assembly correctly, nth_midxed_object_oid() is inlined by the
compiler in my build, as is nth_midxed_pack_entry().  Both are defined in
the same file, so other compilers may easily do the same.

>> We can pass on the original oid to avoid the nth_midxed_object_oid()
>> call, but inlining the whole thing might even be nicer.
>
> Yeah, it occurs to me that oidset_size() would be a good candidate for
> inlining, if that's what you mean.

True, but I meant something else (see patch 4/3). :)

René

Patch

diff --git a/midx.c b/midx.c
index 321c6fdd2f..01623fb339 100644
--- a/midx.c
+++ b/midx.c
@@ -283,6 +283,7 @@  static int nth_midxed_pack_entry(struct repository *r,
 {
 	uint32_t pack_int_id;
 	struct packed_git *p;
+	struct object_id oid;

 	if (pos >= m->num_objects)
 		return 0;
@@ -303,15 +304,9 @@  static int nth_midxed_pack_entry(struct repository *r,
 	if (!is_pack_valid(p))
 		return 0;

-	if (p->num_bad_objects) {
-		uint32_t i;
-		struct object_id oid;
-		nth_midxed_object_oid(&oid, m, pos);
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid.hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return 0;
-	}
+	nth_midxed_object_oid(&oid, m, pos);
+	if (oidset_contains(&p->bad_objects, &oid))
+		return 0;

 	e->offset = nth_midxed_offset(m, pos);
 	e->p = p;
diff --git a/object-store.h b/object-store.h
index b4dc6668aa..c7bead66f6 100644
--- a/object-store.h
+++ b/object-store.h
@@ -10,6 +10,7 @@ 
 #include "khash.h"
 #include "dir.h"
 #include "oidtree.h"
+#include "oidset.h"

 struct object_directory {
 	struct object_directory *next;
@@ -75,9 +76,8 @@  struct packed_git {
 	const void *index_data;
 	size_t index_size;
 	uint32_t num_objects;
-	uint32_t num_bad_objects;
 	uint32_t crc_offset;
-	unsigned char *bad_object_sha1;
+	struct oidset bad_objects;
 	int index_version;
 	time_t mtime;
 	int pack_fd;
diff --git a/packfile.c b/packfile.c
index 04080a558b..8f6d1d6328 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1163,29 +1163,17 @@  int unpack_object_header(struct packed_git *p,

 void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
 {
-	unsigned i;
-	const unsigned hashsz = the_hash_algo->rawsz;
-	for (i = 0; i < p->num_bad_objects; i++)
-		if (hasheq(oid->hash, p->bad_object_sha1 + hashsz * i))
-			return;
-	p->bad_object_sha1 = xrealloc(p->bad_object_sha1,
-				      st_mult(GIT_MAX_RAWSZ,
-					      st_add(p->num_bad_objects, 1)));
-	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, oid->hash);
-	p->num_bad_objects++;
+	oidset_insert(&p->bad_objects, oid);
 }

 const struct packed_git *has_packed_and_bad(struct repository *r,
 					    const struct object_id *oid)
 {
 	struct packed_git *p;
-	unsigned i;

 	for (p = r->objects->packed_git; p; p = p->next)
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid->hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return p;
+		if (oidset_contains(&p->bad_objects, oid))
+			return p;
 	return NULL;
 }

@@ -2016,13 +2004,8 @@  static int fill_pack_entry(const struct object_id *oid,
 {
 	off_t offset;

-	if (p->num_bad_objects) {
-		unsigned i;
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid->hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return 0;
-	}
+	if (oidset_contains(&p->bad_objects, oid))
+		return 0;

 	offset = find_pack_entry_one(oid->hash, p);
 	if (!offset)