| Message ID | 20210410152140.3525040-9-sandals@crustytoothpaste.net |
|---|---|
| State | Superseded |
| Series | SHA-256 / SHA-1 interop, part 1 |
Just an observation here: comparing 256 bits every time would seem to
have one nice bonus side effect and one potentially bad, but vanishingly
unlikely, side effect: a 160-bit null hash will now compare equal to a
256-bit null hash (good), but a 160-bit hash extended to 256 bits will
compare equal to a 256-bit hash that just happens to end in 96 bits of
zero (bad, but one that, I would guess, will never actually happen).

Chris

On Sat, Apr 10, 2021 at 8:23 AM brian m. carlson
<sandals@crustytoothpaste.net> wrote:
>
> Currently, when we compare two object IDs, we have to take a branch to
> determine what the hash size is supposed to be. The compiler can
> optimize well for a single length, but has trouble when there are two
> possible lengths.
>
> There is, however, an alternative: we can ensure that we always compare
> the full length of the hash buffer, but in turn we must zero the
> remainder of the buffer when using SHA-1; otherwise, we'll end up with
> incompatible junk at the end of otherwise equivalent object IDs that
> will prevent them from matching. This is an acceptable tradeoff,
> because we generally read an object ID in once, but then compare it
> against others multiple times.
>
> This latter approach has some benefits as well: since we will have
> annotated every location in which we load an object ID into an instance
> of struct object_id, if we want to set the hash algorithm for the object
> ID, we can do so at the same time.
>
> Adopt this latter approach, since it provides us greater flexibility and
> lets us read and store object IDs for multiple algorithms at once.
>
> Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
> ---
>  hash.h        | 13 ++++++++++---
>  hex.c         |  9 ++++++---
>  notes.c       |  3 +++
>  object-file.c |  1 +
>  4 files changed, 20 insertions(+), 6 deletions(-)
>
> diff --git a/hash.h b/hash.h
> index c8f03d8aee..04eba5c56b 100644
> --- a/hash.h
> +++ b/hash.h
> @@ -205,7 +205,7 @@ static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
>
>  static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
>  {
> -    return hashcmp(oid1->hash, oid2->hash);
> +    return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
>  }
>
>  static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
> @@ -221,7 +221,7 @@ static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
>
>  static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
>  {
> -    return hasheq(oid1->hash, oid2->hash);
> +    return !memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
>  }
>
>  static inline int is_null_oid(const struct object_id *oid)
> @@ -258,7 +258,9 @@ static inline void oidclr(struct object_id *oid)
>
>  static inline void oidread(struct object_id *oid, const unsigned char *hash)
>  {
> -    memcpy(oid->hash, hash, the_hash_algo->rawsz);
> +    size_t rawsz = the_hash_algo->rawsz;
> +    memcpy(oid->hash, hash, rawsz);
> +    memset(oid->hash + rawsz, 0, GIT_MAX_RAWSZ - rawsz);
>  }
>
>  static inline int is_empty_blob_sha1(const unsigned char *sha1)
> @@ -281,6 +283,11 @@ static inline int is_empty_tree_oid(const struct object_id *oid)
>      return oideq(oid, the_hash_algo->empty_tree);
>  }
>
> +static inline void oid_pad_buffer(struct object_id *oid, const struct git_hash_algo *algop)
> +{
> +    memset(oid->hash + algop->rawsz, 0, GIT_MAX_RAWSZ - algop->rawsz);
> +}
> +
>  const char *empty_tree_oid_hex(void);
>  const char *empty_blob_oid_hex(void);
>
> diff --git a/hex.c b/hex.c
> index da51e64929..5fa3e71cb9 100644
> --- a/hex.c
> +++ b/hex.c
> @@ -69,7 +69,10 @@ int get_sha1_hex(const char *hex, unsigned char *sha1)
>  int get_oid_hex_algop(const char *hex, struct object_id *oid,
>                        const struct git_hash_algo *algop)
>  {
> -    return get_hash_hex_algop(hex, oid->hash, algop);
> +    int ret = get_hash_hex_algop(hex, oid->hash, algop);
> +    if (!ret)
> +        oid_pad_buffer(oid, algop);
> +    return ret;
>  }
>
>  /*
> @@ -80,7 +83,7 @@ int get_oid_hex_any(const char *hex, struct object_id *oid)
>  {
>      int i;
>      for (i = GIT_HASH_NALGOS - 1; i > 0; i--) {
> -        if (!get_hash_hex_algop(hex, oid->hash, &hash_algos[i]))
> +        if (!get_oid_hex_algop(hex, oid, &hash_algos[i]))
>              return i;
>      }
>      return GIT_HASH_UNKNOWN;
> @@ -95,7 +98,7 @@ int parse_oid_hex_algop(const char *hex, struct object_id *oid,
>                          const char **end,
>                          const struct git_hash_algo *algop)
>  {
> -    int ret = get_hash_hex_algop(hex, oid->hash, algop);
> +    int ret = get_oid_hex_algop(hex, oid, algop);
>      if (!ret)
>          *end = hex + algop->hexsz;
>      return ret;
> diff --git a/notes.c b/notes.c
> index a44b25858f..1dfe9e2b9f 100644
> --- a/notes.c
> +++ b/notes.c
> @@ -455,6 +455,8 @@ static void load_subtree(struct notes_tree *t, struct leaf_node *subtree,
>              CALLOC_ARRAY(l, 1);
>              oidcpy(&l->key_oid, &object_oid);
>              oidcpy(&l->val_oid, &entry.oid);
> +            oid_pad_buffer(&l->key_oid, the_hash_algo);
> +            oid_pad_buffer(&l->val_oid, the_hash_algo);
>              if (note_tree_insert(t, node, n, l, type,
>                                   combine_notes_concatenate))
>                  die("Failed to load %s %s into notes tree "
> @@ -484,6 +486,7 @@ static void load_subtree(struct notes_tree *t, struct leaf_node *subtree,
>                  strbuf_addch(&non_note_path, '/');
>              }
>              strbuf_addstr(&non_note_path, entry.path);
> +            oid_pad_buffer(&entry.oid, the_hash_algo);
>              add_non_note(t, strbuf_detach(&non_note_path, NULL),
>                           entry.mode, entry.oid.hash);
>          }
> diff --git a/object-file.c b/object-file.c
> index 3f43c376e7..8e338247cc 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -2352,6 +2352,7 @@ int for_each_file_in_obj_subdir(unsigned int subdir_nr,
>          if (namelen == the_hash_algo->hexsz - 2 &&
>              !hex_to_bytes(oid.hash + 1, de->d_name,
>                            the_hash_algo->rawsz - 1)) {
> +            oid_pad_buffer(&oid, the_hash_algo);
>              if (obj_cb) {
>                  r = obj_cb(&oid, path->buf, data);
>                  if (r)
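To make the side effect Chris describes concrete, here is a minimal
standalone sketch. It is a hypothetical harness, not code from the
patch: the constants mirror Git's hash.h, but the struct definition and
main() are only illustrative.

```c
#include <stdio.h>
#include <string.h>

#define GIT_SHA1_RAWSZ 20
#define GIT_MAX_RAWSZ  32

/* Mirrors git's struct object_id; the harness itself is hypothetical. */
struct object_id {
    unsigned char hash[GIT_MAX_RAWSZ];
};

int main(void)
{
    struct object_id sha1_oid = { { 0 } };   /* zero-padded SHA-1 value */
    struct object_id sha256_oid = { { 0 } }; /* SHA-256 value */

    /* Good: the SHA-1 and SHA-256 null hashes compare equal over
     * all 32 bytes, since the SHA-1 tail is zeroed. */
    printf("nulls equal: %d\n",
           !memcmp(sha1_oid.hash, sha256_oid.hash, GIT_MAX_RAWSZ));

    /* Bad, but vanishingly unlikely: a padded SHA-1 value also matches
     * any SHA-256 value whose final 96 bits (12 bytes) are zero. */
    memset(sha1_oid.hash, 0xab, GIT_SHA1_RAWSZ);   /* SHA-1, padded */
    memset(sha256_oid.hash, 0xab, GIT_SHA1_RAWSZ); /* SHA-256, zero tail */
    printf("false match: %d\n",
           !memcmp(sha1_oid.hash, sha256_oid.hash, GIT_MAX_RAWSZ));
    return 0;
}
```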
On Sat, Apr 10 2021, brian m. carlson wrote:

> Currently, when we compare two object IDs, we have to take a branch to
> determine what the hash size is supposed to be. The compiler can
> optimize well for a single length, but has trouble when there are two
> possible lengths.

This would benefit from some performance/perf numbers. When this code
was first changed like this in 183a638b7da (hashcmp: assert constant
hash size, 2018-08-23) we had:

    Test                        v2.18.0             v2.19.0-rc0               HEAD
    ------------------------------------------------------------------------------
    0001.2: 34.24(33.81+0.43)   34.83(34.42+0.40) +1.7%   33.90(33.47+0.42) -1.0%

Then it was later modified in 0dab7129ab1 (cache: make hashcmp and
hasheq work with larger hashes, 2018-11-14).

> @@ -205,7 +205,7 @@ static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
>
>  static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
>  {
> -    return hashcmp(oid1->hash, oid2->hash);
> +    return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
>  }

hashcmp is now:

    if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
        return memcmp(sha1, sha2, GIT_MAX_RAWSZ);
    return memcmp(sha1, sha2, GIT_SHA1_RAWSZ);

Wouldn't it make more sense to amend it to just be a memcmp
wrapper/macro if we're not going to make this conditional on the hash
algorithm, or are there other callsites where we still want the old way
of doing it?

>  static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
> @@ -221,7 +221,7 @@ static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
>
>  static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
>  {
> -    return hasheq(oid1->hash, oid2->hash);
> +    return !memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
>  }

Ditto hasheq vs. !memcmp:

    if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
        return !memcmp(sha1, sha2, GIT_MAX_RAWSZ);
    return !memcmp(sha1, sha2, GIT_SHA1_RAWSZ);
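What Ævar appears to be proposing would look something like the sketch
below. This is hypothetical, not code from the series, and brian's
reply that follows explains why it is unsafe for hashcmp's callers.

```c
/* Hypothetical sketch of the proposed simplification: drop the branch
 * on the_hash_algo->rawsz and always compare GIT_MAX_RAWSZ bytes. */
static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
{
    return memcmp(sha1, sha2, GIT_MAX_RAWSZ);
}

static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
{
    return !memcmp(sha1, sha2, GIT_MAX_RAWSZ);
}
```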
On 2021-04-11 at 11:36:33, Ævar Arnfjörð Bjarmason wrote:
>
> On Sat, Apr 10 2021, brian m. carlson wrote:
>
> > Currently, when we compare two object IDs, we have to take a branch to
> > determine what the hash size is supposed to be. The compiler can
> > optimize well for a single length, but has trouble when there are two
> > possible lengths.
>
> This would benefit from some performance/perf numbers. When this code
> was first changed like this in 183a638b7da (hashcmp: assert constant
> hash size, 2018-08-23) we had:
>
>     Test                        v2.18.0             v2.19.0-rc0               HEAD
>     ------------------------------------------------------------------------------
>     0001.2: 34.24(33.81+0.43)   34.83(34.42+0.40) +1.7%   33.90(33.47+0.42) -1.0%
>
> Then it was later modified in 0dab7129ab1 (cache: make hashcmp and
> hasheq work with larger hashes, 2018-11-14).

I can do some perf numbers.

> > @@ -205,7 +205,7 @@ static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
> >
> >  static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
> >  {
> > -    return hashcmp(oid1->hash, oid2->hash);
> > +    return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
> >  }
>
> hashcmp is now:
>
>     if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
>         return memcmp(sha1, sha2, GIT_MAX_RAWSZ);
>     return memcmp(sha1, sha2, GIT_SHA1_RAWSZ);
>
> Wouldn't it make more sense to amend it to just be a memcmp
> wrapper/macro if we're not going to make this conditional on the hash
> algorithm, or are there other callsites where we still want the old way
> of doing it?

No, we can't do that. With oidcmp, we know the buffer is large enough.
However, in some cases, the buffer in hashcmp is not large enough. For
example, we may be at the end of a SHA-1 tree object and we'd segfault.
I did try that and I quickly found that it was totally broken.
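The failure mode brian mentions can be sketched as follows. The layout
is hypothetical and simplified; the point is only that a raw 20-byte
hash parsed out of a SHA-1 tree object can sit flush against the end of
an allocation, so a 32-byte memcmp would read out of bounds.

```c
#include <stdlib.h>
#include <string.h>

#define GIT_SHA1_RAWSZ 20
#define GIT_MAX_RAWSZ  32

int main(void)
{
    /* Pretend this is the in-memory buffer of a SHA-1 tree object,
     * whose final 20 bytes are the raw hash of its last entry. */
    unsigned char *tree_buf = malloc(100);
    memset(tree_buf, 0, 100);
    const unsigned char *last_hash = tree_buf + 100 - GIT_SHA1_RAWSZ;

    unsigned char other[GIT_MAX_RAWSZ] = { 0 };

    /* OK: compares only the 20 bytes that actually exist. */
    memcmp(last_hash, other, GIT_SHA1_RAWSZ);

    /* Broken: would read 12 bytes past the end of tree_buf. */
    /* memcmp(last_hash, other, GIT_MAX_RAWSZ); */

    free(tree_buf);
    return 0;
}
```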
Currently, when we compare two object IDs, we have to take a branch to
determine what the hash size is supposed to be. The compiler can
optimize well for a single length, but has trouble when there are two
possible lengths.

There is, however, an alternative: we can ensure that we always compare
the full length of the hash buffer, but in turn we must zero the
remainder of the buffer when using SHA-1; otherwise, we'll end up with
incompatible junk at the end of otherwise equivalent object IDs that
will prevent them from matching. This is an acceptable tradeoff,
because we generally read an object ID in once, but then compare it
against others multiple times.

This latter approach has some benefits as well: since we will have
annotated every location in which we load an object ID into an instance
of struct object_id, if we want to set the hash algorithm for the object
ID, we can do so at the same time.

Adopt this latter approach, since it provides us greater flexibility and
lets us read and store object IDs for multiple algorithms at once.

Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
---
 hash.h        | 13 ++++++++++---
 hex.c         |  9 ++++++---
 notes.c       |  3 +++
 object-file.c |  1 +
 4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/hash.h b/hash.h
index c8f03d8aee..04eba5c56b 100644
--- a/hash.h
+++ b/hash.h
@@ -205,7 +205,7 @@ static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)

 static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
 {
-    return hashcmp(oid1->hash, oid2->hash);
+    return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
 }

 static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
@@ -221,7 +221,7 @@ static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)

 static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
 {
-    return hasheq(oid1->hash, oid2->hash);
+    return !memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
 }

 static inline int is_null_oid(const struct object_id *oid)
@@ -258,7 +258,9 @@ static inline void oidclr(struct object_id *oid)

 static inline void oidread(struct object_id *oid, const unsigned char *hash)
 {
-    memcpy(oid->hash, hash, the_hash_algo->rawsz);
+    size_t rawsz = the_hash_algo->rawsz;
+    memcpy(oid->hash, hash, rawsz);
+    memset(oid->hash + rawsz, 0, GIT_MAX_RAWSZ - rawsz);
 }

 static inline int is_empty_blob_sha1(const unsigned char *sha1)
@@ -281,6 +283,11 @@ static inline int is_empty_tree_oid(const struct object_id *oid)
     return oideq(oid, the_hash_algo->empty_tree);
 }

+static inline void oid_pad_buffer(struct object_id *oid, const struct git_hash_algo *algop)
+{
+    memset(oid->hash + algop->rawsz, 0, GIT_MAX_RAWSZ - algop->rawsz);
+}
+
 const char *empty_tree_oid_hex(void);
 const char *empty_blob_oid_hex(void);

diff --git a/hex.c b/hex.c
index da51e64929..5fa3e71cb9 100644
--- a/hex.c
+++ b/hex.c
@@ -69,7 +69,10 @@ int get_sha1_hex(const char *hex, unsigned char *sha1)
 int get_oid_hex_algop(const char *hex, struct object_id *oid,
                       const struct git_hash_algo *algop)
 {
-    return get_hash_hex_algop(hex, oid->hash, algop);
+    int ret = get_hash_hex_algop(hex, oid->hash, algop);
+    if (!ret)
+        oid_pad_buffer(oid, algop);
+    return ret;
 }

 /*
@@ -80,7 +83,7 @@ int get_oid_hex_any(const char *hex, struct object_id *oid)
 {
     int i;
     for (i = GIT_HASH_NALGOS - 1; i > 0; i--) {
-        if (!get_hash_hex_algop(hex, oid->hash, &hash_algos[i]))
+        if (!get_oid_hex_algop(hex, oid, &hash_algos[i]))
             return i;
     }
     return GIT_HASH_UNKNOWN;
@@ -95,7 +98,7 @@ int parse_oid_hex_algop(const char *hex, struct object_id *oid,
                         const char **end,
                         const struct git_hash_algo *algop)
 {
-    int ret = get_hash_hex_algop(hex, oid->hash, algop);
+    int ret = get_oid_hex_algop(hex, oid, algop);
     if (!ret)
         *end = hex + algop->hexsz;
     return ret;
diff --git a/notes.c b/notes.c
index a44b25858f..1dfe9e2b9f 100644
--- a/notes.c
+++ b/notes.c
@@ -455,6 +455,8 @@ static void load_subtree(struct notes_tree *t, struct leaf_node *subtree,
             CALLOC_ARRAY(l, 1);
             oidcpy(&l->key_oid, &object_oid);
             oidcpy(&l->val_oid, &entry.oid);
+            oid_pad_buffer(&l->key_oid, the_hash_algo);
+            oid_pad_buffer(&l->val_oid, the_hash_algo);
             if (note_tree_insert(t, node, n, l, type,
                                  combine_notes_concatenate))
                 die("Failed to load %s %s into notes tree "
@@ -484,6 +486,7 @@ static void load_subtree(struct notes_tree *t, struct leaf_node *subtree,
                 strbuf_addch(&non_note_path, '/');
             }
             strbuf_addstr(&non_note_path, entry.path);
+            oid_pad_buffer(&entry.oid, the_hash_algo);
             add_non_note(t, strbuf_detach(&non_note_path, NULL),
                          entry.mode, entry.oid.hash);
         }
diff --git a/object-file.c b/object-file.c
index 3f43c376e7..8e338247cc 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2352,6 +2352,7 @@ int for_each_file_in_obj_subdir(unsigned int subdir_nr,
         if (namelen == the_hash_algo->hexsz - 2 &&
             !hex_to_bytes(oid.hash + 1, de->d_name,
                           the_hash_algo->rawsz - 1)) {
+            oid_pad_buffer(&oid, the_hash_algo);
             if (obj_cb) {
                 r = obj_cb(&oid, path->buf, data);
                 if (r)
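As a sketch of the invariant the patch establishes, the fragment below
loads one raw SHA-1 hash two different ways and checks that full-width
comparison now works. It assumes the hash.h shown above and a
repository where the_hash_algo is SHA-1; the check_padding() helper is
illustrative only and not part of the series.

```c
#include <assert.h>
#include <string.h>
#include "hash.h"

/* Illustrative helper: raw points at a 20-byte SHA-1 hash. */
static void check_padding(const unsigned char *raw)
{
    struct object_id a, b;

    /* oidread() copies rawsz bytes and zeroes hash[rawsz..GIT_MAX_RAWSZ). */
    oidread(&a, raw);

    /* Equivalent manual route using the new helper. */
    memcpy(b.hash, raw, the_hash_algo->rawsz);
    oid_pad_buffer(&b, the_hash_algo);

    /* oideq() may now safely compare all GIT_MAX_RAWSZ bytes. */
    assert(oideq(&a, &b));
}
```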