mbox series

[v2,0/8] hash: introduce unsafe_hash_algo(), drop unsafe_ variants

Message ID cover.1736363652.git.me@ttaylorr.com (mailing list archive)
Headers show
Series hash: introduce unsafe_hash_algo(), drop unsafe_ variants | expand

Message

Taylor Blau Jan. 8, 2025, 7:14 p.m. UTC
(This series is rebased on 'master', which is 14650065b7
(RelNotes/2.48.0: fix typos etc., 2025-01-07) at the time of writing).

The bulk of this series is unchanged since last time, but a new seventh
patch that further hardens the hashfile_checkpoint callers on top of
Patrick's recent series[1].

The original cover letter is as follows:

------------

This series implements an idea discussed in [2] which suggests that we
introduce a way to access a wrapped version of a 'struct git_hash_algo'
which represents the unsafe variant of that algorithm, rather than
having individual unsafe_ functions (like unsafe_init_fn() versus
init_fn(), etc.).

This approach is relatively straightforward to implement, and removes a
significant deficiency in the original implementation of
unsafe/non-cryptographic hash functions by making it impossible to
switch between safe- and unsafe variants of hash functions. It also
cleans up the sha1-unsafe test helper's implementation by removing a
large number of "if (unsafe)"-style conditionals.

The series is laid out as follows:

  * The first two patches prepare the hashfile API for the upcoming
    change:

      csum-file: store the hash algorithm as a struct field
      csum-file.c: extract algop from hashfile_checksum_valid()

  * The next patch implements the new 'unsafe_hash_algo()' function at
    the heart of this series' approach:

      hash.h: introduce `unsafe_hash_algo()`

  * The next two patches convert existing callers to use the new
    'unsafe_hash_algo()' function, instead of switching between safe and
    unsafe_ variants of individual functions:

      csum-file.c: use unsafe_hash_algo()
      t/helper/test-hash.c: use unsafe_hash_algo()

  * The final patch drops the unsafe_ function variants following all
    callers being converted to use the new pattern:

      hash.h: drop unsafe_ function variants

Thanks in advance for your review!

[1]: https://lore.kernel.org/git/20241230-pks-meson-sha1-unsafe-v1-0-efb276e171f5@pks.im/
[2]: https://lore.kernel.org/git/20241107013915.GA961214@coredump.intra.peff.net/

Taylor Blau (8):
  t/helper/test-tool: implement sha1-unsafe helper
  csum-file: store the hash algorithm as a struct field
  csum-file.c: extract algop from hashfile_checksum_valid()
  hash.h: introduce `unsafe_hash_algo()`
  csum-file.c: use unsafe_hash_algo()
  t/helper/test-hash.c: use unsafe_hash_algo()
  csum-file: introduce hashfile_checkpoint_init()
  hash.h: drop unsafe_ function variants

 builtin/fast-import.c  |  2 +-
 bulk-checkin.c         |  9 ++++++---
 csum-file.c            | 40 +++++++++++++++++++++++++---------------
 csum-file.h            |  2 ++
 hash.h                 | 20 +++++---------------
 object-file.c          | 41 ++++++++++++++++++++++++++---------------
 t/helper/test-hash.c   |  4 +++-
 t/helper/test-sha1.c   |  7 ++++++-
 t/helper/test-sha1.sh  | 38 ++++++++++++++++++++++----------------
 t/helper/test-sha256.c |  2 +-
 t/helper/test-tool.c   |  1 +
 t/helper/test-tool.h   |  3 ++-
 12 files changed, 100 insertions(+), 69 deletions(-)

Range-diff against v1:
2:  d8c1fc78b57 ! 1:  4c1523a04f1 t/helper/test-tool: implement sha1-unsafe helper
    @@ Metadata
      ## Commit message ##
         t/helper/test-tool: implement sha1-unsafe helper
     
    -    Add a new helper similar to 't/helper/test-tool sha1' called instead
    -    "sha1-unsafe" which uses the unsafe variant of Git's SHA-1 wrappers.
    +    With the new "unsafe" SHA-1 build knob, it is convenient to have a
    +    test-tool that can exercise Git's unsafe SHA-1 wrappers for testing,
    +    similar to 't/helper/test-tool sha1'.
     
    -    While we're at it, modify the test-sha1.sh script to exercise both
    -    the sha1 and sha1-unsafe test tools to ensure that both produce the
    -    expected hash values.
    +    Implement that helper by altering the implementation of that test-tool
    +    (in cmd_hash_impl(), which is generic and parameterized over different
    +    hash functions) to conditionally run the unsafe variants of the chosen
    +    hash function, and expose the new behavior via a new 'sha1-unsafe' test
    +    helper.
     
         Signed-off-by: Taylor Blau <me@ttaylorr.com>
     
    + ## t/helper/test-hash.c ##
    +@@
    + #include "test-tool.h"
    + #include "hex.h"
    + 
    +-int cmd_hash_impl(int ac, const char **av, int algo)
    ++int cmd_hash_impl(int ac, const char **av, int algo, int unsafe)
    + {
    + 	git_hash_ctx ctx;
    + 	unsigned char hash[GIT_MAX_HEXSZ];
    +@@ t/helper/test-hash.c: int cmd_hash_impl(int ac, const char **av, int algo)
    + 			die("OOPS");
    + 	}
    + 
    +-	algop->init_fn(&ctx);
    ++	if (unsafe)
    ++		algop->unsafe_init_fn(&ctx);
    ++	else
    ++		algop->init_fn(&ctx);
    + 
    + 	while (1) {
    + 		ssize_t sz, this_sz;
    +@@ t/helper/test-hash.c: int cmd_hash_impl(int ac, const char **av, int algo)
    + 		}
    + 		if (this_sz == 0)
    + 			break;
    +-		algop->update_fn(&ctx, buffer, this_sz);
    ++		if (unsafe)
    ++			algop->unsafe_update_fn(&ctx, buffer, this_sz);
    ++		else
    ++			algop->update_fn(&ctx, buffer, this_sz);
    + 	}
    +-	algop->final_fn(hash, &ctx);
    ++	if (unsafe)
    ++		algop->unsafe_final_fn(hash, &ctx);
    ++	else
    ++		algop->final_fn(hash, &ctx);
    + 
    + 	if (binary)
    + 		fwrite(hash, 1, algop->rawsz, stdout);
    +
      ## t/helper/test-sha1.c ##
    +@@
    + 
    + int cmd__sha1(int ac, const char **av)
    + {
    +-	return cmd_hash_impl(ac, av, GIT_HASH_SHA1);
    ++	return cmd_hash_impl(ac, av, GIT_HASH_SHA1, 0);
    + }
    + 
    + int cmd__sha1_is_sha1dc(int argc UNUSED, const char **argv UNUSED)
     @@ t/helper/test-sha1.c: int cmd__sha1_is_sha1dc(int argc UNUSED, const char **argv UNUSED)
      #endif
      	return 1;
    @@ t/helper/test-sha1.sh
      da39a3ee5e6b4b0d3255bfef95601890afd80709 0
      3f786850e387550fdab836ed7e6dc881de23001b 0 a
     
    + ## t/helper/test-sha256.c ##
    +@@
    + 
    + int cmd__sha256(int ac, const char **av)
    + {
    +-	return cmd_hash_impl(ac, av, GIT_HASH_SHA256);
    ++	return cmd_hash_impl(ac, av, GIT_HASH_SHA256, 0);
    + }
    +
      ## t/helper/test-tool.c ##
     @@ t/helper/test-tool.c: static struct test_cmd cmds[] = {
      	{ "serve-v2", cmd__serve_v2 },
    @@ t/helper/test-tool.h: int cmd__scrap_cache_tree(int argc, const char **argv);
      int cmd__sha256(int argc, const char **argv);
      int cmd__sigchain(int argc, const char **argv);
      int cmd__simple_ipc(int argc, const char **argv);
    +@@ t/helper/test-tool.h: int cmd__windows_named_pipe(int argc, const char **argv);
    + #endif
    + int cmd__write_cache(int argc, const char **argv);
    + 
    +-int cmd_hash_impl(int ac, const char **av, int algo);
    ++int cmd_hash_impl(int ac, const char **av, int algo, int unsafe);
    + 
    + #endif
3:  380133a1142 = 2:  99cc44895b5 csum-file: store the hash algorithm as a struct field
4:  e5076f003bf = 3:  1ffab2f8289 csum-file.c: extract algop from hashfile_checksum_valid()
5:  17f92dba34b = 4:  99dcbe2e716 hash.h: introduce `unsafe_hash_algo()`
6:  0d8e12599e2 = 5:  2dcc2aa6803 csum-file.c: use unsafe_hash_algo()
7:  a49a41703e2 = 6:  a2b9ef03080 t/helper/test-hash.c: use unsafe_hash_algo()
1:  0e2fcee6894 ! 7:  94c07fd8a55 t/helper/test-sha1: prepare for an unsafe mode
    @@ Metadata
     Author: Taylor Blau <me@ttaylorr.com>
     
      ## Commit message ##
    -    t/helper/test-sha1: prepare for an unsafe mode
    +    csum-file: introduce hashfile_checkpoint_init()
     
    -    With the new "unsafe" SHA-1 build knob, it would be convenient to have
    -    a test-tool that can exercise Git's unsafe SHA-1 wrappers for testing,
    -    similar to 't/helper/test-tool sha1'.
    +    In 106140a99f (builtin/fast-import: fix segfault with unsafe SHA1
    +    backend, 2024-12-30) and 9218c0bfe1 (bulk-checkin: fix segfault with
    +    unsafe SHA1 backend, 2024-12-30), we observed the effects of failing to
    +    initialize a hashfile_checkpoint with the same hash function
    +    implementation as is used by the hashfile it is used to checkpoint.
     
    -    Prepare for such a helper by altering the implementation of that
    -    test-tool (in cmd_hash_impl(), which is generic and parameterized over
    -    different hash functions) to conditionally run the unsafe variants of
    -    the chosen hash function.
    +    While both 106140a99f and 9218c0bfe1 work around the immediate crash,
    +    changing the hash function implementation within the hashfile API to,
    +    for example, the non-unsafe variant would re-introduce the crash. This
    +    is a result of the tight coupling between initializing hashfiles and
    +    hashfile_checkpoints.
     
    -    The following commit will add a new test-tool which makes use of this
    -    new parameter.
    +    Introduce and use a new function which ensures that both parts of a
    +    hashfile and hashfile_checkpoint pair use the same hash function
    +    implementation to avoid such crashes.
     
    +    A few things worth noting:
    +
    +      - In the change to builtin/fast-import.c::stream_blob(), we can see
    +        that by removing the explicit reference to
    +        'the_hash_algo->unsafe_init_fn()', we are hardened against the
    +        hashfile API changing away from the_hash_algo (or its unsafe
    +        variant) in the future.
    +
    +      - The bulk-checkin code no longer needs to explicitly zero-initialize
    +        the hashfile_checkpoint, since it is now done as a result of calling
    +        'hashfile_checkpoint_init()'.
    +
    +      - Also in the bulk-checkin code, we add an additional call to
    +        prepare_to_stream() outside of the main loop in order to initialize
    +        'state->f' so we know which hash function implementation to use when
    +        calling 'hashfile_checkpoint_init()'.
    +
    +        This is OK, since subsequent 'prepare_to_stream()' calls are noops.
    +        However, we only need to call 'prepare_to_stream()' when we have the
    +        HASH_WRITE_OBJECT bit set in our flags. Without that bit, calling
    +        'prepare_to_stream()' does not assign 'state->f', so we have nothing
    +        to initialize.
    +
    +      - Other uses of the 'checkpoint' in 'deflate_blob_to_pack()' are
    +        appropriately guarded.
    +
    +    Helped-by: Patrick Steinhardt <ps@pks.im>
         Signed-off-by: Taylor Blau <me@ttaylorr.com>
     
    - ## t/helper/test-hash.c ##
    -@@
    - #include "test-tool.h"
    - #include "hex.h"
    + ## builtin/fast-import.c ##
    +@@ builtin/fast-import.c: static void stream_blob(uintmax_t len, struct object_id *oidout, uintmax_t mark)
    + 		|| (pack_size + PACK_SIZE_THRESHOLD + len) < pack_size)
    + 		cycle_packfile();
      
    --int cmd_hash_impl(int ac, const char **av, int algo)
    -+int cmd_hash_impl(int ac, const char **av, int algo, int unsafe)
    - {
    +-	the_hash_algo->unsafe_init_fn(&checkpoint.ctx);
    ++	hashfile_checkpoint_init(pack_file, &checkpoint);
    + 	hashfile_checkpoint(pack_file, &checkpoint);
    + 	offset = checkpoint.offset;
    + 
    +
    + ## bulk-checkin.c ##
    +@@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
      	git_hash_ctx ctx;
    - 	unsigned char hash[GIT_MAX_HEXSZ];
    -@@ t/helper/test-hash.c: int cmd_hash_impl(int ac, const char **av, int algo)
    - 			die("OOPS");
    - 	}
    + 	unsigned char obuf[16384];
    + 	unsigned header_len;
    +-	struct hashfile_checkpoint checkpoint = {0};
    ++	struct hashfile_checkpoint checkpoint;
    + 	struct pack_idx_entry *idx = NULL;
      
    --	algop->init_fn(&ctx);
    -+	if (unsafe)
    -+		algop->unsafe_init_fn(&ctx);
    -+	else
    -+		algop->init_fn(&ctx);
    + 	seekback = lseek(fd, 0, SEEK_CUR);
    +@@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    + 					  OBJ_BLOB, size);
    + 	the_hash_algo->init_fn(&ctx);
    + 	the_hash_algo->update_fn(&ctx, obuf, header_len);
    +-	the_hash_algo->unsafe_init_fn(&checkpoint.ctx);
    + 
    + 	/* Note: idx is non-NULL when we are writing */
    +-	if ((flags & HASH_WRITE_OBJECT) != 0)
    ++	if ((flags & HASH_WRITE_OBJECT) != 0) {
    + 		CALLOC_ARRAY(idx, 1);
    + 
    ++		prepare_to_stream(state, flags);
    ++		hashfile_checkpoint_init(state->f, &checkpoint);
    ++	}
    ++
    + 	already_hashed_to = 0;
      
      	while (1) {
    - 		ssize_t sz, this_sz;
    -@@ t/helper/test-hash.c: int cmd_hash_impl(int ac, const char **av, int algo)
    - 		}
    - 		if (this_sz == 0)
    - 			break;
    --		algop->update_fn(&ctx, buffer, this_sz);
    -+		if (unsafe)
    -+			algop->unsafe_update_fn(&ctx, buffer, this_sz);
    -+		else
    -+			algop->update_fn(&ctx, buffer, this_sz);
    - 	}
    --	algop->final_fn(hash, &ctx);
    -+	if (unsafe)
    -+		algop->unsafe_final_fn(hash, &ctx);
    -+	else
    -+		algop->final_fn(hash, &ctx);
    - 
    - 	if (binary)
    - 		fwrite(hash, 1, algop->rawsz, stdout);
     
    - ## t/helper/test-sha1.c ##
    -@@
    - 
    - int cmd__sha1(int ac, const char **av)
    - {
    --	return cmd_hash_impl(ac, av, GIT_HASH_SHA1);
    -+	return cmd_hash_impl(ac, av, GIT_HASH_SHA1, 0);
    + ## csum-file.c ##
    +@@ csum-file.c: struct hashfile *hashfd_throughput(int fd, const char *name, struct progress *tp
    + 	return hashfd_internal(fd, name, tp, 8 * 1024);
      }
      
    - int cmd__sha1_is_sha1dc(int argc UNUSED, const char **argv UNUSED)
    -
    - ## t/helper/test-sha256.c ##
    -@@
    - 
    - int cmd__sha256(int ac, const char **av)
    ++void hashfile_checkpoint_init(struct hashfile *f,
    ++			      struct hashfile_checkpoint *checkpoint)
    ++{
    ++	memset(checkpoint, 0, sizeof(*checkpoint));
    ++	f->algop->init_fn(&checkpoint->ctx);
    ++}
    ++
    + void hashfile_checkpoint(struct hashfile *f, struct hashfile_checkpoint *checkpoint)
      {
    --	return cmd_hash_impl(ac, av, GIT_HASH_SHA256);
    -+	return cmd_hash_impl(ac, av, GIT_HASH_SHA256, 0);
    - }
    + 	hashflush(f);
     
    - ## t/helper/test-tool.h ##
    -@@ t/helper/test-tool.h: int cmd__windows_named_pipe(int argc, const char **argv);
    - #endif
    - int cmd__write_cache(int argc, const char **argv);
    + ## csum-file.h ##
    +@@ csum-file.h: struct hashfile_checkpoint {
    + 	git_hash_ctx ctx;
    + };
      
    --int cmd_hash_impl(int ac, const char **av, int algo);
    -+int cmd_hash_impl(int ac, const char **av, int algo, int unsafe);
    ++void hashfile_checkpoint_init(struct hashfile *, struct hashfile_checkpoint *);
    + void hashfile_checkpoint(struct hashfile *, struct hashfile_checkpoint *);
    + int hashfile_truncate(struct hashfile *, struct hashfile_checkpoint *);
      
    - #endif
8:  4081ad08549 = 8:  f5579883816 hash.h: drop unsafe_ function variants

base-commit: 14650065b76b28d3cfa9453356ac5669b19e706e