diff mbox series

[15/19] midx: support reading incremental MIDX chains

Message ID 28579fa29266b65c7c6b915678dff35f50cc051d.1717715060.git.me@ttaylorr.com (mailing list archive)
State Accepted
Commit b80236d0e3386e5fdfaff0a7480212a3f39cbee3
Headers show
Series midx: incremental multi-pack indexes, part one | expand

Commit Message

Taylor Blau June 6, 2024, 11:05 p.m. UTC
Now that the MIDX machinery's internals have been taught to understand
incremental MIDXs over the previous handful of commits, the MIDX
machinery itself can begin reading incremental MIDXs.

(Note that while the on-disk format for incremental MIDXs has been
defined, the writing end has not been implemented. This will take place
in the commit after next.)

The core of this change involves following the order specified in the
MIDX chain and opening up MIDXs in the chain one-by-one, adding them to
the previous layer's `->base_midx` pointer at each step.

In order to implement this, the `load_multi_pack_index()` function is
taught to call a new `load_multi_pack_index_chain()` function if loading
a non-incremental MIDX failed via `load_multi_pack_index_one()`.

When loading a MIDX chain, `load_midx_chain_fd_st()` reads each line in
the file one-by-one and dispatches calls to
`load_multi_pack_index_one()` to read each layer of the MIDX chain. When
a layer was successfully read, it is added to the MIDX chain by calling
`add_midx_to_chain()` which validates the contents of the `BASE` chunk,
performs some bounds checks on the number of combined packs and objects,
and attaches the new MIDX by assigning its `base_midx` pointer to the
existing part of the chain.

As a supplement to this, introduce a new mode in the test-read-midx
test-tool which allows us to read the information for a specific MIDX in
the chain by specifying its trailing checksum via the command-line
arguments like so:

    $ test-tool read-midx .git/objects [checksum]

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 midx.c                    | 184 +++++++++++++++++++++++++++++++++++---
 midx.h                    |   7 ++
 packfile.c                |   5 +-
 t/helper/test-read-midx.c |  24 +++--
 4 files changed, 201 insertions(+), 19 deletions(-)
diff mbox series

Patch

diff --git a/midx.c b/midx.c
index cb7b623b5d..ac44fcefc2 100644
--- a/midx.c
+++ b/midx.c
@@ -89,7 +89,9 @@  static int midx_read_object_offsets(const unsigned char *chunk_start,
 
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + the_hash_algo->rawsz)
 
-struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local)
+static struct multi_pack_index *load_multi_pack_index_one(const char *object_dir,
+							  const char *midx_name,
+							  int local)
 {
 	struct multi_pack_index *m = NULL;
 	int fd;
@@ -97,31 +99,26 @@  struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 	size_t midx_size;
 	void *midx_map = NULL;
 	uint32_t hash_version;
-	struct strbuf midx_name = STRBUF_INIT;
 	uint32_t i;
 	const char *cur_pack_name;
 	struct chunkfile *cf = NULL;
 
-	get_midx_filename(&midx_name, object_dir);
-
-	fd = git_open(midx_name.buf);
+	fd = git_open(midx_name);
 
 	if (fd < 0)
 		goto cleanup_fail;
 	if (fstat(fd, &st)) {
-		error_errno(_("failed to read %s"), midx_name.buf);
+		error_errno(_("failed to read %s"), midx_name);
 		goto cleanup_fail;
 	}
 
 	midx_size = xsize_t(st.st_size);
 
 	if (midx_size < MIDX_MIN_SIZE) {
-		error(_("multi-pack-index file %s is too small"), midx_name.buf);
+		error(_("multi-pack-index file %s is too small"), midx_name);
 		goto cleanup_fail;
 	}
 
-	strbuf_release(&midx_name);
-
 	midx_map = xmmap(NULL, midx_size, PROT_READ, MAP_PRIVATE, fd, 0);
 	close(fd);
 
@@ -211,7 +208,6 @@  struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 
 cleanup_fail:
 	free(m);
-	strbuf_release(&midx_name);
 	free_chunkfile(cf);
 	if (midx_map)
 		munmap(midx_map, midx_size);
@@ -220,6 +216,173 @@  struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 	return NULL;
 }
 
+void get_midx_chain_dirname(struct strbuf *buf, const char *object_dir)
+{
+	strbuf_addf(buf, "%s/pack/multi-pack-index.d", object_dir);
+}
+
+void get_midx_chain_filename(struct strbuf *buf, const char *object_dir)
+{
+	get_midx_chain_dirname(buf, object_dir);
+	strbuf_addstr(buf, "/multi-pack-index-chain");
+}
+
+void get_split_midx_filename_ext(struct strbuf *buf, const char *object_dir,
+				 const unsigned char *hash, const char *ext)
+{
+	get_midx_chain_dirname(buf, object_dir);
+	strbuf_addf(buf, "/multi-pack-index-%s.%s", hash_to_hex(hash), ext);
+}
+
+static int open_multi_pack_index_chain(const char *chain_file,
+				       int *fd, struct stat *st)
+{
+	*fd = git_open(chain_file);
+	if (*fd < 0)
+		return 0;
+	if (fstat(*fd, st)) {
+		close(*fd);
+		return 0;
+	}
+	if (st->st_size < the_hash_algo->hexsz) {
+		close(*fd);
+		if (!st->st_size) {
+			/* treat empty files the same as missing */
+			errno = ENOENT;
+		} else {
+			warning(_("multi-pack-index chain file too small"));
+			errno = EINVAL;
+		}
+		return 0;
+	}
+	return 1;
+}
+
+static int add_midx_to_chain(struct multi_pack_index *midx,
+			     struct multi_pack_index *midx_chain,
+			     struct object_id *oids,
+			     int n)
+{
+	if (midx_chain) {
+		if (unsigned_add_overflows(midx_chain->num_packs,
+					   midx_chain->num_packs_in_base)) {
+			warning(_("pack count in base MIDX too high: %"PRIuMAX),
+				(uintmax_t)midx_chain->num_packs_in_base);
+			return 0;
+		}
+		if (unsigned_add_overflows(midx_chain->num_objects,
+					   midx_chain->num_objects_in_base)) {
+			warning(_("object count in base MIDX too high: %"PRIuMAX),
+				(uintmax_t)midx_chain->num_objects_in_base);
+			return 0;
+		}
+		midx->num_packs_in_base = midx_chain->num_packs +
+			midx_chain->num_packs_in_base;
+		midx->num_objects_in_base = midx_chain->num_objects +
+			midx_chain->num_objects_in_base;
+	}
+
+	midx->base_midx = midx_chain;
+	midx->has_chain = 1;
+
+	return 1;
+}
+
+static struct multi_pack_index *load_midx_chain_fd_st(const char *object_dir,
+						      int local,
+						      int fd, struct stat *st,
+						      int *incomplete_chain)
+{
+	struct multi_pack_index *midx_chain = NULL;
+	struct strbuf buf = STRBUF_INIT;
+	struct object_id *layers = NULL;
+	int valid = 1;
+	uint32_t i, count;
+	FILE *fp = xfdopen(fd, "r");
+
+	count = st->st_size / (the_hash_algo->hexsz + 1);
+	CALLOC_ARRAY(layers, count);
+
+	for (i = 0; i < count; i++) {
+		struct multi_pack_index *m;
+
+		if (strbuf_getline_lf(&buf, fp) == EOF)
+			break;
+
+		if (get_oid_hex(buf.buf, &layers[i])) {
+			warning(_("invalid multi-pack-index chain: line '%s' "
+				  "not a hash"),
+				buf.buf);
+			valid = 0;
+			break;
+		}
+
+		valid = 0;
+
+		strbuf_reset(&buf);
+		get_split_midx_filename_ext(&buf, object_dir, layers[i].hash,
+					    MIDX_EXT_MIDX);
+		m = load_multi_pack_index_one(object_dir, buf.buf, local);
+
+		if (m) {
+			if (add_midx_to_chain(m, midx_chain, layers, i)) {
+				midx_chain = m;
+				valid = 1;
+			} else {
+				close_midx(m);
+			}
+		}
+		if (!valid) {
+			warning(_("unable to find all multi-pack index files"));
+			break;
+		}
+	}
+
+	free(layers);
+	fclose(fp);
+	strbuf_release(&buf);
+
+	*incomplete_chain = !valid;
+	return midx_chain;
+}
+
+static struct multi_pack_index *load_multi_pack_index_chain(const char *object_dir,
+							    int local)
+{
+	struct strbuf chain_file = STRBUF_INIT;
+	struct stat st;
+	int fd;
+	struct multi_pack_index *m = NULL;
+
+	get_midx_chain_filename(&chain_file, object_dir);
+	if (open_multi_pack_index_chain(chain_file.buf, &fd, &st)) {
+		int incomplete;
+		/* ownership of fd is taken over by load function */
+		m = load_midx_chain_fd_st(object_dir, local, fd, &st,
+					  &incomplete);
+	}
+
+	strbuf_release(&chain_file);
+	return m;
+}
+
+struct multi_pack_index *load_multi_pack_index(const char *object_dir,
+					       int local)
+{
+	struct strbuf midx_name = STRBUF_INIT;
+	struct multi_pack_index *m;
+
+	get_midx_filename(&midx_name, object_dir);
+
+	m = load_multi_pack_index_one(object_dir, midx_name.buf, local);
+	if (!m)
+		m = load_multi_pack_index_chain(object_dir, local);
+
+	strbuf_release(&midx_name);
+
+	return m;
+}
+
 void close_midx(struct multi_pack_index *m)
 {
 	uint32_t i;
@@ -228,6 +391,7 @@  void close_midx(struct multi_pack_index *m)
 		return;
 
 	close_midx(m->next);
+	close_midx(m->base_midx);
 
 	munmap((unsigned char *)m->data, m->data_len);
 
diff --git a/midx.h b/midx.h
index 86af7dfc5e..94de16a8c4 100644
--- a/midx.h
+++ b/midx.h
@@ -24,6 +24,7 @@  struct bitmapped_pack;
 #define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
 #define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
 #define MIDX_CHUNKID_REVINDEX 0x52494458 /* "RIDX" */
+#define MIDX_CHUNKID_BASE 0x42415345 /* "BASE" */
 #define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t))
 #define MIDX_LARGE_OFFSET_NEEDED 0x80000000
 
@@ -50,6 +51,7 @@  struct multi_pack_index {
 	int preferred_pack_idx;
 
 	int local;
+	int has_chain;
 
 	const unsigned char *chunk_pack_names;
 	size_t chunk_pack_names_len;
@@ -80,11 +82,16 @@  struct multi_pack_index {
 
 #define MIDX_EXT_REV "rev"
 #define MIDX_EXT_BITMAP "bitmap"
+#define MIDX_EXT_MIDX "midx"
 
 const unsigned char *get_midx_checksum(struct multi_pack_index *m);
 void get_midx_filename(struct strbuf *out, const char *object_dir);
 void get_midx_filename_ext(struct strbuf *out, const char *object_dir,
 			   const unsigned char *hash, const char *ext);
+void get_midx_chain_dirname(struct strbuf *buf, const char *object_dir);
+void get_midx_chain_filename(struct strbuf *buf, const char *object_dir);
+void get_split_midx_filename_ext(struct strbuf *buf, const char *object_dir,
+				 const unsigned char *hash, const char *ext);
 
 struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local);
 int prepare_midx_pack(struct repository *r, struct multi_pack_index *m, uint32_t pack_int_id);
diff --git a/packfile.c b/packfile.c
index d4df7fdeea..85f0345435 100644
--- a/packfile.c
+++ b/packfile.c
@@ -878,7 +878,8 @@  static void prepare_pack(const char *full_name, size_t full_name_len,
 	if (!report_garbage)
 		return;
 
-	if (!strcmp(file_name, "multi-pack-index"))
+	if (!strcmp(file_name, "multi-pack-index") ||
+	    !strcmp(file_name, "multi-pack-index.d"))
 		return;
 	if (starts_with(file_name, "multi-pack-index") &&
 	    (ends_with(file_name, ".bitmap") || ends_with(file_name, ".rev")))
@@ -1062,7 +1063,7 @@  struct packed_git *get_all_packs(struct repository *r)
 	prepare_packed_git(r);
 	for (m = r->objects->multi_pack_index; m; m = m->next) {
 		uint32_t i;
-		for (i = 0; i < m->num_packs; i++)
+		for (i = 0; i < m->num_packs + m->num_packs_in_base; i++)
 			prepare_midx_pack(r, m, i);
 	}
 
diff --git a/t/helper/test-read-midx.c b/t/helper/test-read-midx.c
index 4acae41bb9..f9148328e3 100644
--- a/t/helper/test-read-midx.c
+++ b/t/helper/test-read-midx.c
@@ -7,8 +7,10 @@ 
 #include "packfile.h"
 #include "setup.h"
 #include "gettext.h"
+#include "pack-revindex.h"
 
-static int read_midx_file(const char *object_dir, int show_objects)
+static int read_midx_file(const char *object_dir, const char *checksum,
+			  int show_objects)
 {
 	uint32_t i;
 	struct multi_pack_index *m;
@@ -19,6 +21,13 @@  static int read_midx_file(const char *object_dir, int show_objects)
 	if (!m)
 		return 1;
 
+	if (checksum) {
+		while (m && strcmp(hash_to_hex(get_midx_checksum(m)), checksum))
+			m = m->base_midx;
+		if (!m)
+			return 1;
+	}
+
 	printf("header: %08x %d %d %d %d\n",
 	       m->signature,
 	       m->version,
@@ -52,7 +61,8 @@  static int read_midx_file(const char *object_dir, int show_objects)
 		struct pack_entry e;
 
 		for (i = 0; i < m->num_objects; i++) {
-			nth_midxed_object_oid(&oid, m, i);
+			nth_midxed_object_oid(&oid, m,
+					      i + m->num_objects_in_base);
 			fill_midx_entry(the_repository, &oid, &e, m);
 
 			printf("%s %"PRIu64"\t%s\n",
@@ -109,7 +119,7 @@  static int read_midx_bitmapped_packs(const char *object_dir)
 	if (!midx)
 		return 1;
 
-	for (i = 0; i < midx->num_packs; i++) {
+	for (i = 0; i < midx->num_packs + midx->num_packs_in_base; i++) {
 		if (nth_bitmapped_pack(the_repository, midx, &pack, i) < 0)
 			return 1;
 
@@ -125,16 +135,16 @@  static int read_midx_bitmapped_packs(const char *object_dir)
 
 int cmd__read_midx(int argc, const char **argv)
 {
-	if (!(argc == 2 || argc == 3))
-		usage("read-midx [--show-objects|--checksum|--preferred-pack|--bitmap] <object-dir>");
+	if (!(argc == 2 || argc == 3 || argc == 4))
+		usage("read-midx [--show-objects|--checksum|--preferred-pack|--bitmap] <object-dir> <checksum>");
 
 	if (!strcmp(argv[1], "--show-objects"))
-		return read_midx_file(argv[2], 1);
+		return read_midx_file(argv[2], argv[3], 1);
 	else if (!strcmp(argv[1], "--checksum"))
 		return read_midx_checksum(argv[2]);
 	else if (!strcmp(argv[1], "--preferred-pack"))
 		return read_midx_preferred_pack(argv[2]);
 	else if (!strcmp(argv[1], "--bitmap"))
 		return read_midx_bitmapped_packs(argv[2]);
-	return read_midx_file(argv[1], 0);
+	return read_midx_file(argv[1], argv[2], 0);
 }