diff mbox series

[6/8] connected: implement connectivity check via temporary object dirs

Message ID d01685a3ec82c3415654779c122927388fc433db.1621451532.git.ps@pks.im (mailing list archive)
State New, archived
Headers show
Series Speed up connectivity checks via quarantine dir | expand

Commit Message

Patrick Steinhardt May 19, 2021, 7:13 p.m. UTC
The connectivity checks are currently implemented via git-rev-list(1):
we simply ignore any preexisting refs via `--not --all`, and pass all
new refs which are to be checked via its stdin. While this works well
enough for repositories which do not have a lot of references, it's
clear that `--not --all` will linearly scale with the number of refs
which do exist: for each reference, we'll walk through its commit as
well as its five parent commits (defined via `SLOP`). Given that many
major hosting sites which use a pull/merge request workflow keep refs to
the request's HEAD, this effectively means that `--not --all` will do a
nearly complete graph walk.

We can compute the set of pushed objects much more easily though. We
know exactly which objects were just received, which is the set of
objects which are in git-receive-pack(1)'s object quarantine directory.
While this may overshoot new objects because the client may have sent
objects which we already know, it lets us avoid doing a graph walk of
preexisting commits.

The implementation of this is quite simple: we iterate through all loose
and packed objects in the object directory, and for each object we check
whether its immediate referenced objects exist. There is no need to do a
walk beyond these direct links: if the receiving repository already has
the directly referenced object, then it's not a new object in the first
place and thus does not need a re-check. If the referenced object is a
new one and we can load it, then we know that it will eventually be
checked via the quarantined objects walk, too.

Finally, after we have concluded that all quarantined objects are indeed
fully connected, the only thing left to check is whether all reference
updates reference an existing object.

This new connectivity check will be wired up in a subsequent commit.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 connected.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 connected.h |  19 ++++++
 2 files changed, 211 insertions(+)
diff mbox series

Patch

diff --git a/connected.c b/connected.c
index b18299fdf0..ab0eb9f974 100644
--- a/connected.c
+++ b/connected.c
@@ -6,6 +6,13 @@ 
 #include "transport.h"
 #include "packfile.h"
 #include "promisor-remote.h"
+#include "object.h"
+#include "tree-walk.h"
+#include "commit.h"
+#include "tag.h"
+#include "progress.h"
+#include "tmp-objdir.h"
+#include "oidset.h"
 
 /*
  * If we feed all the commits we want to verify to this command
@@ -151,3 +158,188 @@  int check_connected(oid_iterate_fn fn, void *cb_data,
 	sigchain_pop(SIGPIPE);
 	return finish_command(&rev_list) || err;
 }
+
+struct connected_payload {
+	struct repository *repo;
+	struct progress *progress;
+	struct oidset seen;
+	size_t checked_objects;
+	size_t missing_objects;
+};
+
+static void check_missing(struct connected_payload *payload, const struct object_id *oid)
+{
+	if (oidset_contains(&payload->seen, oid))
+		return;
+	if (oid_object_info(payload->repo, oid, NULL) <= 0)
+		payload->missing_objects++;
+	oidset_insert(&payload->seen, oid);
+	display_progress(payload->progress, ++payload->checked_objects);
+}
+
+static int check_quarantined_object(const struct object_id *oid, enum object_type type,
+				    void *content, unsigned long size, void *data)
+{
+	struct connected_payload *payload = data;
+	struct object *object;
+	int eaten;
+
+	object = parse_object_buffer(payload->repo, oid, type, size, content, &eaten);
+	if (!object) {
+		if (!eaten)
+			free(content);
+		error(_("could not parse quarantined object %s"), oid_to_hex(oid));
+		return -1;
+	}
+	if (!eaten)
+		free(content);
+
+	if (type == OBJ_TREE) {
+		struct tree *tree = (struct tree *)object;
+		struct tree_desc tree_desc;
+		struct name_entry entry;
+
+		if (init_tree_desc_gently(&tree_desc, tree->buffer, tree->size))
+			return -1;
+		while (tree_entry_gently(&tree_desc, &entry))
+			check_missing(payload, &entry.oid);
+
+		free_tree_buffer(tree);
+	} else if (type == OBJ_COMMIT) {
+		struct commit *commit = (struct commit *) object;
+		struct commit_list *parents;
+
+		check_missing(payload, get_commit_tree_oid(commit));
+		for (parents = commit->parents; parents; parents = parents->next)
+			check_missing(payload, &parents->item->object.oid);
+	} else if (type == OBJ_TAG) {
+		check_missing(payload, get_tagged_oid((struct tag *) object));
+	} else {
+		error(_("unexpected object type"));
+		return -1;
+	}
+
+	return 0;
+}
+
+static int loose_object_iterator(const struct object_id *oid,
+				 const char *path, void *_data)
+{
+	struct connected_payload *payload = _data;
+	enum object_type type;
+	unsigned long size;
+	void *content;
+
+	if (read_loose_object(path, oid, &type, &size, NULL) < 0)
+		return -1;
+
+	oidset_insert(&payload->seen, oid);
+	display_progress(payload->progress, ++payload->checked_objects);
+
+	if (type == OBJ_BLOB)
+		return 0;
+	if (read_loose_object(path, oid, &type, &size, &content) < 0)
+		return -1;
+
+	return check_quarantined_object(oid, type, content, size, payload);
+}
+
+static int packed_object_iterator(const struct object_id *oid,
+				  struct packed_git *pack, uint32_t pos,
+				  void *_data)
+{
+	off_t off = nth_packed_object_offset(pack, pos);
+	struct connected_payload *payload = _data;
+	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
+	unsigned long size;
+	void *content;
+
+	oi.typep = &type;
+	if (packed_object_info(payload->repo, pack, off, &oi) < 0)
+		return -1;
+
+	oidset_insert(&payload->seen, oid);
+	display_progress(payload->progress, ++payload->checked_objects);
+
+	if (type == OBJ_BLOB)
+		return 0;
+
+	oi.contentp = &content;
+	oi.sizep = &size;
+
+	if (packed_object_info(payload->repo, pack, off, &oi) < 0)
+		return -1;
+
+	return check_quarantined_object(oid, type, content, size, payload);
+}
+
+static int pack_dir_iterator(const char *full_path, size_t full_path_len,
+			     const char *file_path, void *data)
+{
+	if (ends_with(full_path, ".idx")) {
+		struct packed_git *pack = add_packed_git(full_path, full_path_len, 1);
+		if (!pack) {
+			error(_("unable to add quarantined pack"));
+			return -1;
+		}
+
+		if (for_each_object_in_pack(pack, packed_object_iterator, data,
+					    FOR_EACH_OBJECT_PACK_ORDER) < 0) {
+			error(_("unable to iterate packed objects"));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int check_connected_quarantine(struct repository *repo,
+			       struct tmp_objdir *quarantine,
+			       oid_iterate_fn fn, void *cb_data)
+{
+	struct connected_payload payload = {
+		.repo = repo,
+		.seen = OIDSET_INIT,
+	};
+	struct object_id oid;
+	int ret;
+
+	payload.progress = start_delayed_progress("Checking connectivity", 0);
+
+	/*
+	 * Iterate through all loose and packed objects part of the object
+	 * quarantine and verify that for each of them, all directly referenced
+	 * objects exist.
+	 */
+	if (for_each_loose_file_in_objdir(tmp_objdir_path(quarantine),
+					  loose_object_iterator, NULL,
+					  NULL, &payload) < 0) {
+		error(_("unable to check quarantined loose objects"));
+		ret = -1;
+		goto out;
+	}
+
+	if (for_each_file_in_pack_dir(tmp_objdir_path(quarantine),
+				      pack_dir_iterator, &payload) < 0) {
+		error(_("unable to check quarantined packed objects"));
+		ret = -1;
+		goto out;
+	}
+
+	/*
+	 * After we've verified that all quarantined objects are indeed
+	 * connected, we need to verify that all new tips are contained in the
+	 * repository, too.
+	 */
+	while (!fn(cb_data, &oid)) {
+		if (oid_object_info(repo, &oid, NULL) <= 0)
+			payload.missing_objects++;
+		display_progress(payload.progress, ++payload.checked_objects);
+	}
+
+out:
+	stop_progress(&payload.progress);
+	oidset_clear(&payload.seen);
+	return ret || payload.missing_objects != 0;
+}
diff --git a/connected.h b/connected.h
index 8d5a6b3ad6..336b6f362e 100644
--- a/connected.h
+++ b/connected.h
@@ -3,6 +3,7 @@ 
 
 struct object_id;
 struct transport;
+struct tmp_objdir;
 
 /*
  * Take callback data, and return next object name in the buffer.
@@ -62,4 +63,22 @@  struct check_connected_options {
 int check_connected(oid_iterate_fn fn, void *cb_data,
 		    struct check_connected_options *opt);
 
+/*
+ * Make sure that all objects in the given quarantine directory are connected
+ * and that all OIDs returned by the given callback are backed by an existing
+ * object. The quarantine directory must have been made available to the
+ * repository via `add_to_alternates_memory()`.
+ *
+ * Given that objects are enumerated via the quarantine directory directly,
+ * this check should in general be more efficient than `check_connected()`
+ * while being more pedantic at the same time (e.g. quarantined objects which
+ * aren't referenced by anything but which do have missing links would get
+ * rejected).
+ *
+ * Return 0 if Ok, non zero otherwise (i.e. some missing objects)
+ */
+int check_connected_quarantine(struct repository *repo,
+			       struct tmp_objdir *quarantine,
+			       oid_iterate_fn fn, void *cb_data);
+
 #endif /* CONNECTED_H */