diff mbox series

[01/10] files-backend: add object check for regular ref

Message ID Z3qN1T3lJoj82ckl@ArchLinux (mailing list archive)
State Superseded
Headers show
Series add more ref consistency checks | expand

Commit Message

shejialuo Jan. 5, 2025, 1:49 p.m. UTC
Although we use "parse_loose_ref_content" to check whether the object id
is correct, we never parse it into the "struct object" structure thus we
ignore checking whether there is a real object existing in the repo and
whether the object type is correct.

Use "parse_object" to parse the oid for the regular ref content. If the
object does not exist, report the error to the user by reusing the fsck
message "BAD_REF_CONTENT".

Then, we need to check the type of the object. Just like "git-fsck(1)",
we only report "not a commit" error when the ref is a branch. Last,
update the test to exercise the code.

Mentored-by: Patrick Steinhardt <ps@pks.im>
Mentored-by: Karthik Nayak <karthik.188@gmail.com>
Signed-off-by: shejialuo <shejialuo@gmail.com>
---
 refs/files-backend.c     | 50 ++++++++++++++++++++++++++++++++--------
 t/t0602-reffiles-fsck.sh | 30 ++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 10 deletions(-)

Comments

Karthik Nayak Jan. 7, 2025, 2:17 p.m. UTC | #1
shejialuo <shejialuo@gmail.com> writes:

> Although we use "parse_loose_ref_content" to check whether the object id
> is correct, we never parse it into the "struct object" structure thus we
> ignore checking whether there is a real object existing in the repo and
> whether the object type is correct.
>
> Use "parse_object" to parse the oid for the regular ref content. If the
> object does not exist, report the error to the user by reusing the fsck
> message "BAD_REF_CONTENT".
>
> Then, we need to check the type of the object. Just like "git-fsck(1)",
> we only report "not a commit" error when the ref is a branch. Last,
> update the test to exercise the code.

I found this a bit confusing at first, the code does clear up the
confusion. Perhaps we can say something like:

  Branches that do not point to a commit type are explicitly called out,
  similar to 'git-fsck(1)'.

>
> Mentored-by: Patrick Steinhardt <ps@pks.im>
> Mentored-by: Karthik Nayak <karthik.188@gmail.com>
> Signed-off-by: shejialuo <shejialuo@gmail.com>
> ---
>  refs/files-backend.c     | 50 ++++++++++++++++++++++++++++++++--------
>  t/t0602-reffiles-fsck.sh | 30 ++++++++++++++++++++++++
>  2 files changed, 70 insertions(+), 10 deletions(-)
>
> diff --git a/refs/files-backend.c b/refs/files-backend.c
> index 64f51f0da9..0a4912c009 100644
> --- a/refs/files-backend.c
> +++ b/refs/files-backend.c
> @@ -20,6 +20,7 @@
>  #include "../lockfile.h"
>  #include "../object.h"
>  #include "../object-file.h"
> +#include "../packfile.h"
>  #include "../path.h"
>  #include "../dir.h"
>  #include "../chdir-notify.h"
> @@ -3589,6 +3590,34 @@ static int files_fsck_symref_target(struct fsck_options *o,
>  	return ret;
>  }
>
> +static int files_fsck_refs_oid(struct fsck_options *o,
> +			       struct ref_store *ref_store,
> +			       struct fsck_ref_report report,
> +			       const char *target_name,
> +			       struct object_id *oid)
> +{
> +	struct object *obj;
> +	int ret = 0;
> +
> +	if (is_promisor_object(ref_store->repo, oid))
> +		return 0;
> +
> +	obj = parse_object(ref_store->repo, oid);
> +	if (!obj) {
> +		ret |= fsck_report_ref(o, &report,
> +				       FSCK_MSG_BAD_REF_CONTENT,
> +				       "points to non-existing object %s",
> +				       oid_to_hex(oid));

Nit: The two conditionals here are mutually exclusive. So we don't have
to do `ret |=`, no? We don't even need `ret` here, we could simply do a
`return fsck_report_ref(...)`.

> +	} else if (obj->type != OBJ_COMMIT && is_branch(target_name)) {
> +		ret |= fsck_report_ref(o, &report,
> +				       FSCK_MSG_BAD_REF_CONTENT,
> +				       "points to non-commit object %s",
> +				       oid_to_hex(oid));
> +	}

Since this is a single lined if/else, we can skip the braces here.

> +	return ret;
> +}
> +
>  static int files_fsck_refs_content(struct ref_store *ref_store,
>  				   struct fsck_options *o,
>  				   const char *target_name,
> @@ -3654,18 +3683,19 @@ static int files_fsck_refs_content(struct ref_store *ref_store,
>  	}
>
>  	if (!(type & REF_ISSYMREF)) {
> +		ret |= files_fsck_refs_oid(o, ref_store, report, target_name, &oid);
> +
>  		if (!*trailing) {
> -			ret = fsck_report_ref(o, &report,
> -					      FSCK_MSG_REF_MISSING_NEWLINE,
> -					      "misses LF at the end");
> -			goto cleanup;
> -		}
> -		if (*trailing != '\n' || *(trailing + 1)) {
> -			ret = fsck_report_ref(o, &report,
> -					      FSCK_MSG_TRAILING_REF_CONTENT,
> -					      "has trailing garbage: '%s'", trailing);
> -			goto cleanup;
> +			ret |= fsck_report_ref(o, &report,
> +					       FSCK_MSG_REF_MISSING_NEWLINE,
> +					       "misses LF at the end");
> +		} else if (*trailing != '\n' || *(trailing + 1)) {
> +			ret |= fsck_report_ref(o, &report,
> +					       FSCK_MSG_TRAILING_REF_CONTENT,
> +					       "has trailing garbage: '%s'", trailing);
>  		}
> +
> +		goto cleanup;
>  	} else {
>  		ret = files_fsck_symref_target(o, &report, &referent, 0);
>  		goto cleanup;
> diff --git a/t/t0602-reffiles-fsck.sh b/t/t0602-reffiles-fsck.sh
> index d4a08b823b..75f234a94a 100755
> --- a/t/t0602-reffiles-fsck.sh
> +++ b/t/t0602-reffiles-fsck.sh
> @@ -161,8 +161,10 @@ test_expect_success 'regular ref content should be checked (individual)' '
>  	test_when_finished "rm -rf repo" &&
>  	git init repo &&
>  	branch_dir_prefix=.git/refs/heads &&
> +	tag_dir_prefix=.git/refs/tags &&
>  	cd repo &&
>  	test_commit default &&
> +	git branch branch-1 &&
>  	mkdir -p "$branch_dir_prefix/a/b" &&
>
>  	git refs verify 2>err &&
> @@ -198,6 +200,28 @@ test_expect_success 'regular ref content should be checked (individual)' '
>  	rm $branch_dir_prefix/branch-no-newline &&
>  	test_cmp expect err &&
>
> +	for non_existing_oid in "$(test_oid 001)" "$(test_oid 002)"
> +	do
> +		printf "%s\n" $non_existing_oid >$branch_dir_prefix/invalid-commit &&
> +		test_must_fail git refs verify 2>err &&
> +		cat >expect <<-EOF &&
> +		error: refs/heads/invalid-commit: badRefContent: points to non-existing object $non_existing_oid
> +		EOF
> +		rm $branch_dir_prefix/invalid-commit &&
> +		test_cmp expect err || return 1
> +	done &&
> +
> +	for tree_oid in "$(git rev-parse main^{tree})" "$(git rev-parse branch-1^{tree})"
> +	do
> +		printf "%s\n" $tree_oid >$branch_dir_prefix/branch-tree &&
> +		test_must_fail git refs verify 2>err &&
> +		cat >expect <<-EOF &&
> +		error: refs/heads/branch-tree: badRefContent: points to non-commit object $tree_oid

Reading this error here, I think it would be nicer to say
'badRefContent: branch points to ....' so we know that the specified ref
is a branch.

> +		EOF
> +		rm $branch_dir_prefix/branch-tree &&
> +		test_cmp expect err || return 1
> +	done &&
> +
>  	for trailing_content in " garbage" "    more garbage"
>  	do
>  		printf "%s" "$(git rev-parse main)$trailing_content" >$branch_dir_prefix/branch-garbage &&
> @@ -244,15 +268,21 @@ test_expect_success 'regular ref content should be checked (aggregate)' '
>  	bad_content_1=$(git rev-parse main)x &&
>  	bad_content_2=xfsazqfxcadas &&
>  	bad_content_3=Xfsazqfxcadas &&
> +	non_existing_oid=$(test_oid 001) &&
> +	tree_oid=$(git rev-parse main^{tree}) &&
>  	printf "%s" $bad_content_1 >$tag_dir_prefix/tag-bad-1 &&
>  	printf "%s" $bad_content_2 >$tag_dir_prefix/tag-bad-2 &&
>  	printf "%s" $bad_content_3 >$branch_dir_prefix/a/b/branch-bad &&
>  	printf "%s" "$(git rev-parse main)" >$branch_dir_prefix/branch-no-newline &&
>  	printf "%s garbage" "$(git rev-parse main)" >$branch_dir_prefix/branch-garbage &&
> +	printf "%s\n" $non_existing_oid >$branch_dir_prefix/branch-non-existing-oid &&
> +	printf "%s\n" $tree_oid >$branch_dir_prefix/branch-tree &&
>
>  	test_must_fail git refs verify 2>err &&
>  	cat >expect <<-EOF &&
>  	error: refs/heads/a/b/branch-bad: badRefContent: $bad_content_3
> +	error: refs/heads/branch-non-existing-oid: badRefContent: points to non-existing object $non_existing_oid
> +	error: refs/heads/branch-tree: badRefContent: points to non-commit object $tree_oid
>  	error: refs/tags/tag-bad-1: badRefContent: $bad_content_1
>  	error: refs/tags/tag-bad-2: badRefContent: $bad_content_2
>  	warning: refs/heads/branch-garbage: trailingRefContent: has trailing garbage: '\'' garbage'\''
> --
> 2.47.1
Patrick Steinhardt Jan. 16, 2025, 1:57 p.m. UTC | #2
On Sun, Jan 05, 2025 at 09:49:09PM +0800, shejialuo wrote:
> Although we use "parse_loose_ref_content" to check whether the object id
> is correct, we never parse it into the "struct object" structure thus we
> ignore checking whether there is a real object existing in the repo and
> whether the object type is correct.
> 
> Use "parse_object" to parse the oid for the regular ref content. If the
> object does not exist, report the error to the user by reusing the fsck
> message "BAD_REF_CONTENT".
> 
> Then, we need to check the type of the object. Just like "git-fsck(1)",
> we only report "not a commit" error when the ref is a branch. Last,
> update the test to exercise the code.

I wonder whether it wouldn't make more sense to put this into a generic
part of `git refs verify`. This isn't a check for whether the format of
the files backend is correct, but rather a check whether the refdb is
sane. As such, it also applies do the reftable backend.

So should we maybe extend `git refs verify` so that it also knows to
perform generic checks that apply independent of the backend in use?

Patrick
shejialuo Jan. 17, 2025, 1:40 p.m. UTC | #3
On Thu, Jan 16, 2025 at 02:57:25PM +0100, Patrick Steinhardt wrote:
> On Sun, Jan 05, 2025 at 09:49:09PM +0800, shejialuo wrote:
> > Although we use "parse_loose_ref_content" to check whether the object id
> > is correct, we never parse it into the "struct object" structure thus we
> > ignore checking whether there is a real object existing in the repo and
> > whether the object type is correct.
> > 
> > Use "parse_object" to parse the oid for the regular ref content. If the
> > object does not exist, report the error to the user by reusing the fsck
> > message "BAD_REF_CONTENT".
> > 
> > Then, we need to check the type of the object. Just like "git-fsck(1)",
> > we only report "not a commit" error when the ref is a branch. Last,
> > update the test to exercise the code.
> 
> I wonder whether it wouldn't make more sense to put this into a generic
> part of `git refs verify`. This isn't a check for whether the format of
> the files backend is correct, but rather a check whether the refdb is
> sane. As such, it also applies do the reftable backend.
> 
> So should we maybe extend `git refs verify` so that it also knows to
> perform generic checks that apply independent of the backend in use?
> 

I somehow understand your meaning here and I think what your meaning
here is that we could use internal ref method to parse the oid after we
check the format of the ref files. Thus, we could totally make these two
different kinds of checks separately.

However, if we have already parsed the raw ref files, we could reuse the
parsed hex and then use "parse_object" to get the object id to check.
This is the main reason why I add this check now.

And I agree with your thinking here. Actually, we may put this into
object check part. Because in "git-fsck(1)", we parse the refdb to know
whether an object is dangling or not.

I will postpone these checks in the later patches. Really thanks here
for this wonderful suggestion.

Thanks,
Jialuo
Patrick Steinhardt Jan. 24, 2025, 7:54 a.m. UTC | #4
On Fri, Jan 17, 2025 at 09:40:18PM +0800, shejialuo wrote:
> On Thu, Jan 16, 2025 at 02:57:25PM +0100, Patrick Steinhardt wrote:
> > On Sun, Jan 05, 2025 at 09:49:09PM +0800, shejialuo wrote:
> > > Although we use "parse_loose_ref_content" to check whether the object id
> > > is correct, we never parse it into the "struct object" structure thus we
> > > ignore checking whether there is a real object existing in the repo and
> > > whether the object type is correct.
> > > 
> > > Use "parse_object" to parse the oid for the regular ref content. If the
> > > object does not exist, report the error to the user by reusing the fsck
> > > message "BAD_REF_CONTENT".
> > > 
> > > Then, we need to check the type of the object. Just like "git-fsck(1)",
> > > we only report "not a commit" error when the ref is a branch. Last,
> > > update the test to exercise the code.
> > 
> > I wonder whether it wouldn't make more sense to put this into a generic
> > part of `git refs verify`. This isn't a check for whether the format of
> > the files backend is correct, but rather a check whether the refdb is
> > sane. As such, it also applies do the reftable backend.
> > 
> > So should we maybe extend `git refs verify` so that it also knows to
> > perform generic checks that apply independent of the backend in use?
> > 
> 
> I somehow understand your meaning here and I think what your meaning
> here is that we could use internal ref method to parse the oid after we
> check the format of the ref files. Thus, we could totally make these two
> different kinds of checks separately.
> 
> However, if we have already parsed the raw ref files, we could reuse the
> parsed hex and then use "parse_object" to get the object id to check.
> This is the main reason why I add this check now.
> 
> And I agree with your thinking here. Actually, we may put this into
> object check part. Because in "git-fsck(1)", we parse the refdb to know
> whether an object is dangling or not.
> 
> I will postpone these checks in the later patches. Really thanks here
> for this wonderful suggestion.

Yeah. I'm thinking ahead a bit in this context and want to avoid that we
eventually have to reimplement the same set of checks for every single
ref backend that we have. So separating the backend-generic bits from
the non-generic ones is what I'm after.

Patrick
diff mbox series

Patch

diff --git a/refs/files-backend.c b/refs/files-backend.c
index 64f51f0da9..0a4912c009 100644
--- a/refs/files-backend.c
+++ b/refs/files-backend.c
@@ -20,6 +20,7 @@ 
 #include "../lockfile.h"
 #include "../object.h"
 #include "../object-file.h"
+#include "../packfile.h"
 #include "../path.h"
 #include "../dir.h"
 #include "../chdir-notify.h"
@@ -3589,6 +3590,34 @@  static int files_fsck_symref_target(struct fsck_options *o,
 	return ret;
 }
 
+static int files_fsck_refs_oid(struct fsck_options *o,
+			       struct ref_store *ref_store,
+			       struct fsck_ref_report report,
+			       const char *target_name,
+			       struct object_id *oid)
+{
+	struct object *obj;
+	int ret = 0;
+
+	if (is_promisor_object(ref_store->repo, oid))
+		return 0;
+
+	obj = parse_object(ref_store->repo, oid);
+	if (!obj) {
+		ret |= fsck_report_ref(o, &report,
+				       FSCK_MSG_BAD_REF_CONTENT,
+				       "points to non-existing object %s",
+				       oid_to_hex(oid));
+	} else if (obj->type != OBJ_COMMIT && is_branch(target_name)) {
+		ret |= fsck_report_ref(o, &report,
+				       FSCK_MSG_BAD_REF_CONTENT,
+				       "points to non-commit object %s",
+				       oid_to_hex(oid));
+	}
+
+	return ret;
+}
+
 static int files_fsck_refs_content(struct ref_store *ref_store,
 				   struct fsck_options *o,
 				   const char *target_name,
@@ -3654,18 +3683,19 @@  static int files_fsck_refs_content(struct ref_store *ref_store,
 	}
 
 	if (!(type & REF_ISSYMREF)) {
+		ret |= files_fsck_refs_oid(o, ref_store, report, target_name, &oid);
+
 		if (!*trailing) {
-			ret = fsck_report_ref(o, &report,
-					      FSCK_MSG_REF_MISSING_NEWLINE,
-					      "misses LF at the end");
-			goto cleanup;
-		}
-		if (*trailing != '\n' || *(trailing + 1)) {
-			ret = fsck_report_ref(o, &report,
-					      FSCK_MSG_TRAILING_REF_CONTENT,
-					      "has trailing garbage: '%s'", trailing);
-			goto cleanup;
+			ret |= fsck_report_ref(o, &report,
+					       FSCK_MSG_REF_MISSING_NEWLINE,
+					       "misses LF at the end");
+		} else if (*trailing != '\n' || *(trailing + 1)) {
+			ret |= fsck_report_ref(o, &report,
+					       FSCK_MSG_TRAILING_REF_CONTENT,
+					       "has trailing garbage: '%s'", trailing);
 		}
+
+		goto cleanup;
 	} else {
 		ret = files_fsck_symref_target(o, &report, &referent, 0);
 		goto cleanup;
diff --git a/t/t0602-reffiles-fsck.sh b/t/t0602-reffiles-fsck.sh
index d4a08b823b..75f234a94a 100755
--- a/t/t0602-reffiles-fsck.sh
+++ b/t/t0602-reffiles-fsck.sh
@@ -161,8 +161,10 @@  test_expect_success 'regular ref content should be checked (individual)' '
 	test_when_finished "rm -rf repo" &&
 	git init repo &&
 	branch_dir_prefix=.git/refs/heads &&
+	tag_dir_prefix=.git/refs/tags &&
 	cd repo &&
 	test_commit default &&
+	git branch branch-1 &&
 	mkdir -p "$branch_dir_prefix/a/b" &&
 
 	git refs verify 2>err &&
@@ -198,6 +200,28 @@  test_expect_success 'regular ref content should be checked (individual)' '
 	rm $branch_dir_prefix/branch-no-newline &&
 	test_cmp expect err &&
 
+	for non_existing_oid in "$(test_oid 001)" "$(test_oid 002)"
+	do
+		printf "%s\n" $non_existing_oid >$branch_dir_prefix/invalid-commit &&
+		test_must_fail git refs verify 2>err &&
+		cat >expect <<-EOF &&
+		error: refs/heads/invalid-commit: badRefContent: points to non-existing object $non_existing_oid
+		EOF
+		rm $branch_dir_prefix/invalid-commit &&
+		test_cmp expect err || return 1
+	done &&
+
+	for tree_oid in "$(git rev-parse main^{tree})" "$(git rev-parse branch-1^{tree})"
+	do
+		printf "%s\n" $tree_oid >$branch_dir_prefix/branch-tree &&
+		test_must_fail git refs verify 2>err &&
+		cat >expect <<-EOF &&
+		error: refs/heads/branch-tree: badRefContent: points to non-commit object $tree_oid
+		EOF
+		rm $branch_dir_prefix/branch-tree &&
+		test_cmp expect err || return 1
+	done &&
+
 	for trailing_content in " garbage" "    more garbage"
 	do
 		printf "%s" "$(git rev-parse main)$trailing_content" >$branch_dir_prefix/branch-garbage &&
@@ -244,15 +268,21 @@  test_expect_success 'regular ref content should be checked (aggregate)' '
 	bad_content_1=$(git rev-parse main)x &&
 	bad_content_2=xfsazqfxcadas &&
 	bad_content_3=Xfsazqfxcadas &&
+	non_existing_oid=$(test_oid 001) &&
+	tree_oid=$(git rev-parse main^{tree}) &&
 	printf "%s" $bad_content_1 >$tag_dir_prefix/tag-bad-1 &&
 	printf "%s" $bad_content_2 >$tag_dir_prefix/tag-bad-2 &&
 	printf "%s" $bad_content_3 >$branch_dir_prefix/a/b/branch-bad &&
 	printf "%s" "$(git rev-parse main)" >$branch_dir_prefix/branch-no-newline &&
 	printf "%s garbage" "$(git rev-parse main)" >$branch_dir_prefix/branch-garbage &&
+	printf "%s\n" $non_existing_oid >$branch_dir_prefix/branch-non-existing-oid &&
+	printf "%s\n" $tree_oid >$branch_dir_prefix/branch-tree &&
 
 	test_must_fail git refs verify 2>err &&
 	cat >expect <<-EOF &&
 	error: refs/heads/a/b/branch-bad: badRefContent: $bad_content_3
+	error: refs/heads/branch-non-existing-oid: badRefContent: points to non-existing object $non_existing_oid
+	error: refs/heads/branch-tree: badRefContent: points to non-commit object $tree_oid
 	error: refs/tags/tag-bad-1: badRefContent: $bad_content_1
 	error: refs/tags/tag-bad-2: badRefContent: $bad_content_2
 	warning: refs/heads/branch-garbage: trailingRefContent: has trailing garbage: '\'' garbage'\''