diff mbox series

ls-files: support --recurse-submodules --stage

Message ID 20220218223212.1139366-1-jonathantanmy@google.com (mailing list archive)
State Superseded
Headers show
Series ls-files: support --recurse-submodules --stage | expand

Commit Message

Jonathan Tan Feb. 18, 2022, 10:32 p.m. UTC
e77aa336f1 ("ls-files: optionally recurse into submodules", 2016-10-10)
taught ls-files the --recurse-submodules argument, but only in a limited
set of circumstances. In particular, --stage was unsupported, perhaps
because there was no repo_find_unique_abbrev(), which was only
introduced in 8bb95572b0 ("sha1-name.c: add
repo_find_unique_abbrev_r()", 2019-04-16). This function is needed for
using --recurse-submodules with --stage.

Now that we have repo_find_unique_abbrev(), teach support for this
combination of arguments.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
I got the similar-hashing object contents from Ævar's work in [1].

[1] https://lore.kernel.org/git/patch-v7-1.6-28c01b7f8a5-20220111T130811Z-avarab@gmail.com/
---
 Documentation/git-ls-files.txt         |  2 +-
 builtin/ls-files.c                     |  4 ++--
 t/t3007-ls-files-recurse-submodules.sh | 20 +++++++++++++++++++-
 3 files changed, 22 insertions(+), 4 deletions(-)

Comments

Junio C Hamano Feb. 19, 2022, 12:33 a.m. UTC | #1
Jonathan Tan <jonathantanmy@google.com> writes:

> diff --git a/builtin/ls-files.c b/builtin/ls-files.c
> index f7ea56cc63..e791b65e7e 100644
> --- a/builtin/ls-files.c
> +++ b/builtin/ls-files.c
> @@ -244,7 +244,7 @@ static void show_ce(struct repository *repo, struct dir_struct *dir,
>  			printf("%s%06o %s %d\t",
>  			       tag,
>  			       ce->ce_mode,
> -			       find_unique_abbrev(&ce->oid, abbrev),
> +			       repo_find_unique_abbrev(repo, &ce->oid, abbrev),
>  			       ce_stage(ce));
>  		}

Quite straightforward.  At this point, repo is the repository we
are currently working in (which will be updated to the submodule
repository by show_submodule() and passed down to show_files()), so
the only thing we need to do is to make sure we use that repo
consistently.  Makes sense.

>  		write_eolinfo(repo->index, ce, fullname);
> @@ -726,7 +726,7 @@ int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
>  		setup_work_tree();
>  
>  	if (recurse_submodules &&
> -	    (show_stage || show_deleted || show_others || show_unmerged ||
> +	    (show_deleted || show_others || show_unmerged ||
>  	     show_killed || show_modified || show_resolve_undo || with_tree))
>  		die("ls-files --recurse-submodules unsupported mode");
>  
> diff --git a/t/t3007-ls-files-recurse-submodules.sh b/t/t3007-ls-files-recurse-submodules.sh
> index 4a08000713..3d2da360d1 100755
> --- a/t/t3007-ls-files-recurse-submodules.sh
> +++ b/t/t3007-ls-files-recurse-submodules.sh
> @@ -34,6 +34,25 @@ test_expect_success 'ls-files correctly outputs files in submodule' '
>  	test_cmp expect actual
>  '
>  
> +test_expect_success '--stage' '
> +	# In order to test hash abbreviation, write two objects that have the
> +	# same first 4 hexadecimal characters in their (SHA-1) hashes.
> +	echo brocdnra >submodule/c &&
> +	git -C submodule commit -am "update c" &&
> +	echo brigddsv >submodule/c &&
> +	git -C submodule commit -am "update c again" &&
> +
> +	cat >expect <<-\EOF &&
> +	100644 6da7 0	.gitmodules
> +	100644 7898 0	a
> +	100644 6178 0	b/b
> +	100644 dead9 0	submodule/c
> +	EOF
> +
> +	git ls-files --stage --recurse-submodules --abbrev=4 >actual &&
> +	test_cmp expect actual
> +'
> +
>  test_expect_success 'ls-files correctly outputs files in submodule with -z' '
>  	lf_to_nul >expect <<-\EOF &&
>  	.gitmodules
> @@ -292,7 +311,6 @@ test_incompatible_with_recurse_submodules () {
>  test_incompatible_with_recurse_submodules --deleted
>  test_incompatible_with_recurse_submodules --modified
>  test_incompatible_with_recurse_submodules --others
> -test_incompatible_with_recurse_submodules --stage
>  test_incompatible_with_recurse_submodules --killed
>  test_incompatible_with_recurse_submodules --unmerged
Ævar Arnfjörð Bjarmason Feb. 19, 2022, 3:11 a.m. UTC | #2
On Fri, Feb 18 2022, Jonathan Tan wrote:

> e77aa336f1 ("ls-files: optionally recurse into submodules", 2016-10-10)
> taught ls-files the --recurse-submodules argument, but only in a limited
> set of circumstances. In particular, --stage was unsupported, perhaps
> because there was no repo_find_unique_abbrev(), which was only
> introduced in 8bb95572b0 ("sha1-name.c: add
> repo_find_unique_abbrev_r()", 2019-04-16). This function is needed for
> using --recurse-submodules with --stage.
>
> Now that we have repo_find_unique_abbrev(), teach support for this
> combination of arguments.
>
> Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> ---
> I got the similar-hashing object contents from Ævar's work in [1].

Hah! FWIW that was made by this script I hacked up at the time:
	
	#!/usr/bin/env perl
	# Brute-force search for candidate strings whose git-style object hash
	# has the same first 4 hex characters under both SHA-1 and SHA-256.
	# Each match is printed as "hash(<string>) = [<sha1 prefix>, <sha256 prefix>]".
	#
	# Configuration is read from environment variables:
	#   s      - starting candidate string (default "s")
	#   type   - object type placed in the hashed header (default "blob")
	#   prefix - fixed string prepended to every candidate (default "")
	#   want   - if non-empty, only SHA-1 hashes starting with this are kept
	use v5.32.0;
	use strict;
	use warnings;
	use Digest::SHA qw(sha1_hex sha256_hex);
	
	# Usage:
	## prefix= type=bad git find-colliding-hashes | tee garbage-coll-bad.txt
	## prefix= type=bad want=bad git find-colliding-hashes | tee garbage-coll-bad.txt
	
	# Turn on autoflush for STDOUT so matches show up immediately (e.g. via tee).
	$| = 1;
	my $s = $ENV{s} // "s";
	# Remembers which 4-character prefixes have already been reported.
	my %seen;
	my $type = $ENV{type} // "blob";
	my $prefix = $ENV{prefix} // "";
	my $want = $ENV{want} // "";
	# Post-increment: the loop test sees the old value and the body sees the
	# incremented one, so the starting value itself is never hashed.  For an
	# alphanumeric string such as the default "s", ++ is Perl's string
	# increment ("s" -> "t" -> ... -> "z" -> "aa"), so the loop has no
	# natural end and is meant to be interrupted by the user.
	while ($s++) {
		my $str = $prefix . $s;
		# +1 accounts for the trailing "\n" appended to the payload below.
		my $l = length($str) + 1;
		# Hashed payload: "<type> <length>\0" header followed by the content.
		my $p = "$type $l\0$str\n";
		my $o = sha1_hex($p);
		# When $want is set, skip candidates whose SHA-1 does not start with it.
		next if length $want && index($o, $want) != 0;
		my $n = sha256_hex($p);
		# Compare only the first 4 hex characters of each digest.
		my $os = substr($o, 0, 4);
		my $ns = substr($n, 0, 4);
		if ($os eq $ns) {
			# " SEEN" flags a prefix that an earlier candidate already produced.
			say "hash($str) = [$os, $ns]" . ($seen{$os} ? " SEEN" : "");
			$seen{$os} = 1;
		}
	}

https://gist.github.com/avar/9e4c2bde7fbdc888b031713065a9eaf6 has some
more colliding blob prefixes, which I generated until I got bored with
it...

> +test_expect_success '--stage' '
> +	# In order to test hash abbreviation, write two objects that have the
> +	# same first 4 hexadecimal characters in their (SHA-1) hashes.
> +	echo brocdnra >submodule/c &&
> +	git -C submodule commit -am "update c" &&
> +	echo brigddsv >submodule/c &&
> +	git -C submodule commit -am "update c again" &&
> +
> +	cat >expect <<-\EOF &&
> +	100644 6da7 0	.gitmodules
> +	100644 7898 0	a
> +	100644 6178 0	b/b
> +	100644 dead9 0	submodule/c
> +	EOF

This test though will break, as you can see with:

    GIT_TEST_DEFAULT_HASH=sha256 ./t3007-ls-files-recurse-submodules.sh

So you'll need at least something like:

diff --git a/t/t3007-ls-files-recurse-submodules.sh b/t/t3007-ls-files-recurse-submodules.sh
index 3d2da360d17..0fe69da8dcf 100755
--- a/t/t3007-ls-files-recurse-submodules.sh
+++ b/t/t3007-ls-files-recurse-submodules.sh
@@ -42,10 +42,10 @@ test_expect_success '--stage' '
 	echo brigddsv >submodule/c &&
 	git -C submodule commit -am "update c again" &&
 
-	cat >expect <<-\EOF &&
-	100644 6da7 0	.gitmodules
-	100644 7898 0	a
-	100644 6178 0	b/b
+	cat >expect <<-EOF &&
+	100644 $(git rev-parse --short=4 HEAD:.gitmodules) 0	.gitmodules
+	100644 $(git rev-parse --short=4 HEAD:a) 0	a
+	100644 $(git rev-parse --short=4 HEAD:b/b) 0	b/b
 	100644 dead9 0	submodule/c
 	EOF
 
But then the problem is that one is dead9 and the other dead6; I was
just trying to find 4-char prefixes.

But having indulged in all that, I'm now entirely confused about why any
of this needs to be tested here.

You're adding --stage, which will give us --stage-y output, and it was
previously incompatible with --recurse-submodules. Having the two
combine is good!

But why do we need to test the OID abbreviation at all, isn't that a bit
too much paranoia? Isn't it sufficient to just do:

    opts="--stage --abbrev=4" &&
    git -C submodule ls-files $opts >expect &&
    git ls-files --recurse-submodules $opts --stage >raw &&
    grep submodule raw >actual &&
    test_cmp expect actual

Or well, then the path won't be the same, but I think you get the
idea.

I.e. don't we just want to test that the submodule is indeed included
here, not that some particular feature works in combination with it.

Supposing that repo_find_unique_abbrev() won't work might be a bit too
much paranoia, and I'm more test-happy than most :)

I'd think that if we should test anything it would be more meaningful to
e.g. test the sort order of the returned entries.

Your test case won't disambiguate between index entries being returned
in sort order vs. just "submodules at the end", since "s" sorts after
0, a and b.

Presumably it does the former, but I'd think distinguishing those would
be one meaningful test of actual --recurse-submodules --stage
functionality.
Taylor Blau Feb. 19, 2022, 3:50 a.m. UTC | #3
On Sat, Feb 19, 2022 at 04:11:30AM +0100, Ævar Arnfjörð Bjarmason wrote:
> But why do we need to test the OID abbreviation at all, isn't that a bit
> too much paranoia? Isn't it sufficient just do:
>
>     opts="--stage --abbrev=4" &&
>     git -C submodule ls-files $opts >expect &&
>     git ls-files --recurse-submodules $opts --stage >raw &&
>     grep submodule raw >actual &&
>     test_cmp expect actual

Yeah; I like this direction. I try to err on the side of reconstructing
the whole output and then calling test_cmp on it. I usually go that way
because it's nice for somebody reading the test script to see what the
output is supposed to look like.

But because you have to have a bunch of $(git rev-parse --short=4 ...)
in subshells everywhere in order to produce the right output, I don't
think the result resembles the actual output in this case.

I'd probably go a bit further than what you propose, maybe replacing
that grep with:

    grep "0.submodule" actual

to make sure that we got the correct stage number, too. But I agree with
you on the general direction.

Thanks,
Taylor
Junio C Hamano Feb. 21, 2022, 1:48 a.m. UTC | #4
Jonathan Tan <jonathantanmy@google.com> writes:

> +test_expect_success '--stage' '
> +	# In order to test hash abbreviation, write two objects that have the
> +	# same first 4 hexadecimal characters in their (SHA-1) hashes.

What about linux-sha256 jobs?
e.g. https://github.com/git/git/runs/5267681569?check_suite_focus=true

> +	echo brocdnra >submodule/c &&
> +	git -C submodule commit -am "update c" &&
> +	echo brigddsv >submodule/c &&
> +	git -C submodule commit -am "update c again" &&
> +
> +	cat >expect <<-\EOF &&
> +	100644 6da7 0	.gitmodules
> +	100644 7898 0	a
> +	100644 6178 0	b/b
> +	100644 dead9 0	submodule/c
> +	EOF
> +
> +	git ls-files --stage --recurse-submodules --abbrev=4 >actual &&
> +	test_cmp expect actual
> +'
> +
>  test_expect_success 'ls-files correctly outputs files in submodule with -z' '
>  	lf_to_nul >expect <<-\EOF &&
>  	.gitmodules
> @@ -292,7 +311,6 @@ test_incompatible_with_recurse_submodules () {
>  test_incompatible_with_recurse_submodules --deleted
>  test_incompatible_with_recurse_submodules --modified
>  test_incompatible_with_recurse_submodules --others
> -test_incompatible_with_recurse_submodules --stage
>  test_incompatible_with_recurse_submodules --killed
>  test_incompatible_with_recurse_submodules --unmerged
Taylor Blau Feb. 21, 2022, 2:45 a.m. UTC | #5
On Sun, Feb 20, 2022 at 05:48:38PM -0800, Junio C Hamano wrote:
> Jonathan Tan <jonathantanmy@google.com> writes:
>
> > +test_expect_success '--stage' '
> > +	# In order to test hash abbreviation, write two objects that have the
> > +	# same first 4 hexadecimal characters in their (SHA-1) hashes.
>
> What about linux-sha256 jobs?
> e.g. https://github.com/git/git/runs/5267681569?check_suite_focus=true

Yeah; this is indeed broken. See the discussion beginning at [1] for
some options.

[1]: https://lore.kernel.org/git/220219.868ru7fsad.gmgdl@evledraar.gmail.com/

Thanks,
Taylor
Junio C Hamano Feb. 21, 2022, 6:19 p.m. UTC | #6
Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> This test though will break, as you can see with:
>
>     GIT_TEST_DEFAULT_HASH=sha256 ./t3007-ls-files-recurse-submodules.sh
>
> So you'll need at least something like:
>
> diff --git a/t/t3007-ls-files-recurse-submodules.sh b/t/t3007-ls-files-recurse-submodules.sh
> index 3d2da360d17..0fe69da8dcf 100755
> --- a/t/t3007-ls-files-recurse-submodules.sh
> +++ b/t/t3007-ls-files-recurse-submodules.sh
> @@ -42,10 +42,10 @@ test_expect_success '--stage' '
>  	echo brigddsv >submodule/c &&
>  	git -C submodule commit -am "update c again" &&
>  
> -	cat >expect <<-\EOF &&
> -	100644 6da7 0	.gitmodules
> -	100644 7898 0	a
> -	100644 6178 0	b/b
> +	cat >expect <<-EOF &&
> +	100644 $(git rev-parse --short=4 HEAD:.gitmodules) 0	.gitmodules
> +	100644 $(git rev-parse --short=4 HEAD:a) 0	a
> +	100644 $(git rev-parse --short=4 HEAD:b/b) 0	b/b
>  	100644 dead9 0	submodule/c
>  	EOF
>  
> But then the problem is that one is dead9 and the other dead6, I was
> just trying to find 4-char prefixes.
>
> But having indulged in all that, I'm now entirely confused about why any
> of this needs to be tested here.
>
> You're adding --stage, which will give us --stage-y output, and it was
> previously incompatible with --recurse-submodules. Having the two
> combine is good!

I think what this is trying to make sure is that it (1) enabled the
combination and (2) uses the object store of the submodule when
disambiguating names of the objects from the submodule, because the
author suspects that the reason why these two options were made
incompatible in the first place was because a long time ago there
wasn't a way to ask "here is an object name---please uniquify in the
context of _that_ repository".  So it is understandable to prepare
an object X in a submodule and another object Y in the superproject,
such that the abbreviated name of X in the context of the submodule
is different from the abbreviated name of X in the context of the
superproject (i.e. if X were in the superproject's object store,
because the object names of X and Y share the prefix, it may require
longer prefix to disambiguate from Y), and make sure that the uniquify
is indeed happening in the context of the submodule.

So, you are only concentrating on (1) but forgetting why the author
wants (2).
Ævar Arnfjörð Bjarmason Feb. 21, 2022, 6:51 p.m. UTC | #7
On Mon, Feb 21 2022, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>> This test though will break, as you can see with:
>>
>>     GIT_TEST_DEFAULT_HASH=sha256 ./t3007-ls-files-recurse-submodules.sh
>>
>> So you'll need at least something like:
>>
>> diff --git a/t/t3007-ls-files-recurse-submodules.sh b/t/t3007-ls-files-recurse-submodules.sh
>> index 3d2da360d17..0fe69da8dcf 100755
>> --- a/t/t3007-ls-files-recurse-submodules.sh
>> +++ b/t/t3007-ls-files-recurse-submodules.sh
>> @@ -42,10 +42,10 @@ test_expect_success '--stage' '
>>  	echo brigddsv >submodule/c &&
>>  	git -C submodule commit -am "update c again" &&
>>  
>> -	cat >expect <<-\EOF &&
>> -	100644 6da7 0	.gitmodules
>> -	100644 7898 0	a
>> -	100644 6178 0	b/b
>> +	cat >expect <<-EOF &&
>> +	100644 $(git rev-parse --short=4 HEAD:.gitmodules) 0	.gitmodules
>> +	100644 $(git rev-parse --short=4 HEAD:a) 0	a
>> +	100644 $(git rev-parse --short=4 HEAD:b/b) 0	b/b
>>  	100644 dead9 0	submodule/c
>>  	EOF
>>  
>> But then the problem is that one is dead9 and the other dead6, I was
>> just trying to find 4-char prefixes.
>>
>> But having indulged in all that, I'm now entirely confused about why any
>> of this needs to be tested here.
>>
>> You're adding --stage, which will give us --stage-y output, and it was
>> previously incompatible with --recurse-submodules. Having the two
>> combine is good!
>
> I think what this is trying to make sure is that it (1) enabled the
> combination and (2) uses the object store of the submodule when
> disambiguating names of the objects from the submodule, because the
> author suspects that the reason why these two options were made
> incompatible in the first place was because long time ago there
> wasn't a way to ask "here is an object name---please uniquify in the
> context of _that_ repository".  So it is understandable to prepare
> an object X in a submodule and another object Y in the superproject,
> such that the abbreviated name of X in the context of the submodule
> is different from the abbreviated name of X in the context of the
> superproject (i.e. if X were in the superproject's object store,
> because the object names of X and Y share the prefix, it may require
> longer prefix to disambiguate from Y), and make sure that the uniquify
> is indeed happening in the context of the submodule.
>
> So, you are only concentrating on (1) but forgetting why the author
> wants (2).

Indeed. That makes sense, but it would really help to e.g. have the test
description make that goal explicit.
Jonathan Tan Feb. 24, 2022, 12:11 a.m. UTC | #8
Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
> 
> On Mon, Feb 21 2022, Junio C Hamano wrote:
> > I think what this is trying to make sure is that it (1) enabled the
> > combination and (2) uses the object store of the submodule when
> > disambiguating names of the objects from the submodule, because the
> > author suspects that the reason why these two options were made
> > incompatible in the first place was because long time ago there
> > wasn't a way to ask "here is an object name---please uniquify in the
> > context of _that_ repository".  So it is understandable to prepare
> > an object X in a submodule and another object Y in the superproject,
> > such that the abbreviated name of X in the context of the submodule
> > is different from the abbreviated name of X in the context of the
> > superproject (i.e. if X were in the superproject's object store,
> > because the object names of X and Y share the prefix, it may require
> > longer prefix to disambiguate from Y), and make sure that the uniquify
> > is indeed happening in the context of the submodule.
> >
> > So, you are only concentrating on (1) but forgetting why the author
> > wants (2).
> 
> Indeed. That makes sense, but it would really help to e.g. have the test
> description make that goal explicit.

That indeed was why I did that (thanks, Junio). It looks like it doesn't
work with other hash algorithms, and maybe it's too much paranoia on my
part. I'll send an updated version following Ævar's suggestion.
diff mbox series

Patch

diff --git a/Documentation/git-ls-files.txt b/Documentation/git-ls-files.txt
index 48cc7c0b6f..0dabf3f0dd 100644
--- a/Documentation/git-ls-files.txt
+++ b/Documentation/git-ls-files.txt
@@ -156,7 +156,7 @@  a space) at the start of each line:
 
 --recurse-submodules::
 	Recursively calls ls-files on each active submodule in the repository.
-	Currently there is only support for the --cached mode.
+	Currently there is only support for the --cached and --stage modes.
 
 --abbrev[=<n>]::
 	Instead of showing the full 40-byte hexadecimal object
diff --git a/builtin/ls-files.c b/builtin/ls-files.c
index f7ea56cc63..e791b65e7e 100644
--- a/builtin/ls-files.c
+++ b/builtin/ls-files.c
@@ -244,7 +244,7 @@  static void show_ce(struct repository *repo, struct dir_struct *dir,
 			printf("%s%06o %s %d\t",
 			       tag,
 			       ce->ce_mode,
-			       find_unique_abbrev(&ce->oid, abbrev),
+			       repo_find_unique_abbrev(repo, &ce->oid, abbrev),
 			       ce_stage(ce));
 		}
 		write_eolinfo(repo->index, ce, fullname);
@@ -726,7 +726,7 @@  int cmd_ls_files(int argc, const char **argv, const char *cmd_prefix)
 		setup_work_tree();
 
 	if (recurse_submodules &&
-	    (show_stage || show_deleted || show_others || show_unmerged ||
+	    (show_deleted || show_others || show_unmerged ||
 	     show_killed || show_modified || show_resolve_undo || with_tree))
 		die("ls-files --recurse-submodules unsupported mode");
 
diff --git a/t/t3007-ls-files-recurse-submodules.sh b/t/t3007-ls-files-recurse-submodules.sh
index 4a08000713..3d2da360d1 100755
--- a/t/t3007-ls-files-recurse-submodules.sh
+++ b/t/t3007-ls-files-recurse-submodules.sh
@@ -34,6 +34,25 @@  test_expect_success 'ls-files correctly outputs files in submodule' '
 	test_cmp expect actual
 '
 
+test_expect_success '--stage' '
+	# In order to test hash abbreviation, write two objects that have the
+	# same first 4 hexadecimal characters in their (SHA-1) hashes.
+	echo brocdnra >submodule/c &&
+	git -C submodule commit -am "update c" &&
+	echo brigddsv >submodule/c &&
+	git -C submodule commit -am "update c again" &&
+
+	cat >expect <<-\EOF &&
+	100644 6da7 0	.gitmodules
+	100644 7898 0	a
+	100644 6178 0	b/b
+	100644 dead9 0	submodule/c
+	EOF
+
+	git ls-files --stage --recurse-submodules --abbrev=4 >actual &&
+	test_cmp expect actual
+'
+
 test_expect_success 'ls-files correctly outputs files in submodule with -z' '
 	lf_to_nul >expect <<-\EOF &&
 	.gitmodules
@@ -292,7 +311,6 @@  test_incompatible_with_recurse_submodules () {
 test_incompatible_with_recurse_submodules --deleted
 test_incompatible_with_recurse_submodules --modified
 test_incompatible_with_recurse_submodules --others
-test_incompatible_with_recurse_submodules --stage
 test_incompatible_with_recurse_submodules --killed
 test_incompatible_with_recurse_submodules --unmerged