diff mbox series

[v11,3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data

Message ID 20211007203148.23888-3-someguy@effective-light.com (mailing list archive)
State Superseded
Headers show
Series [v11,1/3] grep: refactor next_match() and match_one_pattern() for external use | expand

Commit Message

Hamza Mahfooz Oct. 7, 2021, 8:31 p.m. UTC
If we attempt to grep non-ascii log message text with an ascii pattern, we
run into the following issue:

    $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
    grep: (standard input): binary file matches

So, to fix this teach the grep code to mark the pattern as UTF-8 (even if
the pattern is composed of only ascii characters), so long as the log
output is encoded using UTF-8.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
 grep.c                          |  6 +++--
 grep.h                          |  1 +
 revision.c                      |  2 ++
 t/t7812-grep-icase-non-ascii.sh | 48 +++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 2 deletions(-)

Comments

Junio C Hamano Oct. 8, 2021, 9:26 p.m. UTC | #1
Hamza Mahfooz <someguy@effective-light.com> writes:

> If we attempt to grep non-ascii log message text with an ascii pattern, we
> run into the following issue:
>
>     $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
>     grep: (standard input): binary file matches
>
> So, to fix this teach the grep code to mark the pattern as UTF-8 (even if
> the pattern is composed of only ascii characters), so long as the log
> output is encoded using UTF-8.

We'd need this only if we are using pcre2 backend, no?  If that is
the case, that fact needs to be recorded in the proposed log message
to help later developers, when they wonder why this "all-the-things"
knob exists.

And if it is the case that this bit is needed only to work around a
glitch while using pcre2 backend, I'd rather want to see a solution
that does not need to contaminate the more generic "struct grep_opt"
data and "setup_revisions()" codepath.

In other words, can't the function compile_pcre2_pattern() make the
"is log output encoding utf8?" decision locally and act accordingly?

Thanks.
Junio C Hamano Oct. 9, 2021, 6:44 a.m. UTC | #2
Hamza Mahfooz <someguy@effective-light.com> writes:

> diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
> index e5d1e4ea68..42323b31c0 100755
> --- a/t/t7812-grep-icase-non-ascii.sh
> +++ b/t/t7812-grep-icase-non-ascii.sh
> @@ -53,6 +53,54 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
> ...
> +	git add file5 &&
> +	export GIT_COMMITTER_NAME="Ç O Mîtter" &&
> +	export GIT_COMMITTER_EMAIL="committer@example.com" &&

These lines are flagged as an error by test-lint; I've queued this
on top to make today's integration result to pass our tests.

Thanks.


 t/t7812-grep-icase-non-ascii.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index 42323b31c0..313d4894ad 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -72,8 +72,8 @@ test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern o
 	EOF
 	test_write_lines "fifth" >file5 &&
 	git add file5 &&
-	export GIT_COMMITTER_NAME="Ç O Mîtter" &&
-	export GIT_COMMITTER_EMAIL="committer@example.com" &&
+	GIT_COMMITTER_NAME="Ç O Mîtter" GIT_COMMITTER_EMAIL="committer@example.com" &&
+	export GIT_COMMITTER_NAME GIT_COMMITTER_EMAIL &&
 	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
 	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
 	grep Commit: log >actual.raw &&
Hamza Mahfooz Oct. 9, 2021, 3:52 p.m. UTC | #3
On Fri, Oct 8 2021 at 11:44:14 PM -0700, Junio C Hamano 
<gitster@pobox.com> wrote:
> These lines are flagged as an error by test-lint; I've queued this
> on top to make today's integration result to pass our tests.

I fixed this issue in v12 [1] btw.

[1] 
https://lkml.kernel.org/r/20211008224918.603392-3-someguy@effective-light.com
diff mbox series

Patch

diff --git a/grep.c b/grep.c
index fe847a0111..978d30f053 100644
--- a/grep.c
+++ b/grep.c
@@ -382,8 +382,10 @@  static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 		}
 		options |= PCRE2_CASELESS;
 	}
-	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
-	    !(!opt->ignore_case && (p->fixed || p->is_fixed)))
+	if ((opt->utf8_all_the_things && !has_non_ascii(p->pattern)) ||
+	    (!opt->ignore_locale && is_utf8_locale() &&
+	     has_non_ascii(p->pattern) && !(!opt->ignore_case &&
+					    (p->fixed || p->is_fixed))))
 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
 
 #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
diff --git a/grep.h b/grep.h
index 808ad76f0c..c9ddd637d1 100644
--- a/grep.h
+++ b/grep.h
@@ -167,6 +167,7 @@  struct grep_opt {
 	int extended_regexp_option;
 	int pattern_type_option;
 	int ignore_locale;
+	int utf8_all_the_things;
 	char colors[NR_GREP_COLORS][COLOR_MAXLEN];
 	unsigned pre_context;
 	unsigned post_context;
diff --git a/revision.c b/revision.c
index 0dabb5a0bc..0d751dceb7 100644
--- a/revision.c
+++ b/revision.c
@@ -2874,6 +2874,8 @@  int setup_revisions(int argc, const char **argv, struct rev_info *revs, struct s
 				 &revs->grep_filter);
 	if (!is_encoding_utf8(get_log_output_encoding()))
 		revs->grep_filter.ignore_locale = 1;
+	else
+		revs->grep_filter.utf8_all_the_things = 1;
 	compile_grep_patterns(&revs->grep_filter);
 
 	if (revs->reverse && revs->reflog_info)
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index e5d1e4ea68..42323b31c0 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -53,6 +53,54 @@  test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
 	test_cmp expected actual
 '
 
+test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
+	cat >expected <<-\EOF &&
+	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
+	EOF
+	test_write_lines "forth" >file4 &&
+	git add file4 &&
+	git commit --author="À Ú Thor <author@example.com>" -m sécond &&
+	git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
+	grep Author log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' '
+	cat >expected <<-\EOF &&
+	Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
+	EOF
+	test_write_lines "fifth" >file5 &&
+	git add file5 &&
+	export GIT_COMMITTER_NAME="Ç O Mîtter" &&
+	export GIT_COMMITTER_EMAIL="committer@example.com" &&
+	git -c i18n.commitEncoding=latin1 commit -m thïrd &&
+	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
+	grep Commit: log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' '
+	cat >expected <<-\EOF &&
+	    sé<BOLD;RED>con<RESET>d
+	EOF
+	git log -1 --color=always --perl-regexp --grep="con" >log &&
+	grep con log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' '
+	cat >expected <<-\EOF &&
+	    <BOLD;RED>thïrd<RESET>
+	EOF
+	git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log &&
+	grep "th.*rd" log >actual.raw &&
+	test_decode_color <actual.raw >actual &&
+	test_cmp expected actual
+'
+
 test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' '
 	printf "\\200\\n" >invalid-0x80 &&
 	echo "ævar" >expected &&