From dc2c44fbb100fa609174d9069a70e2b54b0591ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Sat, 18 Dec 2021 20:50:02 +0100 Subject: [PATCH 1/2] grep/pcre2: use PCRE2_UTF even with ASCII patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit compile_pcre2_pattern() currently uses the option PCRE2_UTF only for patterns with non-ASCII characters. Patterns with ASCII wildcards can match non-ASCII strings, though, and without that option PCRE2 mishandles UTF-8 input -- it matches parts of multi-byte characters. Fix that by using PCRE2_UTF even for ASCII-only patterns. This is a remake of the reverted ae39ba431a (grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data, 2021-10-15). The change to the condition and the test are simplified and more targeted. Original-patch-by: Hamza Mahfooz Signed-off-by: René Scharfe Signed-off-by: Junio C Hamano --- grep.c | 2 +- t/t7812-grep-icase-non-ascii.sh | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/grep.c b/grep.c index fe847a0111..5badb6d851 100644 --- a/grep.c +++ b/grep.c @@ -382,7 +382,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt } options |= PCRE2_CASELESS; } - if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) && + if (!opt->ignore_locale && is_utf8_locale() && !(!opt->ignore_case && (p->fixed || p->is_fixed))) options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF); diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index e5d1e4ea68..ca3f24f807 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -123,4 +123,10 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2,PCRE2_MATCH_INVALID_UTF 'PCRE v2: gr test_cmp invalid-0xe5 actual ' +test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-literal ASCII from UTF-8' ' + git grep --perl-regexp -h -o -e ll. 
file >actual && + echo "lló" >expected && + test_cmp expected actual +' + test_done From 32e3e8bc551e7b10bbda07110ae7cb15442d0392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Sat, 18 Dec 2021 20:53:15 +0100 Subject: [PATCH 2/2] grep/pcre2: factor out literal variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patterns that contain no wildcards and don't have to be case-folded are literal. Give this condition a name to increase the readability of the boolean expression for enabling the option PCRE2_UTF. Signed-off-by: René Scharfe Signed-off-by: Junio C Hamano --- grep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grep.c b/grep.c index 5badb6d851..2b6ac3205d 100644 --- a/grep.c +++ b/grep.c @@ -362,6 +362,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt int jitret; int patinforet; size_t jitsizearg; + int literal = !opt->ignore_case && (p->fixed || p->is_fixed); /* * Call pcre2_general_context_create() before calling any @@ -382,8 +383,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt } options |= PCRE2_CASELESS; } - if (!opt->ignore_locale && is_utf8_locale() && - !(!opt->ignore_case && (p->fixed || p->is_fixed))) + if (!opt->ignore_locale && is_utf8_locale() && !literal) options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF); #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER