From ecbff141a1c6e4a32a43f90db70cde2b2199b241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:08 +0200 Subject: [PATCH 01/22] grep/pcre2 tests: reword comments referring to kwset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kwset optimization has not been used by grep since 48de2a768cf (grep: remove the kwset optimization, 2019-07-01). Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t7816-grep-binary-pattern.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/t/t7816-grep-binary-pattern.sh b/t/t7816-grep-binary-pattern.sh index 60bab291e49..9d67a5fc4cf 100755 --- a/t/t7816-grep-binary-pattern.sh +++ b/t/t7816-grep-binary-pattern.sh @@ -59,7 +59,7 @@ test_expect_success 'setup' " git commit -m. " -# Simple fixed-string matching that can use kwset (no -i && non-ASCII) +# Simple fixed-string matching nul_match P P P '-F' 'yQf' nul_match P P P '-F' 'yQx' nul_match P P P '-Fi' 'YQf' @@ -78,7 +78,7 @@ nul_match P P P '-Fi' '[Y]QF' nul_match P P P '-F' 'æQ[ð]' nul_match P P P '-F' '[æ]Qð' -# The -F kwset codepath can't handle -i && non-ASCII... +# Matching pattern and subject case with -i nul_match P 1 1 '-i' '[æ]Qð' # ...PCRE v2 only matches non-ASCII with -i casefolding under UTF-8 From 6d0a40166e87a78ca17fddeee25228ded92a169e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:09 +0200 Subject: [PATCH 02/22] pickaxe tests: refactor to use test_commit --append --printf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the existing tests added in e0e7cb8080c (log -G: ignore binary files, 2018-12-14) to use the --append option I added in 3373518cc8b (test-lib functions: add an --append option to test_commit, 2021-01-12) and the --printf option added as part of an in-flight topic of mine this commit depends on. While I'm at it change some of the setup of the test to use a more sensible pattern, e.g. setting up a temporary repo instead of creating an orphan branch. Since the -G and -S options will behave the same way with truncated and removed content also change the "git rm" to emptying data.bin, that's just catering to how test_commit works. The resulting test is shorter. See also f5d79bf7dd6 (tests: refactor a few tests to use "test_commit --append", 2021-01-12) for prior similar refactoring. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t4209-log-pickaxe.sh | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index 5d06f5f45ea..ad45d8cfd0a 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -107,37 +107,35 @@ test_expect_success 'log -S --no-textconv (missing textconv tool)' ' ' test_expect_success 'setup log -[GS] binary & --text' ' - git checkout --orphan GS-binary-and-text && - git read-tree --empty && - printf "a\na\0a\n" >data.bin && - git add data.bin && - git commit -m "create binary file" data.bin && - printf "a\na\0a\n" >>data.bin && - git commit -m "modify binary file" data.bin && - git rm data.bin && - git commit -m "delete binary file" data.bin && - git log >full-log + test_create_repo GS-bin-txt && + test_commit -C GS-bin-txt --printf A data.bin "a\na\0a\n" && + test_commit -C GS-bin-txt --append --printf B data.bin "a\na\0a\n" && + test_commit -C GS-bin-txt C data.bin "" && + git -C GS-bin-txt log >full-log ' test_expect_success 'log -G ignores binary files' ' - git log -Ga >log && + git -C GS-bin-txt log -Ga >log && test_must_be_empty log ' test_expect_success 'log -G looks into binary files with -a' ' - git log -a -Ga >log && + git -C GS-bin-txt log -a -Ga >log && test_cmp log full-log ' test_expect_success 'log -G looks into binary files with textconv filter' ' - test_when_finished "rm .gitattributes" && - echo "* diff=bin" >.gitattributes && - git -c diff.bin.textconv=cat log -Ga >log && + test_when_finished "rm GS-bin-txt/.gitattributes" && + ( + cd GS-bin-txt && + echo "* diff=bin" >.gitattributes && + git -c diff.bin.textconv=cat log -Ga >../log + ) && test_cmp log full-log ' test_expect_success 'log -S looks into binary files' ' - git log -Sa >log && + git -C GS-bin-txt log -Sa >log && test_cmp log full-log ' From c9609398580518dffaadaace17188c901a4a4522 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:10 +0200 Subject: [PATCH 03/22] pickaxe tests: add test for diffgrep_consume() internals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In diffgrep_consume() we generate a diff, and then advance past the "+" or "-" at the start of the line for matching. This has been done ever since the code was added in f506b8e8b5f (git log/diff: add -G that greps in the patch text, 2010-08-23). If we match "line" instead of "line + 1" no tests fail, i.e. we've got zero coverage for whether any of our searches match the beginning of the line or not. Let's add a test for this. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t4209-log-pickaxe.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index ad45d8cfd0a..eacb9f0a1b5 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -106,6 +106,21 @@ test_expect_success 'log -S --no-textconv (missing textconv tool)' ' rm .gitattributes ' +test_expect_success 'setup log -[GS] plain' ' + test_create_repo GS-plain && + test_commit -C GS-plain --append A data.txt "a" && + test_commit -C GS-plain --append B data.txt "a a" && + test_commit -C GS-plain C data.txt "" && + git -C GS-plain log >full-log +' + +test_expect_success 'log -G trims diff new/old [-+]' ' + git -C GS-plain log -G"[+-]a" >log && + test_must_be_empty log && + git -C GS-plain log -G"^a" >log && + test_cmp log full-log +' + test_expect_success 'setup log -[GS] binary & --text' ' test_create_repo GS-bin-txt && test_commit -C GS-bin-txt --printf A data.bin "a\na\0a\n" && From 69ae93089cd892d70429f7f06371860d65cb1f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:11 +0200 Subject: [PATCH 04/22] pickaxe tests: add test for "log -S" not being a regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No test in our test suite checked for "log -S" being a fixed string, as opposed to "log -S --pickaxe-regex". Let's test for it. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t4209-log-pickaxe.sh | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index eacb9f0a1b5..9fa770b5fbd 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -106,11 +106,18 @@ test_expect_success 'log -S --no-textconv (missing textconv tool)' ' rm .gitattributes ' -test_expect_success 'setup log -[GS] plain' ' +test_expect_success 'setup log -[GS] plain & regex' ' test_create_repo GS-plain && test_commit -C GS-plain --append A data.txt "a" && test_commit -C GS-plain --append B data.txt "a a" && - test_commit -C GS-plain C data.txt "" && + test_commit -C GS-plain --append C data.txt "b" && + test_commit -C GS-plain --append D data.txt "[b]" && + test_commit -C GS-plain E data.txt "" && + + # We also include E, the deletion commit + git -C GS-plain log --grep="[ABE]" >A-to-B-then-E-log && + git -C GS-plain log --grep="[CDE]" >C-to-D-then-E-log && + git -C GS-plain log --grep="[DE]" >D-then-E-log && git -C GS-plain log >full-log ' @@ -118,7 +125,24 @@ test_expect_success 'log -G trims diff new/old [-+]' ' git -C GS-plain log -G"[+-]a" >log && test_must_be_empty log && git -C GS-plain log -G"^a" >log && - test_cmp log full-log + test_cmp log A-to-B-then-E-log +' + +test_expect_success 'log -S is not a regex, but -S --pickaxe-regex is' ' + git -C GS-plain log -S"a" >log && + test_cmp log A-to-B-then-E-log && + + git -C GS-plain log -S"[a]" >log && + test_must_be_empty log && + + git -C GS-plain log -S"[a]" --pickaxe-regex >log && + test_cmp log A-to-B-then-E-log && + + git -C GS-plain log -S"[b]" >log && + test_cmp log D-then-E-log && + + git -C GS-plain log -S"[b]" --pickaxe-regex >log && + test_cmp log C-to-D-then-E-log ' test_expect_success 'setup log -[GS] binary & --text' ' From 064952fc34b9765282fec057b3af260eae7c75c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:12 +0200 Subject: [PATCH 05/22] pickaxe tests: test for -G, -S and --find-object incompatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test for the options sanity check added in 5e505257f2 (diff: properly error out when combining multiple pickaxe options, 2018-01-04). Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t4209-log-pickaxe.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index 9fa770b5fbd..21e22af1e7e 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -55,6 +55,17 @@ test_expect_success setup ' git rev-parse --verify HEAD >expect_second ' +test_expect_success 'usage' ' + test_expect_code 128 git log -Gregex -Sstring 2>err && + grep "mutually exclusive" err && + + test_expect_code 128 git log -Gregex --find-object=HEAD 2>err && + grep "mutually exclusive" err && + + test_expect_code 128 git log -Sstring --find-object=HEAD 2>err && + grep "mutually exclusive" err +' + test_log expect_initial --grep initial test_log expect_nomatch --grep InItial test_log_icase expect_initial --grep InItial From 7cd5d5b299497fb874897595c642144d7d894ee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:13 +0200 Subject: [PATCH 06/22] pickaxe tests: add missing test for --no-pickaxe-regex being an error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a missing test for --no-pickaxe-regex. This has been an error ever since before the -S or -G options were added, or since 7ae0b0cb65f (git-log (internal): more options., 2006-03-01). The reason for adding this test is that Junio suggested in [1] in response to a later test addition in this series that it might be good to support --no-pickaxe-regex in combination with -G. This would allow for fixed-string searching with -G, similr to grep's --fixed-strings mode. I agree that that would make sense if anyone would like to implement it, but since it dies right now let's first add this test to assert the existing long-standing behavior. We can always add support for --[no-]pickaxe-regex in combination with -G at some later date. 1. http://lore.kernel.org/git/xmqqwnto9pt7.fsf@gitster.g Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t4209-log-pickaxe.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index 21e22af1e7e..532bb875f02 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -66,6 +66,18 @@ test_expect_success 'usage' ' grep "mutually exclusive" err ' +test_expect_success 'usage: --no-pickaxe-regex' ' + cat >expect <<-\EOF && + fatal: unrecognized argument: --no-pickaxe-regex + EOF + + test_expect_code 128 git log -Sstring --no-pickaxe-regex 2>actual && + test_cmp expect actual && + + test_expect_code 128 git log -Gstring --no-pickaxe-regex 2>err && + test_cmp expect actual +' + test_log expect_initial --grep initial test_log expect_nomatch --grep InItial test_log_icase expect_initial --grep InItial From 188e9e28c5287f3f160b5f14e3e551b1c55c7301 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:14 +0200 Subject: [PATCH 07/22] pickaxe: die when -G and --pickaxe-regex are combined MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the -G and --pickaxe-regex options are combined we simply ignore the --pickaxe-regex option. Let's die instead as suggested by our documentation, since -G is always a regex. When --pickaxe-regex was added in d01d8c6782 (Support for pickaxe matching regular expressions, 2006-03-29) only the -S option existed. Then when -G was added in f506b8e8b5 (git log/diff: add -G that greps in the patch text, 2010-08-23) neither the documentation for --pickaxe-regex was updated accordingly, nor was something like this assertion added. Since 5bc3f0b567 (diffcore-pickaxe doc: document -S and -G properly, 2013-05-31) we've claimed that --pickaxe-regex should only be used with -S, but have silently tolerated combining it with -G, let's die instead. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diff.c | 3 +++ diff.h | 2 ++ t/t4209-log-pickaxe.sh | 5 +++++ 3 files changed, 10 insertions(+) diff --git a/diff.c b/diff.c index 4acccd9d7ed..f9e86bca04e 100644 --- a/diff.c +++ b/diff.c @@ -4628,6 +4628,9 @@ void diff_setup_done(struct diff_options *options) if (HAS_MULTI_BITS(options->pickaxe_opts & DIFF_PICKAXE_KINDS_MASK)) die(_("-G, -S and --find-object are mutually exclusive")); + if (HAS_MULTI_BITS(options->pickaxe_opts & DIFF_PICKAXE_KINDS_G_REGEX_MASK)) + die(_("-G and --pickaxe-regex are mutually exclusive, use --pickaxe-regex with -S")); + /* * Most of the time we can say "there are changes" * only by checking if there are changed paths, but diff --git a/diff.h b/diff.h index c8f3faea8aa..5e110d349be 100644 --- a/diff.h +++ b/diff.h @@ -556,6 +556,8 @@ int git_config_rename(const char *var, const char *value); #define DIFF_PICKAXE_KINDS_MASK (DIFF_PICKAXE_KIND_S | \ DIFF_PICKAXE_KIND_G | \ DIFF_PICKAXE_KIND_OBJFIND) +#define DIFF_PICKAXE_KINDS_G_REGEX_MASK (DIFF_PICKAXE_KIND_G | \ + DIFF_PICKAXE_REGEX) #define DIFF_PICKAXE_IGNORE_CASE 32 diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index 532bb875f02..772c6c1a7c8 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -66,6 +66,11 @@ test_expect_success 'usage' ' grep "mutually exclusive" err ' +test_expect_success 'usage: --pickaxe-regex' ' + test_expect_code 128 git log -Gregex --pickaxe-regex 2>err && + grep "mutually exclusive" err +' + test_expect_success 'usage: --no-pickaxe-regex' ' cat >expect <<-\EOF && fatal: unrecognized argument: --no-pickaxe-regex From d26ec8800969ea1b692e0c87100dc4235cfa12e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:15 +0200 Subject: [PATCH 08/22] pickaxe: die when --find-object and --pickaxe-all are combined MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Neither the --pickaxe-all documentation nor --find-object's has ever suggested that you can combine the two. See f506b8e8b5 (git log/diff: add -G that greps in the patch text, 2010-08-23) and 15af58c1ad (diffcore: add a pickaxe option to find a specific blob, 2018-01-04). But we've silently tolerated it, which makes the logic in diffcore_pickaxe() harder to reason about. Let's assert that we won't have the two combined. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diff.c | 3 +++ diff.h | 2 ++ t/t4209-log-pickaxe.sh | 3 +++ 3 files changed, 8 insertions(+) diff --git a/diff.c b/diff.c index f9e86bca04e..c1f47a7f013 100644 --- a/diff.c +++ b/diff.c @@ -4631,6 +4631,9 @@ void diff_setup_done(struct diff_options *options) if (HAS_MULTI_BITS(options->pickaxe_opts & DIFF_PICKAXE_KINDS_G_REGEX_MASK)) die(_("-G and --pickaxe-regex are mutually exclusive, use --pickaxe-regex with -S")); + if (HAS_MULTI_BITS(options->pickaxe_opts & DIFF_PICKAXE_KINDS_ALL_OBJFIND_MASK)) + die(_("---pickaxe-all and --find-object are mutually exclusive, use --pickaxe-all with -G and -S")); + /* * Most of the time we can say "there are changes" * only by checking if there are changed paths, but diff --git a/diff.h b/diff.h index 5e110d349be..82254396f95 100644 --- a/diff.h +++ b/diff.h @@ -558,6 +558,8 @@ int git_config_rename(const char *var, const char *value); DIFF_PICKAXE_KIND_OBJFIND) #define DIFF_PICKAXE_KINDS_G_REGEX_MASK (DIFF_PICKAXE_KIND_G | \ DIFF_PICKAXE_REGEX) +#define DIFF_PICKAXE_KINDS_ALL_OBJFIND_MASK (DIFF_PICKAXE_ALL | \ + DIFF_PICKAXE_KIND_OBJFIND) #define DIFF_PICKAXE_IGNORE_CASE 32 diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index 772c6c1a7c8..16166ffd3e6 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -63,6 +63,9 @@ test_expect_success 'usage' ' grep "mutually exclusive" err && test_expect_code 128 git log -Sstring --find-object=HEAD 2>err && + grep "mutually exclusive" err && + + test_expect_code 128 git log --pickaxe-all --find-object=HEAD 2>err && grep "mutually exclusive" err ' From a47fcbe6e412df295f1f5ffb8eca6dbe86d2b7bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:16 +0200 Subject: [PATCH 09/22] diff.h: move pickaxe fields together again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the pickaxe and pickaxe_opts fields next to each other again. In a past life they'd been on adjacent lines, but when they got moved from a global variable to the diff_options struct in 6b5ee137e5 (Diff clean-up., 2005-09-21) they got split apart. That split made sense at the time, the "char*" and "int" (flags) options were being grouped, but we've long since abandoned that pattern in the diff_options struct, and now it makes more sense to group these together again. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/diff.h b/diff.h index 82254396f95..8ba85c5e605 100644 --- a/diff.h +++ b/diff.h @@ -265,6 +265,7 @@ struct diff_options { * postimage of the diff_queue. */ const char *pickaxe; + unsigned pickaxe_opts; /* -I */ regex_t **ignore_regex; @@ -304,8 +305,6 @@ struct diff_options { /* The output format used when `diff_flush()` is run. */ int output_format; - unsigned pickaxe_opts; - /* Affects the way detection logic for complete rewrites, renames and * copies. */ From 102fdd2e07f72919060224b3eb3560524d6cf1f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:17 +0200 Subject: [PATCH 10/22] pickaxe/style: consolidate declarations and assignments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor contains() to do its assignments at the same time that it does its declarations. This code could have been refactored in ef90ab66e8e (pickaxe: use textconv for -S counting, 2012-10-28) when a function call between the declarations and assignments was removed. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diffcore-pickaxe.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index a9c6d60df22..a278b9b71d9 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -70,13 +70,9 @@ static int diff_grep(mmfile_t *one, mmfile_t *two, static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws) { - unsigned int cnt; - unsigned long sz; - const char *data; - - sz = mf->size; - data = mf->ptr; - cnt = 0; + unsigned int cnt = 0; + unsigned long sz = mf->size; + const char *data = mf->ptr; if (regexp) { regmatch_t regmatch; From d90d441c336cc120b47e025d14c3879864ce60c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:18 +0200 Subject: [PATCH 11/22] perf: add performance test for pickaxe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test for the -G and -S pickaxe options and related options. This test supports being run with GIT_TEST_LONG=1 to adjust the limit on the number of commits from 1k to 10k. The 1k limit seems to hit a good spot on git.git Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/perf/p4209-pickaxe.sh | 70 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100755 t/perf/p4209-pickaxe.sh diff --git a/t/perf/p4209-pickaxe.sh b/t/perf/p4209-pickaxe.sh new file mode 100755 index 00000000000..f585a4465ae --- /dev/null +++ b/t/perf/p4209-pickaxe.sh @@ -0,0 +1,70 @@ +#!/bin/sh + +test_description="Test pickaxe performance" + +. ./perf-lib.sh + +test_perf_default_repo + +# Not --max-count, as that's the number of matching commit, so it's +# unbounded. We want to limit our revision walk here. +from_rev_desc= +from_rev= +max_count=1000 +if test_have_prereq EXPENSIVE +then + max_count=10000 +fi +from_rev=" $(git rev-list HEAD | head -n $max_count | tail -n 1).." +from_rev_desc=" .." + +for icase in \ + '' \ + '-i ' +do + # -S (no regex) + for pattern in \ + 'int main' \ + 'æ' + do + for opts in \ + '-S' + do + test_perf "git log $icase$opts'$pattern'$from_rev_desc" " + git log --pretty=format:%H $icase$opts'$pattern'$from_rev + " + done + done + + # -S (regex) + for pattern in \ + '(int|void|null)' \ + 'if *\([^ ]+ & ' \ + '[àáâãäåæñøùúûüýþ]' + do + for opts in \ + '--pickaxe-regex -S' + do + test_perf "git log $icase$opts'$pattern'$from_rev_desc" " + git log --pretty=format:%H $icase$opts'$pattern'$from_rev + " + done + done + + # -G + for pattern in \ + '(int|void|null)' \ + 'if *\([^ ]+ & ' \ + '[àáâãäåæñøùúûüýþ]' + do + for opts in \ + '-G' + do + test_perf "git log $icase$opts'$pattern'$from_rev_desc" " + git log --pretty=format:%H $icase$opts'$pattern'$from_rev + " + done + done +done + +test_done From 03c1f14acf5169d8e6c06a60c28ee5a6cfb9fd54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:19 +0200 Subject: [PATCH 12/22] pickaxe: refactor function selection in diffcore-pickaxe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's hard to read this codepath at a glance and reason about exactly what combination of -G and -S will compile either regexes or kwset, and whether we'll then dispatch to "diff_grep" or "has_changes". Then in the "--find-object" case we aren't using the callback function, but were previously passing down "has_changes". Refactor this code to exhaustively check "opts", it's now more obvious what callback function (or none) we want under what mode. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diffcore-pickaxe.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index a278b9b71d9..953b6ec1b4a 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -228,6 +228,7 @@ void diffcore_pickaxe(struct diff_options *o) int opts = o->pickaxe_opts; regex_t regex, *regexp = NULL; kwset_t kws = NULL; + pickaxe_fn fn; if (opts & (DIFF_PICKAXE_REGEX | DIFF_PICKAXE_KIND_G)) { int cflags = REG_EXTENDED | REG_NEWLINE; @@ -235,6 +236,20 @@ void diffcore_pickaxe(struct diff_options *o) cflags |= REG_ICASE; regcomp_or_die(®ex, needle, cflags); regexp = ®ex; + + if (opts & DIFF_PICKAXE_KIND_G) + fn = diff_grep; + else if (opts & DIFF_PICKAXE_REGEX) + fn = has_changes; + else + /* + * We don't need to check the combination of + * -G and --pickaxe-regex, by the time we get + * here diff.c has already died if they're + * combined. See the usage tests in + * t4209-log-pickaxe.sh. + */ + BUG("unreachable"); } else if (opts & DIFF_PICKAXE_KIND_S) { if (o->pickaxe_opts & DIFF_PICKAXE_IGNORE_CASE && has_non_ascii(needle)) { @@ -251,10 +266,14 @@ void diffcore_pickaxe(struct diff_options *o) kwsincr(kws, needle, strlen(needle)); kwsprep(kws); } + fn = has_changes; + } else if (opts & DIFF_PICKAXE_KIND_OBJFIND) { + fn = NULL; + } else { + BUG("unknown pickaxe_opts flag"); } - pickaxe(&diff_queued_diff, o, regexp, kws, - (opts & DIFF_PICKAXE_KIND_G) ? diff_grep : has_changes); + pickaxe(&diff_queued_diff, o, regexp, kws, fn); if (regexp) regfree(regexp); From 2e197a759221921d42c5e94ae3f4897cd456ebb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:20 +0200 Subject: [PATCH 13/22] pickaxe: assert that we must have a needle under -G or -S MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Assert early in diffcore_pickaxe() that we've got a needle to work with under -G and -S. This code is redundant to the check -G and -S get from parse-options.c's get_arg(), which I'm adding a test for. This check dates back to e1b161161d (diffcore-pickaxe: fix infinite loop on zero-length needle, 2007-01-25) when "git log -S" could send this code into an infinite loop. It was then later refactored in 8fa4b09fb1 (pickaxe: hoist empty needle check, 2012-10-28) into its current form, but it seemingly wasn't noticed that in the meantime a move to the parse-options.c API in dea007fb4c (diff: parse separate options like -S foo, 2010-08-05) had made it redundant. Let's retain some of the paranoia here with a BUG(), but there's no need to be checking this in the pickaxe_match() inner loop. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diffcore-pickaxe.c | 6 +++--- t/t4209-log-pickaxe.sh | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 953b6ec1b4a..88b6ca840f6 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -132,9 +132,6 @@ static int pickaxe_match(struct diff_filepair *p, struct diff_options *o, oidset_contains(o->objfind, &p->two->oid)); } - if (!o->pickaxe[0]) - return 0; - if (o->flags.allow_textconv) { textconv_one = get_textconv(o->repo, p->one); textconv_two = get_textconv(o->repo, p->two); @@ -230,6 +227,9 @@ void diffcore_pickaxe(struct diff_options *o) kwset_t kws = NULL; pickaxe_fn fn; + if (opts & ~DIFF_PICKAXE_KIND_OBJFIND && + (!needle || !*needle)) + BUG("should have needle under -G or -S"); if (opts & (DIFF_PICKAXE_REGEX | DIFF_PICKAXE_KIND_G)) { int cflags = REG_EXTENDED | REG_NEWLINE; if (o->pickaxe_opts & DIFF_PICKAXE_IGNORE_CASE) diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index 16166ffd3e6..3f9aad0fdb0 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -56,6 +56,12 @@ test_expect_success setup ' ' test_expect_success 'usage' ' + test_expect_code 129 git log -S 2>err && + test_i18ngrep "switch.*requires a value" err && + + test_expect_code 129 git log -G 2>err && + test_i18ngrep "switch.*requires a value" err && + test_expect_code 128 git log -Gregex -Sstring 2>err && grep "mutually exclusive" err && From 52e011cd2b13e00fe40b1d59986948330b61e030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:21 +0200 Subject: [PATCH 14/22] pickaxe -S: support content with NULs under --pickaxe-regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a bug in the matching routine powering -S --pickaxe-regex so that we won't abort early on content that has NULs in it. We've had a hard requirement on REG_STARTEND since 2f8952250a8 (regex: add regexec_buf() that can work on a non NUL-terminated string, 2016-09-21), but this sanity check dates back to d01d8c67828 (Support for pickaxe matching regular expressions, 2006-03-29). It wasn't needed anymore, and as the now-passing test shows, actively getting in our way. Since we always require REG_STARTEND support we do not need to stop at NULs. If we are dealing with a haystack with NUL in it. The needle may be behind that NUL. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diffcore-pickaxe.c | 4 ++-- t/t4209-log-pickaxe.sh | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 88b6ca840f6..be0dd683b63 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -78,12 +78,12 @@ static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws) regmatch_t regmatch; int flags = 0; - while (sz && *data && + while (sz && !regexec_buf(regexp, data, sz, 1, ®match, flags)) { flags |= REG_NOTBOL; data += regmatch.rm_eo; sz -= regmatch.rm_eo; - if (sz && *data && regmatch.rm_so == regmatch.rm_eo) { + if (sz && regmatch.rm_so == regmatch.rm_eo) { data++; sz--; } diff --git a/t/t4209-log-pickaxe.sh b/t/t4209-log-pickaxe.sh index 3f9aad0fdb0..75795d0b492 100755 --- a/t/t4209-log-pickaxe.sh +++ b/t/t4209-log-pickaxe.sh @@ -215,4 +215,12 @@ test_expect_success 'log -S looks into binary files' ' test_cmp log full-log ' +test_expect_success 'log -S --pickaxe-regex looks into binary files' ' + git -C GS-bin-txt log --pickaxe-regex -Sa >log && + test_cmp log full-log && + + git -C GS-bin-txt log --pickaxe-regex -S"[a]" >log && + test_cmp log full-log +' + test_done From 5d35a9531cc9af717383bc11c2dfa4edc020ac56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:22 +0200 Subject: [PATCH 15/22] pickaxe: rename variables in has_changes() for brevity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the {one,two}_contains variables to c{1,2}. This will make a follow-up change easier to read. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diffcore-pickaxe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index be0dd683b63..23362a23597 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -108,9 +108,9 @@ static int has_changes(mmfile_t *one, mmfile_t *two, struct diff_options *o, regex_t *regexp, kwset_t kws) { - unsigned int one_contains = one ? contains(one, regexp, kws) : 0; - unsigned int two_contains = two ? contains(two, regexp, kws) : 0; - return one_contains != two_contains; + unsigned int c1 = one ? contains(one, regexp, kws) : 0; + unsigned int c2 = two ? contains(two, regexp, kws) : 0; + return c1 != c2; } static int pickaxe_match(struct diff_filepair *p, struct diff_options *o, From 5b0672a26e51387bc18adad89eaa3ebb131b2e33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:23 +0200 Subject: [PATCH 16/22] pickaxe -S: slightly optimize contains() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the "log -S" switch counts occurrences of on the pre-image and post-image of a change. As soon as we know we had e.g. 1 before and 2 now we can stop, we don't need to keep counting past 2. With this change a diff between A and B may have different performance characteristics than between B and A. That's OK in this case, since we'll emit the same output, and the effect is to make one of them better. I'm picking a check of "one" first on the assumption that it's a more common case to have files grow over time than not. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diffcore-pickaxe.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 23362a23597..b7494fdf89c 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -68,7 +68,8 @@ static int diff_grep(mmfile_t *one, mmfile_t *two, return ecbdata.hit; } -static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws) +static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws, + unsigned int limit) { unsigned int cnt = 0; unsigned long sz = mf->size; @@ -88,6 +89,9 @@ static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws) sz--; } cnt++; + + if (limit && cnt == limit) + return cnt; } } else { /* Classic exact string match */ @@ -99,6 +103,9 @@ static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws) sz -= offset + kwsm.size[0]; data += offset + kwsm.size[0]; cnt++; + + if (limit && cnt == limit) + return cnt; } } return cnt; @@ -108,8 +115,8 @@ static int has_changes(mmfile_t *one, mmfile_t *two, struct diff_options *o, regex_t *regexp, kwset_t kws) { - unsigned int c1 = one ? contains(one, regexp, kws) : 0; - unsigned int c2 = two ? contains(two, regexp, kws) : 0; + unsigned int c1 = one ? contains(one, regexp, kws, 0) : 0; + unsigned int c2 = two ? contains(two, regexp, kws, c1 + 1) : 0; return c1 != c2; } From a8d5eb6dc0d61625667b0d8155c425d3629baa12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:24 +0200 Subject: [PATCH 17/22] xdiff-interface: prepare for allowing early return MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change the function prototype of xdiff_emit_line_fn to return an "int" instead of "void". Change all of those functions to "return 0", nothing checks those return values yet, and no behavior is being changed. In subsequent commits the interface will be changed to allow early return via this new return value. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- combine-diff.c | 5 +++-- diff.c | 26 +++++++++++++++----------- diffcore-pickaxe.c | 7 ++++--- range-diff.c | 3 ++- xdiff-interface.c | 3 ++- xdiff-interface.h | 2 +- 6 files changed, 27 insertions(+), 19 deletions(-) diff --git a/combine-diff.c b/combine-diff.c index 06635f91bc2..a12d3bc0d9c 100644 --- a/combine-diff.c +++ b/combine-diff.c @@ -403,11 +403,11 @@ static void consume_hunk(void *state_, state->sline[state->nb-1].p_lno[state->n] = state->ob; } -static void consume_line(void *state_, char *line, unsigned long len) +static int consume_line(void *state_, char *line, unsigned long len) { struct combine_diff_state *state = state_; if (!state->lost_bucket) - return; /* not in any hunk yet */ + return 0; /* not in any hunk yet */ switch (line[0]) { case '-': append_lost(state->lost_bucket, state->n, line+1, len-1); @@ -417,6 +417,7 @@ static void consume_line(void *state_, char *line, unsigned long len) state->lno++; break; } + return 0; } static void combine_diff(struct repository *r, diff --git a/diff.c b/diff.c index c1f47a7f013..7a03c581c79 100644 --- a/diff.c +++ b/diff.c @@ -2336,7 +2336,7 @@ static void find_lno(const char *line, struct emit_callback *ecbdata) ecbdata->lno_in_postimage = strtol(p + 1, NULL, 10); } -static void fn_out_consume(void *priv, char *line, unsigned long len) +static int fn_out_consume(void *priv, char *line, unsigned long len) { struct emit_callback *ecbdata = priv; struct diff_options *o = ecbdata->opt; @@ -2372,7 +2372,7 @@ static void fn_out_consume(void *priv, char *line, unsigned long len) len = sane_truncate_line(line, len); find_lno(line, ecbdata); emit_hunk_header(ecbdata, line, len); - return; + return 0; } if (ecbdata->diff_words) { @@ -2382,11 +2382,11 @@ static void fn_out_consume(void *priv, char *line, unsigned long len) if (line[0] == '-') { diff_words_append(line, len, &ecbdata->diff_words->minus); - return; + return 0; } else if (line[0] == '+') { diff_words_append(line, len, &ecbdata->diff_words->plus); - return; + return 0; } else if (starts_with(line, "\\ ")) { /* * Eat the "no newline at eof" marker as if we @@ -2395,11 +2395,11 @@ static void fn_out_consume(void *priv, char *line, unsigned long len) * defer processing. If this is the end of * preimage, more "+" lines may come after it. */ - return; + return 0; } diff_words_flush(ecbdata); emit_diff_symbol(o, s, line, len, 0); - return; + return 0; } switch (line[0]) { @@ -2423,6 +2423,7 @@ static void fn_out_consume(void *priv, char *line, unsigned long len) line, len, 0); break; } + return 0; } static void pprint_rename(struct strbuf *name, const char *a, const char *b) @@ -2522,7 +2523,7 @@ static struct diffstat_file *diffstat_add(struct diffstat_t *diffstat, return x; } -static void diffstat_consume(void *priv, char *line, unsigned long len) +static int diffstat_consume(void *priv, char *line, unsigned long len) { struct diffstat_t *diffstat = priv; struct diffstat_file *x = diffstat->files[diffstat->nr - 1]; @@ -2531,6 +2532,7 @@ static void diffstat_consume(void *priv, char *line, unsigned long len) x->added++; else if (line[0] == '-') x->deleted++; + return 0; } const char mime_boundary_leader[] = "------------"; @@ -3208,7 +3210,7 @@ static void checkdiff_consume_hunk(void *priv, data->lineno = nb - 1; } -static void checkdiff_consume(void *priv, char *line, unsigned long len) +static int checkdiff_consume(void *priv, char *line, unsigned long len) { struct checkdiff_t *data = priv; int marker_size = data->conflict_marker_size; @@ -3232,7 +3234,7 @@ static void checkdiff_consume(void *priv, char *line, unsigned long len) } bad = ws_check(line + 1, len - 1, data->ws_rule); if (!bad) - return; + return 0; data->status |= bad; err = whitespace_error_string(bad); fprintf(data->o->file, "%s%s:%d: %s.\n", @@ -3244,6 +3246,7 @@ static void checkdiff_consume(void *priv, char *line, unsigned long len) } else if (line[0] == ' ') { data->lineno++; } + return 0; } static unsigned char *deflate_it(char *data, @@ -6121,17 +6124,18 @@ void flush_one_hunk(struct object_id *result, git_hash_ctx *ctx) } } -static void patch_id_consume(void *priv, char *line, unsigned long len) +static int patch_id_consume(void *priv, char *line, unsigned long len) { struct patch_id_t *data = priv; int new_len; if (len > 12 && starts_with(line, "\\ ")) - return; + return 0; new_len = remove_space(line, len); the_hash_algo->update_fn(data->ctx, line, new_len); data->patchlen += new_len; + return 0; } static void patch_id_add_string(git_hash_ctx *ctx, const char *str) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index b7494fdf89c..27aa20be350 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -19,21 +19,22 @@ struct diffgrep_cb { int hit; }; -static void diffgrep_consume(void *priv, char *line, unsigned long len) +static int diffgrep_consume(void *priv, char *line, unsigned long len) { struct diffgrep_cb *data = priv; regmatch_t regmatch; if (line[0] != '+' && line[0] != '-') - return; + return 0; if (data->hit) /* * NEEDSWORK: we should have a way to terminate the * caller early. */ - return; + return 0; data->hit = !regexec_buf(data->regexp, line + 1, len - 1, 1, ®match, 0); + return 0; } static int diff_grep(mmfile_t *one, mmfile_t *two, diff --git a/range-diff.c b/range-diff.c index 116fb0735c6..83c90f946ea 100644 --- a/range-diff.c +++ b/range-diff.c @@ -274,9 +274,10 @@ static void find_exact_matches(struct string_list *a, struct string_list *b) hashmap_clear(&map); } -static void diffsize_consume(void *data, char *line, unsigned long len) +static int diffsize_consume(void *data, char *line, unsigned long len) { (*(int *)data)++; + return 0; } static void diffsize_hunk(void *data, long ob, long on, long nb, long nn, diff --git a/xdiff-interface.c b/xdiff-interface.c index 4d20069302b..5d8c8c67dc2 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -31,7 +31,7 @@ static int xdiff_out_hunk(void *priv_, return 0; } -static void consume_one(void *priv_, char *s, unsigned long size) +static int consume_one(void *priv_, char *s, unsigned long size) { struct xdiff_emit_state *priv = priv_; char *ep; @@ -43,6 +43,7 @@ static void consume_one(void *priv_, char *s, unsigned long size) size -= this_size; s += this_size; } + return 0; } static int xdiff_outf(void *priv_, mmbuffer_t *mb, int nbuf) diff --git a/xdiff-interface.h b/xdiff-interface.h index 93df26900c2..0198f9632f5 100644 --- a/xdiff-interface.h +++ b/xdiff-interface.h @@ -11,7 +11,7 @@ */ #define MAX_XDIFF_SIZE (1024UL * 1024 * 1023) -typedef void (*xdiff_emit_line_fn)(void *, char *, unsigned long); +typedef int (*xdiff_emit_line_fn)(void *, char *, unsigned long); typedef void (*xdiff_emit_hunk_fn)(void *data, long old_begin, long old_nr, long new_begin, long new_nr, From 9e204422985a518ac700889d1ca4d521b3a7bfb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:25 +0200 Subject: [PATCH 18/22] xdiff-interface: allow early return from xdiff_emit_line_fn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finish the change started in the preceding commit and allow an early return from "xdiff_emit_line_fn" callbacks, this will allows diffcore-pickaxe.c to save itself redundant work. Our xdiff interface also had the limitation of not being able to abort early since the beginning, see d9ea73e0564 (combine-diff: refactor built-in xdiff interface., 2006-04-05). Although at that time "xdiff_emit_line_fn" was called "xdiff_emit_consume_fn", and "xdiff_emit_hunk_fn" didn't exist yet. There was some work in this area of xdiff-interface.[ch] recently with 3b40a090fd4 (diff: avoid generating unused hunk header lines, 2018-11-02) and 7c61e25fbf1 (diff: use hunk callback for word-diff, 2018-11-02). In combination those two changes allow us to not do any work on the hunks and diff at all, but didn't change the status quo with regards to consumers that e.g. want the diff lines, but might want to abort early. Whereas now we can abort e.g. on the first "-line" of a 1000 line diff if that's all we needed. This interface is rather scary as noted in the comment to xdiff-interface.h being added here, as noted there a future change could add more exit codes, and hack xdl_emit_diff() and friends to ignore or skip things more selectively as a result. I did not see an inherent reason for why xdl_emit_{diffrec,record}() could not be changed to ferry the "xdiff_emit_line_fn" error code upwards instead of returning -1 on all "ret < 0". But doing so would require corresponding changes in xdl_emit_diff(), xdl_diff(). I didn't see any issue with narrowly doing that to accomplish what I needed here, but it would leave xdiff's own return values in an inconsistent state. Instead I've left it at returning a more conventional (for git's own codebase) 1 for an early return, and translating it (or rather, all non-zero) to -1 for xdiff's consumption. The reason for most of the "stop" complexity in xdiff_outf() is because we want to be able to abort early, but do so in a way that doesn't skip the appropriate strbuf_reset() invocations. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- xdiff-interface.c | 18 ++++++++++++++---- xdiff-interface.h | 17 +++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/xdiff-interface.c b/xdiff-interface.c index 5d8c8c67dc2..50c0ef759dd 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -37,9 +37,12 @@ static int consume_one(void *priv_, char *s, unsigned long size) char *ep; while (size) { unsigned long this_size; + int ret; ep = memchr(s, '\n', size); this_size = (ep == NULL) ? size : (ep - s + 1); - priv->line_fn(priv->consume_callback_data, s, this_size); + ret = priv->line_fn(priv->consume_callback_data, s, this_size); + if (ret) + return ret; size -= this_size; s += this_size; } @@ -50,11 +53,14 @@ static int xdiff_outf(void *priv_, mmbuffer_t *mb, int nbuf) { struct xdiff_emit_state *priv = priv_; int i; + int stop = 0; if (!priv->line_fn) return 0; for (i = 0; i < nbuf; i++) { + if (stop) + return 1; if (mb[i].ptr[mb[i].size-1] != '\n') { /* Incomplete line */ strbuf_add(&priv->remainder, mb[i].ptr, mb[i].size); @@ -63,17 +69,21 @@ static int xdiff_outf(void *priv_, mmbuffer_t *mb, int nbuf) /* we have a complete line */ if (!priv->remainder.len) { - consume_one(priv, mb[i].ptr, mb[i].size); + stop = consume_one(priv, mb[i].ptr, mb[i].size); continue; } strbuf_add(&priv->remainder, mb[i].ptr, mb[i].size); - consume_one(priv, priv->remainder.buf, priv->remainder.len); + stop = consume_one(priv, priv->remainder.buf, priv->remainder.len); strbuf_reset(&priv->remainder); } + if (stop) + return -1; if (priv->remainder.len) { - consume_one(priv, priv->remainder.buf, priv->remainder.len); + stop = consume_one(priv, priv->remainder.buf, priv->remainder.len); strbuf_reset(&priv->remainder); } + if (stop) + return -1; return 0; } diff --git a/xdiff-interface.h b/xdiff-interface.h index 0198f9632f5..7d1724abb64 100644 --- a/xdiff-interface.h +++ b/xdiff-interface.h @@ -11,6 +11,23 @@ */ #define MAX_XDIFF_SIZE (1024UL * 1024 * 1023) +/** + * The `xdiff_emit_line_fn` function can return 1 to abort early, or 0 + * to continue processing. Note that doing so is an all-or-nothing + * affair, as returning 1 will return all the way to the top-level, + * e.g. the xdi_diff_outf() call to generate the diff. + * + * Thus returning 1 means you won't be getting any more diff lines. If + * you need something in-between those two options you'll to use + * `xdl_emit_hunk_consume_func_t` and implement your own version of + * xdl_emit_diff(). + * + * We may extend the interface in the future to understand other more + * granular return values. While you should return 1 to exit early, + * doing so will currently make your early return indistinguishable + * from an error internal to xdiff, xdiff itself will see that + * non-zero return and translate it to -1. + */ typedef int (*xdiff_emit_line_fn)(void *, char *, unsigned long); typedef void (*xdiff_emit_hunk_fn)(void *data, long old_begin, long old_nr, From fa59e7beb2a61d9cd1312b212075edf12935fdc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:26 +0200 Subject: [PATCH 19/22] pickaxe -G: terminate early on matching lines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Solve a long-standing item for "git log -Grx" of us e.g. finding "+ str" in the diff context and noting that we had a "hit", but xdiff diligently continuing to generate and spew the rest of the diff at us. This makes use of a new "early return" xdiff interface added by preceding commits. The TODO item (or, the NEEDSWORK comment) has been there since "git log -G" was implemented. See f506b8e8b5f (git log/diff: add -G that greps in the patch text, 2010-08-23). But now with the support added in the preceding changes to the xdiff-interface we can return early. Let's assert the behavior of that new early-return xdiff-interface by having a BUG() call here to die if it ever starts handing us needless work again. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diffcore-pickaxe.c | 30 +++++++++++++++++++----------- xdiff-interface.h | 4 ++++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 27aa20be350..2147afef722 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -27,13 +27,12 @@ static int diffgrep_consume(void *priv, char *line, unsigned long len) if (line[0] != '+' && line[0] != '-') return 0; if (data->hit) - /* - * NEEDSWORK: we should have a way to terminate the - * caller early. - */ - return 0; - data->hit = !regexec_buf(data->regexp, line + 1, len - 1, 1, - ®match, 0); + BUG("Already matched in diffgrep_consume! Broken xdiff_emit_line_fn?"); + if (!regexec_buf(data->regexp, line + 1, len - 1, 1, + ®match, 0)) { + data->hit = 1; + return 1; + } return 0; } @@ -45,6 +44,7 @@ static int diff_grep(mmfile_t *one, mmfile_t *two, struct diffgrep_cb ecbdata; xpparam_t xpp; xdemitconf_t xecfg; + int ret; if (!one) return !regexec_buf(regexp, two->ptr, two->size, @@ -63,10 +63,18 @@ static int diff_grep(mmfile_t *one, mmfile_t *two, ecbdata.hit = 0; xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; - if (xdi_diff_outf(one, two, discard_hunk_line, diffgrep_consume, - &ecbdata, &xpp, &xecfg)) - return 0; - return ecbdata.hit; + + /* + * An xdiff error might be our "data->hit" from above. See the + * comment for xdiff_emit_line_fn in xdiff-interface.h + */ + ret = xdi_diff_outf(one, two, discard_hunk_line, diffgrep_consume, + &ecbdata, &xpp, &xecfg); + if (ecbdata.hit) + return 1; + if (ret) + return ret; + return 0; } static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws, diff --git a/xdiff-interface.h b/xdiff-interface.h index 7d1724abb64..3b6819586da 100644 --- a/xdiff-interface.h +++ b/xdiff-interface.h @@ -27,6 +27,10 @@ * doing so will currently make your early return indistinguishable * from an error internal to xdiff, xdiff itself will see that * non-zero return and translate it to -1. + * + * See "diff_grep" in diffcore-pickaxe.c for a trick to work around + * this, i.e. using the "consume_callback_data" to note the desired + * early return. */ typedef int (*xdiff_emit_line_fn)(void *, char *, unsigned long); typedef void (*xdiff_emit_hunk_fn)(void *data, From f97fe358576c81d995843644ab58de1ebeb7fee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:27 +0200 Subject: [PATCH 20/22] pickaxe -G: don't special-case create/delete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of special-casing creations and deletions let's just generate a diff for them. This logic of not running a diff under -G if we don't have both sides dates back to the original implementation of -S in 52e9578985f ([PATCH] Introducing software archaeologist's tool "pickaxe"., 2005-05-21). In the case of -S we were not working with the xdiff interface and needed to do this, but when -G was implemented in f506b8e8b5f (git log/diff: add -G that greps in the patch text, 2010-08-23) this logic was diligently copied over. But as the performance test added earlier in this series shows, this does not make much of a difference. With: time GIT_TEST_LONG= GIT_PERF_REPEAT_COUNT=10 GIT_PERF_MAKE_OPTS='-j8 CFLAGS=-O3' ./run origin/next HEAD~ HEAD -- p4209-pickaxe.sh With the HEAD~ commit being the preceding "pickaxe -G: terminate early on matching lines" we get these results. Note that it's only the -G codepaths that are relevant to this change: Test origin/next HEAD~ HEAD ----------------------------------------------------------------------------------------------------------------------------------------- 4209.1: git log -S'int main' .. 0.35(0.32+0.03) 0.35(0.33+0.02) +0.0% 0.35(0.30+0.05) +0.0% 4209.2: git log -S'æ' .. 0.46(0.42+0.04) 0.46(0.41+0.05) +0.0% 0.46(0.42+0.04) +0.0% 4209.3: git log --pickaxe-regex -S'(int|void|null)' .. 0.65(0.62+0.02) 0.64(0.61+0.02) -1.5% 0.64(0.60+0.04) -1.5% 4209.4: git log --pickaxe-regex -S'if *\([^ ]+ & ' .. 0.52(0.45+0.06) 0.52(0.50+0.01) +0.0% 0.54(0.47+0.04) +3.8% 4209.5: git log --pickaxe-regex -S'[àáâãäåæñøùúûüýþ]' .. 0.39(0.34+0.05) 0.39(0.34+0.04) +0.0% 0.39(0.36+0.03) +0.0% 4209.6: git log -G'(int|void|null)' .. 0.60(0.55+0.04) 0.58(0.54+0.03) -3.3% 0.58(0.49+0.08) -3.3% 4209.7: git log -G'if *\([^ ]+ & ' .. 0.61(0.52+0.06) 0.59(0.53+0.05) -3.3% 0.59(0.54+0.05) -3.3% 4209.8: git log -G'[àáâãäåæñøùúûüýþ]' .. 0.61(0.51+0.07) 0.58(0.54+0.04) -4.9% 0.57(0.51+0.06) -6.6% 4209.9: git log -i -S'int main' .. 0.36(0.31+0.04) 0.36(0.34+0.02) +0.0% 0.35(0.32+0.03) -2.8% 4209.10: git log -i -S'æ' .. 0.36(0.33+0.03) 0.39(0.34+0.01) +8.3% 0.36(0.32+0.03) +0.0% 4209.11: git log -i --pickaxe-regex -S'(int|void|null)' .. 0.83(0.77+0.05) 0.82(0.77+0.05) -1.2% 0.80(0.75+0.04) -3.6% 4209.12: git log -i --pickaxe-regex -S'if *\([^ ]+ & ' .. 0.67(0.61+0.03) 0.64(0.61+0.03) -4.5% 0.63(0.61+0.02) -6.0% 4209.13: git log -i --pickaxe-regex -S'[àáâãäåæñøùúûüýþ]' .. 0.40(0.37+0.02) 0.40(0.37+0.03) +0.0% 0.40(0.36+0.04) +0.0% 4209.14: git log -i -G'(int|void|null)' .. 0.58(0.51+0.07) 0.59(0.52+0.06) +1.7% 0.58(0.52+0.05) +0.0% 4209.15: git log -i -G'if *\([^ ]+ & ' .. 0.60(0.54+0.05) 0.60(0.54+0.06) +0.0% 0.60(0.56+0.03) +0.0% 4209.16: git log -i -G'[àáâãäåæñøùúûüýþ]' .. 0.58(0.51+0.06) 0.57(0.52+0.05) -1.7% 0.60(0.48+0.09) +3.4% This small simplification really doesn't buy us much now, but I've got plans to both convert the pickaxe code to using a PCREv2 backend[1] and to implement additional pickaxe modes to do custom searches through the diff[2]. Always having the diff available under -G is going to help to simplify both of those changes. 1. https://lore.kernel.org/git/20210203032811.14979-22-avarab@gmail.com/ 2. https://lore.kernel.org/git/20190424152215.16251-3-avarab@gmail.com/ Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diffcore-pickaxe.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 2147afef722..96183f4cfab 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -40,19 +40,11 @@ static int diff_grep(mmfile_t *one, mmfile_t *two, struct diff_options *o, regex_t *regexp, kwset_t kws) { - regmatch_t regmatch; struct diffgrep_cb ecbdata; xpparam_t xpp; xdemitconf_t xecfg; int ret; - if (!one) - return !regexec_buf(regexp, two->ptr, two->size, - 1, ®match, 0); - if (!two) - return !regexec_buf(regexp, one->ptr, one->size, - 1, ®match, 0); - /* * We have both sides; need to run textual diff and see if * the pattern appears on added/deleted lines. @@ -172,9 +164,7 @@ static int pickaxe_match(struct diff_filepair *p, struct diff_options *o, mf1.size = fill_textconv(o->repo, textconv_one, p->one, &mf1.ptr); mf2.size = fill_textconv(o->repo, textconv_two, p->two, &mf2.ptr); - ret = fn(DIFF_FILE_VALID(p->one) ? &mf1 : NULL, - DIFF_FILE_VALID(p->two) ? &mf2 : NULL, - o, regexp, kws); + ret = fn(&mf1, &mf2, o, regexp, kws); if (textconv_one) free(mf1.ptr); From 22233d43eb5479d1076e034eb5cdba973fa2b02a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:28 +0200 Subject: [PATCH 21/22] xdiff users: use designated initializers for out_line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Amend the code added in 611e42a5980 (xdiff: provide a separate emit callback for hunks, 2018-11-02) to be more readable by using designated initializers. This changes "priv" in rerere.c to be initialized to NULL as we did in merge-tree.c. That's not needed as we'll only use it if the callback is defined, but being consistent here is better and less verbose. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- builtin/merge-tree.c | 5 +---- builtin/rerere.c | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c index de8520778d2..5dc94d6f880 100644 --- a/builtin/merge-tree.c +++ b/builtin/merge-tree.c @@ -107,15 +107,12 @@ static void show_diff(struct merge_list *entry) mmfile_t src, dst; xpparam_t xpp; xdemitconf_t xecfg; - xdemitcb_t ecb; + xdemitcb_t ecb = { .out_line = show_outf }; memset(&xpp, 0, sizeof(xpp)); xpp.flags = 0; memset(&xecfg, 0, sizeof(xecfg)); xecfg.ctxlen = 3; - ecb.out_hunk = NULL; - ecb.out_line = show_outf; - ecb.priv = NULL; src.ptr = origin(entry, &size); if (!src.ptr) diff --git a/builtin/rerere.c b/builtin/rerere.c index fd3be17b976..83d7a778e37 100644 --- a/builtin/rerere.c +++ b/builtin/rerere.c @@ -28,7 +28,7 @@ static int diff_two(const char *file1, const char *label1, { xpparam_t xpp; xdemitconf_t xecfg; - xdemitcb_t ecb; + xdemitcb_t ecb = { .out_line = outf }; mmfile_t minus, plus; int ret; @@ -41,8 +41,6 @@ static int diff_two(const char *file1, const char *label1, xpp.flags = 0; memset(&xecfg, 0, sizeof(xecfg)); xecfg.ctxlen = 3; - ecb.out_hunk = NULL; - ecb.out_line = outf; ret = xdi_diff(&minus, &plus, &xpp, &xecfg, &ecb); free(minus.ptr); From 5d93460024541909337d6b08a8bec10b71caaf73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 12 Apr 2021 19:15:29 +0200 Subject: [PATCH 22/22] xdiff-interface: replace discard_hunk_line() with a flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the dummy discard_hunk_line() function added in 3b40a090fd4 (diff: avoid generating unused hunk header lines, 2018-11-02) in favor of having a new XDL_EMIT_NO_HUNK_HDR flag, for use along with the two existing and similar XDL_EMIT_* flags. Unlike the recently amended xdiff_emit_line_fn interface which'll be called in a loop in xdl_emit_diff(), the hunk header is only emitted once. It makes more sense to pass this as a flag than provide a dummy callback because that function may be able to skip doing certain work if it knows the caller is doing nothing with the hunk header. It would be possible to do so in the case of -U0 now, but the benefit of doing so is so small that I haven't bothered. But this leaves the door open to that, and more importantly makes the API use more intuitive. The reason we're putting a flag in the gap between 1<<0 and 1<<2 is that the old 1<<1 flag was removed in 907681e940d (xdiff: drop XDL_EMIT_COMMON, 2016-02-23) without re-ordering the remaining flags. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- diff.c | 7 ++++--- diffcore-pickaxe.c | 3 ++- xdiff-interface.c | 6 ------ xdiff-interface.h | 8 -------- xdiff/xdiff.h | 1 + xdiff/xemit.c | 3 ++- 6 files changed, 9 insertions(+), 19 deletions(-) diff --git a/diff.c b/diff.c index 7a03c581c79..fe3abac79fe 100644 --- a/diff.c +++ b/diff.c @@ -3725,7 +3725,8 @@ static void builtin_diffstat(const char *name_a, const char *name_b, xpp.anchors_nr = o->anchors_nr; xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; - if (xdi_diff_outf(&mf1, &mf2, discard_hunk_line, + xecfg.flags = XDL_EMIT_NO_HUNK_HDR; + if (xdi_diff_outf(&mf1, &mf2, NULL, diffstat_consume, diffstat, &xpp, &xecfg)) die("unable to generate diffstat for %s", one->path); @@ -6233,8 +6234,8 @@ static int diff_get_patch_id(struct diff_options *options, struct object_id *oid xpp.flags = 0; xecfg.ctxlen = 3; - xecfg.flags = 0; - if (xdi_diff_outf(&mf1, &mf2, discard_hunk_line, + xecfg.flags = XDL_EMIT_NO_HUNK_HDR; + if (xdi_diff_outf(&mf1, &mf2, NULL, patch_id_consume, &data, &xpp, &xecfg)) return error("unable to generate patch-id diff for %s", p->one->path); diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 96183f4cfab..c88e50c6329 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -53,6 +53,7 @@ static int diff_grep(mmfile_t *one, mmfile_t *two, memset(&xecfg, 0, sizeof(xecfg)); ecbdata.regexp = regexp; ecbdata.hit = 0; + xecfg.flags = XDL_EMIT_NO_HUNK_HDR; xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; @@ -60,7 +61,7 @@ static int diff_grep(mmfile_t *one, mmfile_t *two, * An xdiff error might be our "data->hit" from above. See the * comment for xdiff_emit_line_fn in xdiff-interface.h */ - ret = xdi_diff_outf(one, two, discard_hunk_line, diffgrep_consume, + ret = xdi_diff_outf(one, two, NULL, diffgrep_consume, &ecbdata, &xpp, &xecfg); if (ecbdata.hit) return 1; diff --git a/xdiff-interface.c b/xdiff-interface.c index 50c0ef759dd..95f13a93ff9 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -126,12 +126,6 @@ int xdi_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t co return xdl_diff(&a, &b, xpp, xecfg, xecb); } -void discard_hunk_line(void *priv, - long ob, long on, long nb, long nn, - const char *func, long funclen) -{ -} - int xdi_diff_outf(mmfile_t *mf1, mmfile_t *mf2, xdiff_emit_hunk_fn hunk_fn, xdiff_emit_line_fn line_fn, diff --git a/xdiff-interface.h b/xdiff-interface.h index 3b6819586da..4301a7eef27 100644 --- a/xdiff-interface.h +++ b/xdiff-interface.h @@ -53,14 +53,6 @@ void xdiff_clear_find_func(xdemitconf_t *xecfg); int git_xmerge_config(const char *var, const char *value, void *cb); extern int git_xmerge_style; -/* - * Can be used as a no-op hunk_fn for xdi_diff_outf(), since a NULL - * one just sends the hunk line to the line_fn callback). - */ -void discard_hunk_line(void *priv, - long ob, long on, long nb, long nn, - const char *func, long funclen); - /* * Compare the strings l1 with l2 which are of size s1 and s2 respectively. * Returns 1 if the strings are deemed equal, 0 otherwise. diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h index 7a046051468..b29deca5de8 100644 --- a/xdiff/xdiff.h +++ b/xdiff/xdiff.h @@ -50,6 +50,7 @@ extern "C" { /* xdemitconf_t.flags */ #define XDL_EMIT_FUNCNAMES (1 << 0) +#define XDL_EMIT_NO_HUNK_HDR (1 << 1) #define XDL_EMIT_FUNCCONTEXT (1 << 2) /* merge simplification levels */ diff --git a/xdiff/xemit.c b/xdiff/xemit.c index 9d7d6c50874..1cbf2b9829e 100644 --- a/xdiff/xemit.c +++ b/xdiff/xemit.c @@ -278,7 +278,8 @@ int xdl_emit_diff(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb, s1 - 1, funclineprev); funclineprev = s1 - 1; } - if (xdl_emit_hunk_hdr(s1 + 1, e1 - s1, s2 + 1, e2 - s2, + if (!(xecfg->flags & XDL_EMIT_NO_HUNK_HDR) && + xdl_emit_hunk_hdr(s1 + 1, e1 - s1, s2 + 1, e2 - s2, func_line.buf, func_line.len, ecb) < 0) return -1;