From ae1db7b31c54c95b1ecb2005f7a37e3645ec9eda Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 13 Mar 2021 22:22:01 +0000 Subject: [PATCH 1/8] diffcore-rename: take advantage of "majority rules" to skip more renames In directory rename detection (when a directory is removed on one side of history and the other side adds new files to that directory), we work to find where the greatest number of files within that directory were renamed to so that the new files can be moved with the majority of the files. Naively, we can just do this by detecting renames for *all* files within the removed/renamed directory, looking at all the destination directories where files within that directory were moved, and if there is more than one such directory then taking the one with the greatest number of files as the directory where the old directory was renamed to. However, sometimes there are enough renames from exact rename detection or basename-guided rename detection that we have enough information to determine the majority winner already. Add a function meant to compute whether particular renames are still needed based on this majority rules check. The next several commits will then add the necessary infrastructure to get the information we need to compute which additional rename sources we can skip. An important side note for future further optimization: There is a possible improvement to this optimization that I have not yet attempted and will not be included in this series of patches: we could first check whether exact renames provide enough information for us to determine directory renames, and avoid doing basename-guided rename detection on some or all of the RELEVANT_LOCATION files within those directories. In effect, this variant would mean doing the handle_early_known_dir_renames() both after exact rename detection and again after basename-guided rename detection, though it would also mean decrementing the number of "unknown" renames for each rename we found from basename-guided rename detection. Adding this additional check for skippable renames right after exact rename detection might turn out to be valuable, especially for partial clones where it might allow us to download certain source files entirely. However, this particular optimization was actually the last one I did in original implementation order, and by the time I implemented this idea, every testcase I had was sufficiently fast that further optimization was unwarranted. If future testcases arise that tax rename detection more heavily (or perhaps partial clones can benefit from avoiding loading more objects), it may be worth implementing this more involved variant. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/diffcore-rename.c b/diffcore-rename.c index e8508541be1..a5d10afa221 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -1073,6 +1073,24 @@ static void remove_unneeded_paths_from_src(int detecting_copies, rename_src_nr = new_num_src; } +static void handle_early_known_dir_renames(struct dir_rename_info *info, + struct strset *relevant_sources, + struct strset *dirs_removed) +{ + /* + * Not yet implemented; directory renames are determined via an + * aggregate of all renames under them and using a "majority wins" + * rule. The fact that "majority wins", though, means we don't need + * all the renames under the given directory, we only need enough to + * ensure we have a majority. + * + * For now, we don't have enough information to know if we have a + * majority after exact renames and basename-guided rename detection, + * so just return early without doing any extra filtering. + */ + return; +} + void diffcore_rename_extended(struct diff_options *options, struct strset *relevant_sources, struct strset *dirs_removed, @@ -1208,9 +1226,16 @@ void diffcore_rename_extended(struct diff_options *options, * Cull sources, again: * - remove ones involved in renames (found via basenames) * - remove ones not found in relevant_sources + * and + * - remove ones in relevant_sources which are needed only + * for directory renames IF no ancestory directory + * actually needs to know any more individual path + * renames under them */ trace2_region_enter("diff", "cull basename", options->repo); remove_unneeded_paths_from_src(want_copies, relevant_sources); + handle_early_known_dir_renames(&info, relevant_sources, + dirs_removed); trace2_region_leave("diff", "cull basename", options->repo); } From a49b55d52e7dcfd628b6328c9098d734ebe7a97d Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 13 Mar 2021 22:22:02 +0000 Subject: [PATCH 2/8] merge-ort, diffcore-rename: tweak dirs_removed and relevant_source type As noted in the previous commit, we want to be able to take advantage of the "majority rules" portion of directory rename detection to avoid detecting more renames than necessary. However, for diffcore-rename to take advantage of that, it needs to know whether a rename source file was needed for just directory rename detection reasons, or if it is wanted for potential three-way content merging. Modify relevant_sources from a strset to a strintmap, so we can encode additional information. We also modify dirs_removed from a strset to a strintmap at the same time because trying to determine what files are needed for directory rename detection will require us tracking a bit more information for each directory. This commit only changes the types of the two variables from strset to strintmap; it does not actually store any special values yet and for now only checks for presence of entries in the strintmap. Thus, the code is functionally identical to how it behaved before. Future commits will start associating values with each key for these two maps. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 47 ++++++++++++++++++++++++----------------------- diffcore.h | 6 +++--- merge-ort.c | 28 ++++++++++++++-------------- 3 files changed, 41 insertions(+), 40 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index a5d10afa221..6487825317f 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -371,7 +371,7 @@ struct dir_rename_info { struct strintmap idx_map; struct strmap dir_rename_guess; struct strmap *dir_rename_count; - struct strset *relevant_source_dirs; + struct strintmap *relevant_source_dirs; unsigned setup; }; @@ -429,7 +429,7 @@ static void increment_count(struct dir_rename_info *info, } static void update_dir_rename_counts(struct dir_rename_info *info, - struct strset *dirs_removed, + struct strintmap *dirs_removed, const char *oldname, const char *newname) { @@ -464,7 +464,7 @@ static void update_dir_rename_counts(struct dir_rename_info *info, /* Get old_dir, skip if its directory isn't relevant. */ dirname_munge(old_dir); if (info->relevant_source_dirs && - !strset_contains(info->relevant_source_dirs, old_dir)) + !strintmap_contains(info->relevant_source_dirs, old_dir)) break; /* Get new_dir */ @@ -509,7 +509,7 @@ static void update_dir_rename_counts(struct dir_rename_info *info, } } - if (strset_contains(dirs_removed, old_dir)) + if (strintmap_contains(dirs_removed, old_dir)) increment_count(info, old_dir, new_dir); else break; @@ -527,8 +527,8 @@ static void update_dir_rename_counts(struct dir_rename_info *info, } static void initialize_dir_rename_info(struct dir_rename_info *info, - struct strset *relevant_sources, - struct strset *dirs_removed, + struct strintmap *relevant_sources, + struct strintmap *dirs_removed, struct strmap *dir_rename_count) { struct hashmap_iter iter; @@ -555,12 +555,13 @@ static void initialize_dir_rename_info(struct dir_rename_info *info, info->relevant_source_dirs = dirs_removed; /* might be NULL */ } else { info->relevant_source_dirs = xmalloc(sizeof(struct strintmap)); - strset_init(info->relevant_source_dirs); - strset_for_each_entry(relevant_sources, &iter, entry) { + strintmap_init(info->relevant_source_dirs, 0 /* unused */); + strintmap_for_each_entry(relevant_sources, &iter, entry) { char *dirname = get_dirname(entry->key); if (!dirs_removed || - strset_contains(dirs_removed, dirname)) - strset_add(info->relevant_source_dirs, dirname); + strintmap_contains(dirs_removed, dirname)) + strintmap_set(info->relevant_source_dirs, + dirname, 0 /* value irrelevant */); free(dirname); } } @@ -624,7 +625,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count) } static void cleanup_dir_rename_info(struct dir_rename_info *info, - struct strset *dirs_removed, + struct strintmap *dirs_removed, int keep_dir_rename_count) { struct hashmap_iter iter; @@ -644,7 +645,7 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info, /* relevant_source_dirs */ if (info->relevant_source_dirs && info->relevant_source_dirs != dirs_removed) { - strset_clear(info->relevant_source_dirs); + strintmap_clear(info->relevant_source_dirs); FREE_AND_NULL(info->relevant_source_dirs); } @@ -666,7 +667,7 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info, const char *source_dir = entry->key; struct strintmap *counts = entry->value; - if (!strset_contains(dirs_removed, source_dir)) { + if (!strintmap_contains(dirs_removed, source_dir)) { string_list_append(&to_remove, source_dir); strintmap_clear(counts); continue; @@ -770,8 +771,8 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info) static int find_basename_matches(struct diff_options *options, int minimum_score, struct dir_rename_info *info, - struct strset *relevant_sources, - struct strset *dirs_removed) + struct strintmap *relevant_sources, + struct strintmap *dirs_removed) { /* * When I checked in early 2020, over 76% of file renames in linux @@ -863,7 +864,7 @@ static int find_basename_matches(struct diff_options *options, /* Skip irrelevant sources */ if (relevant_sources && - !strset_contains(relevant_sources, filename)) + !strintmap_contains(relevant_sources, filename)) continue; /* @@ -994,7 +995,7 @@ static int find_renames(struct diff_score *mx, int minimum_score, int copies, struct dir_rename_info *info, - struct strset *dirs_removed) + struct strintmap *dirs_removed) { int count = 0, i; @@ -1019,7 +1020,7 @@ static int find_renames(struct diff_score *mx, } static void remove_unneeded_paths_from_src(int detecting_copies, - struct strset *interesting) + struct strintmap *interesting) { int i, new_num_src; @@ -1061,7 +1062,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies, continue; /* If we don't care about the source path, skip it */ - if (interesting && !strset_contains(interesting, one->path)) + if (interesting && !strintmap_contains(interesting, one->path)) continue; if (new_num_src < i) @@ -1074,8 +1075,8 @@ static void remove_unneeded_paths_from_src(int detecting_copies, } static void handle_early_known_dir_renames(struct dir_rename_info *info, - struct strset *relevant_sources, - struct strset *dirs_removed) + struct strintmap *relevant_sources, + struct strintmap *dirs_removed) { /* * Not yet implemented; directory renames are determined via an @@ -1092,8 +1093,8 @@ static void handle_early_known_dir_renames(struct dir_rename_info *info, } void diffcore_rename_extended(struct diff_options *options, - struct strset *relevant_sources, - struct strset *dirs_removed, + struct strintmap *relevant_sources, + struct strintmap *dirs_removed, struct strmap *dir_rename_count) { int detect_rename = options->detect_rename; diff --git a/diffcore.h b/diffcore.h index 737c93a6cc7..4f168b385fd 100644 --- a/diffcore.h +++ b/diffcore.h @@ -8,8 +8,8 @@ struct diff_options; struct repository; +struct strintmap; struct strmap; -struct strset; struct userdiff_driver; /* This header file is internal between diff.c and its diff transformers @@ -166,8 +166,8 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count); void diffcore_break(struct repository *, int); void diffcore_rename(struct diff_options *); void diffcore_rename_extended(struct diff_options *options, - struct strset *relevant_sources, - struct strset *dirs_removed, + struct strintmap *relevant_sources, + struct strintmap *dirs_removed, struct strmap *dir_rename_count); void diffcore_merge_broken(void); void diffcore_pickaxe(struct diff_options *); diff --git a/merge-ort.c b/merge-ort.c index bd089cb9a76..6fa670396ce 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -74,7 +74,7 @@ struct rename_info { /* * dirs_removed: directories removed on a given side of history. */ - struct strset dirs_removed[3]; + struct strintmap dirs_removed[3]; /* * dir_rename_count: tracking where parts of a directory were renamed to @@ -106,7 +106,7 @@ struct rename_info { * If neither of those are true, we can skip rename detection for * that path. */ - struct strset relevant_sources[3]; + struct strintmap relevant_sources[3]; /* * dir_rename_mask: @@ -362,8 +362,8 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, int i; void (*strmap_func)(struct strmap *, int) = reinitialize ? strmap_partial_clear : strmap_clear; - void (*strset_func)(struct strset *) = - reinitialize ? strset_partial_clear : strset_clear; + void (*strintmap_func)(struct strintmap *) = + reinitialize ? strintmap_partial_clear : strintmap_clear; /* * We marked opti->paths with strdup_strings = 0, so that we @@ -395,7 +395,7 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, /* Free memory used by various renames maps */ for (i = MERGE_SIDE1; i <= MERGE_SIDE2; ++i) { - strset_func(&renames->dirs_removed[i]); + strintmap_func(&renames->dirs_removed[i]); partial_clear_dir_rename_count(&renames->dir_rename_count[i]); if (!reinitialize) @@ -403,7 +403,7 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, strmap_func(&renames->dir_renames[i], 0); - strset_func(&renames->relevant_sources[i]); + strintmap_func(&renames->relevant_sources[i]); } if (!reinitialize) { @@ -674,7 +674,7 @@ static void add_pair(struct merge_options *opt, unsigned location_relevant = (dir_rename_mask == 0x07); if (content_relevant || location_relevant) - strset_add(&renames->relevant_sources[side], pathname); + strintmap_set(&renames->relevant_sources[side], pathname, 1); } one = alloc_filespec(pathname); @@ -730,9 +730,9 @@ static void collect_rename_info(struct merge_options *opt, /* absent_mask = 0x07 - dirmask; sides = absent_mask/2 */ unsigned sides = (0x07 - dirmask)/2; if (sides & 1) - strset_add(&renames->dirs_removed[1], fullname); + strintmap_set(&renames->dirs_removed[1], fullname, 1); if (sides & 2) - strset_add(&renames->dirs_removed[2], fullname); + strintmap_set(&renames->dirs_removed[2], fullname, 1); } if (filemask == 0 || filemask == 7) @@ -2161,7 +2161,7 @@ static inline int possible_side_renames(struct rename_info *renames, unsigned side_index) { return renames->pairs[side_index].nr > 0 && - !strset_empty(&renames->relevant_sources[side_index]); + !strintmap_empty(&renames->relevant_sources[side_index]); } static inline int possible_renames(struct rename_info *renames) @@ -3445,14 +3445,14 @@ static void merge_start(struct merge_options *opt, struct merge_result *result) /* Initialization of various renames fields */ renames = &opt->priv->renames; for (i = MERGE_SIDE1; i <= MERGE_SIDE2; i++) { - strset_init_with_options(&renames->dirs_removed[i], - NULL, 0); + strintmap_init_with_options(&renames->dirs_removed[i], + 0, NULL, 0); strmap_init_with_options(&renames->dir_rename_count[i], NULL, 1); strmap_init_with_options(&renames->dir_renames[i], NULL, 0); - strset_init_with_options(&renames->relevant_sources[i], - NULL, 0); + strintmap_init_with_options(&renames->relevant_sources[i], + 0, NULL, 0); } /* From fb52938eec1cf6ec3169152362fe774849f5ac9b Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 13 Mar 2021 22:22:03 +0000 Subject: [PATCH 3/8] merge-ort: record the reason that we want a rename for a directory When one side of history renames a directory, and the other side of history added files to the old directory, directory rename detection is used to warn about the location of the added files so the user can move them to the old directory or keep them with the new one. This sets up three different types of directories: * directories that had new files added to them * directories underneath a directory that had new files added to them * directories where no new files were added to it or any leading path Save this information in dirs_removed; the next several commits will make use of this information. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 2 +- diffcore.h | 7 +++++++ merge-ort.c | 41 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 6487825317f..fafec66b29e 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -667,7 +667,7 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info, const char *source_dir = entry->key; struct strintmap *counts = entry->value; - if (!strintmap_contains(dirs_removed, source_dir)) { + if (!strintmap_get(dirs_removed, source_dir)) { string_list_append(&to_remove, source_dir); strintmap_clear(counts); continue; diff --git a/diffcore.h b/diffcore.h index 4f168b385fd..d5a497b7a16 100644 --- a/diffcore.h +++ b/diffcore.h @@ -161,6 +161,13 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *, struct diff_filespec *); void diff_q(struct diff_queue_struct *, struct diff_filepair *); +/* dir_rename_relevance: the reason we want rename information for a dir */ +enum dir_rename_relevance { + NOT_RELEVANT = 0, + RELEVANT_FOR_ANCESTOR = 1, + RELEVANT_FOR_SELF = 2 +}; + void partial_clear_dir_rename_count(struct strmap *dir_rename_count); void diffcore_break(struct repository *, int); diff --git a/merge-ort.c b/merge-ort.c index 6fa670396ce..e2606c73ad8 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -73,6 +73,10 @@ struct rename_info { /* * dirs_removed: directories removed on a given side of history. + * + * The keys of dirs_removed[side] are the directories that were removed + * on the given side of history. The value of the strintmap for each + * directory is a value from enum dir_rename_relevance. */ struct strintmap dirs_removed[3]; @@ -729,10 +733,41 @@ static void collect_rename_info(struct merge_options *opt, if (dirmask == 1 || dirmask == 3 || dirmask == 5) { /* absent_mask = 0x07 - dirmask; sides = absent_mask/2 */ unsigned sides = (0x07 - dirmask)/2; + unsigned relevance = (renames->dir_rename_mask == 0x07) ? + RELEVANT_FOR_ANCESTOR : NOT_RELEVANT; + /* + * Record relevance of this directory. However, note that + * when collect_merge_info_callback() recurses into this + * directory and calls collect_rename_info() on paths + * within that directory, if we find a path that was added + * to this directory on the other side of history, we will + * upgrade this value to RELEVANT_FOR_SELF; see below. + */ if (sides & 1) - strintmap_set(&renames->dirs_removed[1], fullname, 1); + strintmap_set(&renames->dirs_removed[1], fullname, + relevance); if (sides & 2) - strintmap_set(&renames->dirs_removed[2], fullname, 1); + strintmap_set(&renames->dirs_removed[2], fullname, + relevance); + } + + /* + * Here's the block that potentially upgrades to RELEVANT_FOR_SELF. + * When we run across a file added to a directory. In such a case, + * find the directory of the file and upgrade its relevance. + */ + if (renames->dir_rename_mask == 0x07 && + (filemask == 2 || filemask == 4)) { + /* + * Need directory rename for parent directory on other side + * of history from added file. Thus + * side = (~filemask & 0x06) >> 1 + * or + * side = 3 - (filemask/2). + */ + unsigned side = 3 - (filemask >> 1); + strintmap_set(&renames->dirs_removed[side], dirname, + RELEVANT_FOR_SELF); } if (filemask == 0 || filemask == 7) @@ -3446,7 +3481,7 @@ static void merge_start(struct merge_options *opt, struct merge_result *result) renames = &opt->priv->renames; for (i = MERGE_SIDE1; i <= MERGE_SIDE2; i++) { strintmap_init_with_options(&renames->dirs_removed[i], - 0, NULL, 0); + NOT_RELEVANT, NULL, 0); strmap_init_with_options(&renames->dir_rename_count[i], NULL, 1); strmap_init_with_options(&renames->dir_renames[i], From e54385b97a69e09ec12be5f8eef234777c801257 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 13 Mar 2021 22:22:04 +0000 Subject: [PATCH 4/8] diffcore-rename: only compute dir_rename_count for relevant directories When one side adds files to a directory that the other side renamed, directory rename detection is used to either move the new paths to the newer directory or warn the user about the fact that another path location might be better. If a parent of the given directory had new files added to it, any renames in the current directory are also part of determining where the parent directory is renamed to. Thus, naively, we need to record each rename N times for a path at depth N. However, we can use the additional information added to dirs_removed in the last commit to avoid traversing all N parent directories in many cases. Let's use an example to explain how this works. If we have a path named src/old_dir/a/b/file.c and src/old_dir doesn't exist on one side of history, but the other added a file named src/old_dir/newfile.c, then if one side renamed src/old_dir/a/b/file.c => source/new_dir/a/b/file.c then this file would affect potential directory rename detection counts for src/old_dir/a/b => source/new_dir/a/b src/old_dir/a => source/new_dir/a src/old_dir => source/new_dir src => source adding a weight of 1 to each in dir_rename_counts. However, if src/ exists on both sides of history, then we don't need to track any entries for it in dir_rename_counts. That was implemented previously. What we are adding now, is that if no new files were added to src/old_dir/a or src/old_dir/b, then we don't need to have counts in dir_rename_count for those directories either. In short, we only need to track counts in dir_rename_count for directories whose dirs_removed value is RELEVANT_FOR_SELF. And as soon as we reach a directory that isn't in dirs_removed (signalled by returning the default value of NOT_RELEVANT from strintmap_get()), we can stop looking any further up the directory hierarchy. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index fafec66b29e..0a17abd4669 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -461,6 +461,8 @@ static void update_dir_rename_counts(struct dir_rename_info *info, return; while (1) { + int drd_flag = NOT_RELEVANT; + /* Get old_dir, skip if its directory isn't relevant. */ dirname_munge(old_dir); if (info->relevant_source_dirs && @@ -509,16 +511,31 @@ static void update_dir_rename_counts(struct dir_rename_info *info, } } - if (strintmap_contains(dirs_removed, old_dir)) + /* + * Above we suggested that we'd keep recording renames for + * all ancestor directories where the trailing directories + * matched, i.e. for + * "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c" + * we'd increment rename counts for each of + * a/b/c/d/e/ => a/b/some/thing/else/e/ + * a/b/c/d/ => a/b/some/thing/else/ + * However, we only need the rename counts for directories + * in dirs_removed whose value is RELEVANT_FOR_SELF. + * However, we add one special case of also recording it for + * first_time_in_loop because find_basename_matches() can + * use that as a hint to find a good pairing. + */ + if (dirs_removed) + drd_flag = strintmap_get(dirs_removed, old_dir); + if (drd_flag == RELEVANT_FOR_SELF || first_time_in_loop) increment_count(info, old_dir, new_dir); - else - break; + first_time_in_loop = 0; + if (drd_flag == NOT_RELEVANT) + break; /* If we hit toplevel directory ("") for old or new dir, quit */ if (!*old_dir || !*new_dir) break; - - first_time_in_loop = 0; } /* Free resources we don't need anymore */ From 0491d39297132c48e36c2f73cb0dfe7e22b3c52f Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 13 Mar 2021 22:22:05 +0000 Subject: [PATCH 5/8] diffcore-rename: check if we have enough renames for directories early on As noted in the past few commits, if we can determine that a directory already has enough renames to determine how directory rename detection will be decided for that directory, then we can mark that directory as no longer needing any more renames detected for files underneath it. For such directories, we change the value in the dirs_removed map from RELEVANT_TO_SELF to RELEVANT_FOR_ANCESTOR. A subsequent patch will use this information while iterating over the remaining potential rename sources to mark ones that were only location_relevant as unneeded if no containing directory is still marked as RELEVANT_TO_SELF. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 73 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 0a17abd4669..8fa29076e0a 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -407,6 +407,28 @@ static const char *get_highest_rename_path(struct strintmap *counts) return highest_destination_dir; } +static char *UNKNOWN_DIR = "/"; /* placeholder -- short, illegal directory */ + +static int dir_rename_already_determinable(struct strintmap *counts) +{ + struct hashmap_iter iter; + struct strmap_entry *entry; + int first = 0, second = 0, unknown = 0; + strintmap_for_each_entry(counts, &iter, entry) { + const char *destination_dir = entry->key; + intptr_t count = (intptr_t)entry->value; + if (!strcmp(destination_dir, UNKNOWN_DIR)) { + unknown = count; + } else if (count >= first) { + second = first; + first = count; + } else if (count >= second) { + second = count; + } + } + return first > second + unknown; +} + static void increment_count(struct dir_rename_info *info, char *old_dir, char *new_dir) @@ -1096,17 +1118,48 @@ static void handle_early_known_dir_renames(struct dir_rename_info *info, struct strintmap *dirs_removed) { /* - * Not yet implemented; directory renames are determined via an - * aggregate of all renames under them and using a "majority wins" - * rule. The fact that "majority wins", though, means we don't need - * all the renames under the given directory, we only need enough to - * ensure we have a majority. - * - * For now, we don't have enough information to know if we have a - * majority after exact renames and basename-guided rename detection, - * so just return early without doing any extra filtering. + * Directory renames are determined via an aggregate of all renames + * under them and using a "majority wins" rule. The fact that + * "majority wins", though, means we don't need all the renames + * under the given directory, we only need enough to ensure we have + * a majority. */ - return; + + struct hashmap_iter iter; + struct strmap_entry *entry; + + if (!dirs_removed || !relevant_sources) + return; /* nothing to cull */ + if (break_idx) + return; /* culling incompatbile with break detection */ + + /* + * FIXME: Supplement dir_rename_count with number of potential + * renames, marking all potential rename sources as mapping to + * UNKNOWN_DIR. + */ + + /* + * For any directory which we need a potential rename detected for + * (i.e. those marked as RELEVANT_FOR_SELF in dirs_removed), check + * whether we have enough renames to satisfy the "majority rules" + * requirement such that detecting any more renames of files under + * it won't change the result. For any such directory, mark that + * we no longer need to detect a rename for it. However, since we + * might need to still detect renames for an ancestor of that + * directory, use RELEVANT_FOR_ANCESTOR. + */ + strmap_for_each_entry(info->dir_rename_count, &iter, entry) { + /* entry->key is source_dir */ + struct strintmap *counts = entry->value; + + if (strintmap_get(dirs_removed, entry->key) == + RELEVANT_FOR_SELF && + dir_rename_already_determinable(counts)) { + strintmap_set(dirs_removed, entry->key, + RELEVANT_FOR_ANCESTOR); + } + } } void diffcore_rename_extended(struct diff_options *options, From bf238b71377e924066ca5155a787c167177f706c Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 13 Mar 2021 22:22:06 +0000 Subject: [PATCH 6/8] diffcore-rename: add computation of number of unknown renames The previous commit can only be effective if we have a computation of the number of paths under a given directory which are still have pending renames, and expected this number to be recorded in the dir_rename_count map under the key UNKNOWN_DIR. Add the code necessary to compute these values. Note that this change means dir_rename_count might have a directory whose only entry (for UNKNOWN_DIR) was removed by the time merge-ort goes to check it. To account for this, merge-ort needs to check for the case where the max count is 0. With this change we are now computing the necessary value for each directory in dirs_removed, but are not using that value anywhere. The next two commits will make use of the values stored in dirs_removed in order to compute whether each relevant_source (that is needed only for directory rename detection) has become unnecessary. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 41 +++++++++++++++++++++++++++++++++++++---- merge-ort.c | 3 +++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 8fa29076e0a..9844cd48788 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -699,7 +699,8 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info, /* * Although dir_rename_count was passed in * diffcore_rename_extended() and we want to keep it around and - * return it to that caller, we first want to remove any data + * return it to that caller, we first want to remove any counts in + * the maps associated with UNKNOWN_DIR entries and any data * associated with directories that weren't renamed. */ strmap_for_each_entry(info->dir_rename_count, &iter, entry) { @@ -711,6 +712,9 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info, strintmap_clear(counts); continue; } + + if (strintmap_contains(counts, UNKNOWN_DIR)) + strintmap_remove(counts, UNKNOWN_DIR); } for (i = 0; i < to_remove.nr; ++i) strmap_remove(info->dir_rename_count, @@ -1125,6 +1129,7 @@ static void handle_early_known_dir_renames(struct dir_rename_info *info, * a majority. */ + int i; struct hashmap_iter iter; struct strmap_entry *entry; @@ -1134,10 +1139,38 @@ static void handle_early_known_dir_renames(struct dir_rename_info *info, return; /* culling incompatbile with break detection */ /* - * FIXME: Supplement dir_rename_count with number of potential - * renames, marking all potential rename sources as mapping to - * UNKNOWN_DIR. + * Supplement dir_rename_count with number of potential renames, + * marking all potential rename sources as mapping to UNKNOWN_DIR. */ + for (i = 0; i < rename_src_nr; i++) { + char *old_dir; + struct diff_filespec *one = rename_src[i].p->one; + + /* + * sources that are part of a rename will have already been + * removed by a prior call to remove_unneeded_paths_from_src() + */ + assert(!one->rename_used); + + old_dir = get_dirname(one->path); + while (*old_dir != '\0' && + NOT_RELEVANT != strintmap_get(dirs_removed, old_dir)) { + char *freeme = old_dir; + + increment_count(info, old_dir, UNKNOWN_DIR); + old_dir = get_dirname(old_dir); + + /* Free resources we don't need anymore */ + free(freeme); + } + /* + * old_dir and new_dir free'd in increment_count, but + * get_dirname() gives us a new pointer we need to free for + * old_dir. Also, if the loop runs 0 times we need old_dir + * to be freed. + */ + free(old_dir); + } /* * For any directory which we need a potential rename detected for diff --git a/merge-ort.c b/merge-ort.c index e2606c73ad8..f2b259986e2 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -1546,6 +1546,9 @@ static void get_provisional_directory_renames(struct merge_options *opt, } } + if (max == 0) + continue; + if (bad_max == max) { path_msg(opt, source_dir, 0, _("CONFLICT (directory rename split): " From ec59da6015c457e84953a802fc6484ae2da2d774 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 13 Mar 2021 22:22:07 +0000 Subject: [PATCH 7/8] merge-ort: record the reason that we want a rename for a file There are two different reasons we might want a rename for a file -- for three-way content merging or as part of directory rename detection. Record the reason. diffcore-rename will potentially be able to filter some of the ones marked as needed only for directory rename detection, if it can determine those directory renames based solely on renames found via exact rename detection and basename-guided rename detection. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore.h | 6 ++++++ merge-ort.c | 15 ++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/diffcore.h b/diffcore.h index d5a497b7a16..cf8d4cb8617 100644 --- a/diffcore.h +++ b/diffcore.h @@ -167,6 +167,12 @@ enum dir_rename_relevance { RELEVANT_FOR_ANCESTOR = 1, RELEVANT_FOR_SELF = 2 }; +/* file_rename_relevance: the reason(s) we want rename information for a file */ +enum file_rename_relevance { + RELEVANT_NO_MORE = 0, /* i.e. NOT relevant */ + RELEVANT_CONTENT = 1, + RELEVANT_LOCATION = 2 +}; void partial_clear_dir_rename_count(struct strmap *dir_rename_count); diff --git a/merge-ort.c b/merge-ort.c index f2b259986e2..7f5750ce6ab 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -99,16 +99,18 @@ struct rename_info { struct strmap dir_renames[3]; /* - * relevant_sources: deleted paths for which we need rename detection + * relevant_sources: deleted paths wanted in rename detection, and why * * relevant_sources is a set of deleted paths on each side of * history for which we need rename detection. If a path is deleted * on one side of history, we need to detect if it is part of a * rename if either - * * we need to detect renames for an ancestor directory * * the file is modified/deleted on the other side of history + * * we need to detect renames for an ancestor directory * If neither of those are true, we can skip rename detection for - * that path. + * that path. The reason is stored as a value from enum + * file_rename_relevance, as the reason can inform the algorithm in + * diffcore_rename_extended(). */ struct strintmap relevant_sources[3]; @@ -677,8 +679,11 @@ static void add_pair(struct merge_options *opt, unsigned content_relevant = (match_mask == 0); unsigned location_relevant = (dir_rename_mask == 0x07); - if (content_relevant || location_relevant) - strintmap_set(&renames->relevant_sources[side], pathname, 1); + if (content_relevant || location_relevant) { + /* content_relevant trumps location_relevant */ + strintmap_set(&renames->relevant_sources[side], pathname, + content_relevant ? RELEVANT_CONTENT : RELEVANT_LOCATION); + } } one = alloc_filespec(pathname); From 9bd342137eb4dfe3852f657f8afcc637e68b1439 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 13 Mar 2021 22:22:08 +0000 Subject: [PATCH 8/8] diffcore-rename: determine which relevant_sources are no longer relevant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As noted a few commits ago ("diffcore-rename: only compute dir_rename_count for relevant directories"), when a source file rename is used as part of directory rename detection, we need to increment counts for each ancestor directory in dirs_removed with value RELEVANT_FOR_SELF. However, a few commits ago ("diffcore-rename: check if we have enough renames for directories early on"), we may have downgraded all relevant ancestor directories from RELEVANT_FOR_SELF to RELEVANT_FOR_ANCESTOR. For a given file, if no ancestor directory is found in dirs_removed with a value of RELEVANT_FOR_SELF, then we can downgrade relevant_source[PATH] from RELEVANT_LOCATION to RELEVANT_NO_MORE. This means we can skip detecting a rename for that particular path (and any other paths in the same directory). For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 5.680 s ± 0.096 s 5.665 s ± 0.129 s mega-renames: 13.812 s ± 0.162 s 11.435 s ± 0.158 s just-one-mega: 506.0 ms ± 3.9 ms 494.2 ms ± 6.1 ms While this improvement looks rather modest for these testcases (because all the previous optimizations were sufficient to nearly remove all time spent in rename detection already), consider this alternative testcase tweaked from the ones in commit 557ac0350d as follows $ git switch -c add-empty-file v5.5 $ >drivers/gpu/drm/i915/new-empty-file $ git add drivers/gpu/drm/i915/new-empty-file $ git commit -m "new file" $ git switch 5.4-rename $ git cherry-pick --strategy=ort add-empty-file For this testcase, we see the following improvement: Before After pick-empty: 1.936 s ± 0.024 s 688.1 ms ± 4.2 ms So roughly a factor of 3 speedup. At $DAYJOB, there was a particular repository and cherry-pick that inspired this optimization; for that case I saw a speedup factor of 7 with this optimization. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 9844cd48788..7cc24592617 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -1129,7 +1129,7 @@ static void handle_early_known_dir_renames(struct dir_rename_info *info, * a majority. */ - int i; + int i, new_num_src; struct hashmap_iter iter; struct strmap_entry *entry; @@ -1193,6 +1193,55 @@ static void handle_early_known_dir_renames(struct dir_rename_info *info, RELEVANT_FOR_ANCESTOR); } } + + for (i = 0, new_num_src = 0; i < rename_src_nr; i++) { + struct diff_filespec *one = rename_src[i].p->one; + int val; + + val = strintmap_get(relevant_sources, one->path); + + /* + * sources that were not found in relevant_sources should + * have already been removed by a prior call to + * remove_unneeded_paths_from_src() + */ + assert(val != -1); + + if (val == RELEVANT_LOCATION) { + int removable = 1; + char *dir = get_dirname(one->path); + while (1) { + char *freeme = dir; + int res = strintmap_get(dirs_removed, dir); + + /* Quit if not found or irrelevant */ + if (res == NOT_RELEVANT) + break; + /* If RELEVANT_FOR_SELF, can't remove */ + if (res == RELEVANT_FOR_SELF) { + removable = 0; + break; + } + /* Else continue searching upwards */ + assert(res == RELEVANT_FOR_ANCESTOR); + dir = get_dirname(dir); + free(freeme); + } + free(dir); + if (removable) { + strintmap_set(relevant_sources, one->path, + RELEVANT_NO_MORE); + continue; + } + } + + if (new_num_src < i) + memcpy(&rename_src[new_num_src], &rename_src[i], + sizeof(struct diff_rename_src)); + new_num_src++; + } + + rename_src_nr = new_num_src; } void diffcore_rename_extended(struct diff_options *options,