diff --git a/diffcore-rename.c b/diffcore-rename.c index 36a98f9c49..963ca58221 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -371,7 +371,7 @@ struct dir_rename_info { struct strintmap idx_map; struct strmap dir_rename_guess; struct strmap *dir_rename_count; - struct strset *relevant_source_dirs; + struct strintmap *relevant_source_dirs; unsigned setup; }; @@ -407,6 +407,28 @@ static const char *get_highest_rename_path(struct strintmap *counts) return highest_destination_dir; } +static char *UNKNOWN_DIR = "/"; /* placeholder -- short, illegal directory */ + +static int dir_rename_already_determinable(struct strintmap *counts) +{ + struct hashmap_iter iter; + struct strmap_entry *entry; + int first = 0, second = 0, unknown = 0; + strintmap_for_each_entry(counts, &iter, entry) { + const char *destination_dir = entry->key; + intptr_t count = (intptr_t)entry->value; + if (!strcmp(destination_dir, UNKNOWN_DIR)) { + unknown = count; + } else if (count >= first) { + second = first; + first = count; + } else if (count >= second) { + second = count; + } + } + return first > second + unknown; +} + static void increment_count(struct dir_rename_info *info, char *old_dir, char *new_dir) @@ -429,7 +451,7 @@ static void increment_count(struct dir_rename_info *info, } static void update_dir_rename_counts(struct dir_rename_info *info, - struct strset *dirs_removed, + struct strintmap *dirs_removed, const char *oldname, const char *newname) { @@ -461,10 +483,12 @@ static void update_dir_rename_counts(struct dir_rename_info *info, return; while (1) { + int drd_flag = NOT_RELEVANT; + /* Get old_dir, skip if its directory isn't relevant. */ dirname_munge(old_dir); if (info->relevant_source_dirs && - !strset_contains(info->relevant_source_dirs, old_dir)) + !strintmap_contains(info->relevant_source_dirs, old_dir)) break; /* Get new_dir */ @@ -509,16 +533,31 @@ static void update_dir_rename_counts(struct dir_rename_info *info, } } - if (strset_contains(dirs_removed, old_dir)) + /* + * Above we suggested that we'd keep recording renames for + * all ancestor directories where the trailing directories + * matched, i.e. for + * "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c" + * we'd increment rename counts for each of + * a/b/c/d/e/ => a/b/some/thing/else/e/ + * a/b/c/d/ => a/b/some/thing/else/ + * However, we only need the rename counts for directories + * in dirs_removed whose value is RELEVANT_FOR_SELF. + * However, we add one special case of also recording it for + * first_time_in_loop because find_basename_matches() can + * use that as a hint to find a good pairing. + */ + if (dirs_removed) + drd_flag = strintmap_get(dirs_removed, old_dir); + if (drd_flag == RELEVANT_FOR_SELF || first_time_in_loop) increment_count(info, old_dir, new_dir); - else - break; + first_time_in_loop = 0; + if (drd_flag == NOT_RELEVANT) + break; /* If we hit toplevel directory ("") for old or new dir, quit */ if (!*old_dir || !*new_dir) break; - - first_time_in_loop = 0; } /* Free resources we don't need anymore */ @@ -527,8 +566,8 @@ static void update_dir_rename_counts(struct dir_rename_info *info, } static void initialize_dir_rename_info(struct dir_rename_info *info, - struct strset *relevant_sources, - struct strset *dirs_removed, + struct strintmap *relevant_sources, + struct strintmap *dirs_removed, struct strmap *dir_rename_count) { struct hashmap_iter iter; @@ -555,12 +594,13 @@ static void initialize_dir_rename_info(struct dir_rename_info *info, info->relevant_source_dirs = dirs_removed; /* might be NULL */ } else { info->relevant_source_dirs = xmalloc(sizeof(struct strintmap)); - strset_init(info->relevant_source_dirs); - strset_for_each_entry(relevant_sources, &iter, entry) { + strintmap_init(info->relevant_source_dirs, 0 /* unused */); + strintmap_for_each_entry(relevant_sources, &iter, entry) { char *dirname = get_dirname(entry->key); if (!dirs_removed || - strset_contains(dirs_removed, dirname)) - strset_add(info->relevant_source_dirs, dirname); + strintmap_contains(dirs_removed, dirname)) + strintmap_set(info->relevant_source_dirs, + dirname, 0 /* value irrelevant */); free(dirname); } } @@ -624,7 +664,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count) } static void cleanup_dir_rename_info(struct dir_rename_info *info, - struct strset *dirs_removed, + struct strintmap *dirs_removed, int keep_dir_rename_count) { struct hashmap_iter iter; @@ -644,7 +684,7 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info, /* relevant_source_dirs */ if (info->relevant_source_dirs && info->relevant_source_dirs != dirs_removed) { - strset_clear(info->relevant_source_dirs); + strintmap_clear(info->relevant_source_dirs); FREE_AND_NULL(info->relevant_source_dirs); } @@ -659,18 +699,22 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info, /* * Although dir_rename_count was passed in * diffcore_rename_extended() and we want to keep it around and - * return it to that caller, we first want to remove any data + * return it to that caller, we first want to remove any counts in + * the maps associated with UNKNOWN_DIR entries and any data * associated with directories that weren't renamed. */ strmap_for_each_entry(info->dir_rename_count, &iter, entry) { const char *source_dir = entry->key; struct strintmap *counts = entry->value; - if (!strset_contains(dirs_removed, source_dir)) { + if (!strintmap_get(dirs_removed, source_dir)) { string_list_append(&to_remove, source_dir); strintmap_clear(counts); continue; } + + if (strintmap_contains(counts, UNKNOWN_DIR)) + strintmap_remove(counts, UNKNOWN_DIR); } for (i = 0; i < to_remove.nr; ++i) strmap_remove(info->dir_rename_count, @@ -770,8 +814,8 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info) static int find_basename_matches(struct diff_options *options, int minimum_score, struct dir_rename_info *info, - struct strset *relevant_sources, - struct strset *dirs_removed) + struct strintmap *relevant_sources, + struct strintmap *dirs_removed) { /* * When I checked in early 2020, over 76% of file renames in linux @@ -863,7 +907,7 @@ static int find_basename_matches(struct diff_options *options, /* Skip irrelevant sources */ if (relevant_sources && - !strset_contains(relevant_sources, filename)) + !strintmap_contains(relevant_sources, filename)) continue; /* @@ -994,7 +1038,7 @@ static int find_renames(struct diff_score *mx, int minimum_score, int copies, struct dir_rename_info *info, - struct strset *dirs_removed) + struct strintmap *dirs_removed) { int count = 0, i; @@ -1019,7 +1063,7 @@ static int find_renames(struct diff_score *mx, } static void remove_unneeded_paths_from_src(int detecting_copies, - struct strset *interesting) + struct strintmap *interesting) { int i, new_num_src; @@ -1061,7 +1105,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies, continue; /* If we don't care about the source path, skip it */ - if (interesting && !strset_contains(interesting, one->path)) + if (interesting && !strintmap_contains(interesting, one->path)) continue; if (new_num_src < i) @@ -1073,9 +1117,136 @@ static void remove_unneeded_paths_from_src(int detecting_copies, rename_src_nr = new_num_src; } +static void handle_early_known_dir_renames(struct dir_rename_info *info, + struct strintmap *relevant_sources, + struct strintmap *dirs_removed) +{ + /* + * Directory renames are determined via an aggregate of all renames + * under them and using a "majority wins" rule. The fact that + * "majority wins", though, means we don't need all the renames + * under the given directory, we only need enough to ensure we have + * a majority. + */ + + int i, new_num_src; + struct hashmap_iter iter; + struct strmap_entry *entry; + + if (!dirs_removed || !relevant_sources) + return; /* nothing to cull */ + if (break_idx) + return; /* culling incompatbile with break detection */ + + /* + * Supplement dir_rename_count with number of potential renames, + * marking all potential rename sources as mapping to UNKNOWN_DIR. + */ + for (i = 0; i < rename_src_nr; i++) { + char *old_dir; + struct diff_filespec *one = rename_src[i].p->one; + + /* + * sources that are part of a rename will have already been + * removed by a prior call to remove_unneeded_paths_from_src() + */ + assert(!one->rename_used); + + old_dir = get_dirname(one->path); + while (*old_dir != '\0' && + NOT_RELEVANT != strintmap_get(dirs_removed, old_dir)) { + char *freeme = old_dir; + + increment_count(info, old_dir, UNKNOWN_DIR); + old_dir = get_dirname(old_dir); + + /* Free resources we don't need anymore */ + free(freeme); + } + /* + * old_dir and new_dir free'd in increment_count, but + * get_dirname() gives us a new pointer we need to free for + * old_dir. Also, if the loop runs 0 times we need old_dir + * to be freed. + */ + free(old_dir); + } + + /* + * For any directory which we need a potential rename detected for + * (i.e. those marked as RELEVANT_FOR_SELF in dirs_removed), check + * whether we have enough renames to satisfy the "majority rules" + * requirement such that detecting any more renames of files under + * it won't change the result. For any such directory, mark that + * we no longer need to detect a rename for it. However, since we + * might need to still detect renames for an ancestor of that + * directory, use RELEVANT_FOR_ANCESTOR. + */ + strmap_for_each_entry(info->dir_rename_count, &iter, entry) { + /* entry->key is source_dir */ + struct strintmap *counts = entry->value; + + if (strintmap_get(dirs_removed, entry->key) == + RELEVANT_FOR_SELF && + dir_rename_already_determinable(counts)) { + strintmap_set(dirs_removed, entry->key, + RELEVANT_FOR_ANCESTOR); + } + } + + for (i = 0, new_num_src = 0; i < rename_src_nr; i++) { + struct diff_filespec *one = rename_src[i].p->one; + int val; + + val = strintmap_get(relevant_sources, one->path); + + /* + * sources that were not found in relevant_sources should + * have already been removed by a prior call to + * remove_unneeded_paths_from_src() + */ + assert(val != -1); + + if (val == RELEVANT_LOCATION) { + int removable = 1; + char *dir = get_dirname(one->path); + while (1) { + char *freeme = dir; + int res = strintmap_get(dirs_removed, dir); + + /* Quit if not found or irrelevant */ + if (res == NOT_RELEVANT) + break; + /* If RELEVANT_FOR_SELF, can't remove */ + if (res == RELEVANT_FOR_SELF) { + removable = 0; + break; + } + /* Else continue searching upwards */ + assert(res == RELEVANT_FOR_ANCESTOR); + dir = get_dirname(dir); + free(freeme); + } + free(dir); + if (removable) { + strintmap_set(relevant_sources, one->path, + RELEVANT_NO_MORE); + continue; + } + } + + if (new_num_src < i) + memcpy(&rename_src[new_num_src], &rename_src[i], + sizeof(struct diff_rename_src)); + new_num_src++; + } + + rename_src_nr = new_num_src; +} + void diffcore_rename_extended(struct diff_options *options, - struct strset *relevant_sources, - struct strset *dirs_removed, + struct strintmap *relevant_sources, + struct strintmap *dirs_removed, struct strmap *dir_rename_count) { int detect_rename = options->detect_rename; @@ -1208,9 +1379,16 @@ void diffcore_rename_extended(struct diff_options *options, * Cull sources, again: * - remove ones involved in renames (found via basenames) * - remove ones not found in relevant_sources + * and + * - remove ones in relevant_sources which are needed only + * for directory renames IF no ancestory directory + * actually needs to know any more individual path + * renames under them */ trace2_region_enter("diff", "cull basename", options->repo); remove_unneeded_paths_from_src(want_copies, relevant_sources); + handle_early_known_dir_renames(&info, relevant_sources, + dirs_removed); trace2_region_leave("diff", "cull basename", options->repo); } diff --git a/diffcore.h b/diffcore.h index d76982f220..f5c6de4841 100644 --- a/diffcore.h +++ b/diffcore.h @@ -8,8 +8,8 @@ struct diff_options; struct repository; +struct strintmap; struct strmap; -struct strset; struct userdiff_driver; /* This header file is internal between diff.c and its diff transformers @@ -161,13 +161,26 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *, struct diff_filespec *); void diff_q(struct diff_queue_struct *, struct diff_filepair *); +/* dir_rename_relevance: the reason we want rename information for a dir */ +enum dir_rename_relevance { + NOT_RELEVANT = 0, + RELEVANT_FOR_ANCESTOR = 1, + RELEVANT_FOR_SELF = 2 +}; +/* file_rename_relevance: the reason(s) we want rename information for a file */ +enum file_rename_relevance { + RELEVANT_NO_MORE = 0, /* i.e. NOT relevant */ + RELEVANT_CONTENT = 1, + RELEVANT_LOCATION = 2 +}; + void partial_clear_dir_rename_count(struct strmap *dir_rename_count); void diffcore_break(struct repository *, int); void diffcore_rename(struct diff_options *); void diffcore_rename_extended(struct diff_options *options, - struct strset *relevant_sources, - struct strset *dirs_removed, + struct strintmap *relevant_sources, + struct strintmap *dirs_removed, struct strmap *dir_rename_count); void diffcore_merge_broken(void); void diffcore_pickaxe(struct diff_options *); diff --git a/merge-ort.c b/merge-ort.c index 5e118a85ee..028d712f54 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -73,8 +73,12 @@ struct rename_info { /* * dirs_removed: directories removed on a given side of history. + * + * The keys of dirs_removed[side] are the directories that were removed + * on the given side of history. The value of the strintmap for each + * directory is a value from enum dir_rename_relevance. */ - struct strset dirs_removed[3]; + struct strintmap dirs_removed[3]; /* * dir_rename_count: tracking where parts of a directory were renamed to @@ -95,18 +99,20 @@ struct rename_info { struct strmap dir_renames[3]; /* - * relevant_sources: deleted paths for which we need rename detection + * relevant_sources: deleted paths wanted in rename detection, and why * * relevant_sources is a set of deleted paths on each side of * history for which we need rename detection. If a path is deleted * on one side of history, we need to detect if it is part of a * rename if either - * * we need to detect renames for an ancestor directory * * the file is modified/deleted on the other side of history + * * we need to detect renames for an ancestor directory * If neither of those are true, we can skip rename detection for - * that path. + * that path. The reason is stored as a value from enum + * file_rename_relevance, as the reason can inform the algorithm in + * diffcore_rename_extended(). */ - struct strset relevant_sources[3]; + struct strintmap relevant_sources[3]; /* * dir_rename_mask: @@ -362,8 +368,8 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, int i; void (*strmap_func)(struct strmap *, int) = reinitialize ? strmap_partial_clear : strmap_clear; - void (*strset_func)(struct strset *) = - reinitialize ? strset_partial_clear : strset_clear; + void (*strintmap_func)(struct strintmap *) = + reinitialize ? strintmap_partial_clear : strintmap_clear; /* * We marked opti->paths with strdup_strings = 0, so that we @@ -395,7 +401,7 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, /* Free memory used by various renames maps */ for (i = MERGE_SIDE1; i <= MERGE_SIDE2; ++i) { - strset_func(&renames->dirs_removed[i]); + strintmap_func(&renames->dirs_removed[i]); partial_clear_dir_rename_count(&renames->dir_rename_count[i]); if (!reinitialize) @@ -403,7 +409,7 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, strmap_func(&renames->dir_renames[i], 0); - strset_func(&renames->relevant_sources[i]); + strintmap_func(&renames->relevant_sources[i]); } if (!reinitialize) { @@ -673,8 +679,11 @@ static void add_pair(struct merge_options *opt, unsigned content_relevant = (match_mask == 0); unsigned location_relevant = (dir_rename_mask == 0x07); - if (content_relevant || location_relevant) - strset_add(&renames->relevant_sources[side], pathname); + if (content_relevant || location_relevant) { + /* content_relevant trumps location_relevant */ + strintmap_set(&renames->relevant_sources[side], pathname, + content_relevant ? RELEVANT_CONTENT : RELEVANT_LOCATION); + } } one = alloc_filespec(pathname); @@ -729,10 +738,41 @@ static void collect_rename_info(struct merge_options *opt, if (dirmask == 1 || dirmask == 3 || dirmask == 5) { /* absent_mask = 0x07 - dirmask; sides = absent_mask/2 */ unsigned sides = (0x07 - dirmask)/2; + unsigned relevance = (renames->dir_rename_mask == 0x07) ? + RELEVANT_FOR_ANCESTOR : NOT_RELEVANT; + /* + * Record relevance of this directory. However, note that + * when collect_merge_info_callback() recurses into this + * directory and calls collect_rename_info() on paths + * within that directory, if we find a path that was added + * to this directory on the other side of history, we will + * upgrade this value to RELEVANT_FOR_SELF; see below. + */ if (sides & 1) - strset_add(&renames->dirs_removed[1], fullname); + strintmap_set(&renames->dirs_removed[1], fullname, + relevance); if (sides & 2) - strset_add(&renames->dirs_removed[2], fullname); + strintmap_set(&renames->dirs_removed[2], fullname, + relevance); + } + + /* + * Here's the block that potentially upgrades to RELEVANT_FOR_SELF. + * When we run across a file added to a directory. In such a case, + * find the directory of the file and upgrade its relevance. + */ + if (renames->dir_rename_mask == 0x07 && + (filemask == 2 || filemask == 4)) { + /* + * Need directory rename for parent directory on other side + * of history from added file. Thus + * side = (~filemask & 0x06) >> 1 + * or + * side = 3 - (filemask/2). + */ + unsigned side = 3 - (filemask >> 1); + strintmap_set(&renames->dirs_removed[side], dirname, + RELEVANT_FOR_SELF); } if (filemask == 0 || filemask == 7) @@ -1511,6 +1551,9 @@ static void get_provisional_directory_renames(struct merge_options *opt, } } + if (max == 0) + continue; + if (bad_max == max) { path_msg(opt, source_dir, 0, _("CONFLICT (directory rename split): " @@ -2160,7 +2203,7 @@ static inline int possible_side_renames(struct rename_info *renames, unsigned side_index) { return renames->pairs[side_index].nr > 0 && - !strset_empty(&renames->relevant_sources[side_index]); + !strintmap_empty(&renames->relevant_sources[side_index]); } static inline int possible_renames(struct rename_info *renames) @@ -3444,14 +3487,14 @@ static void merge_start(struct merge_options *opt, struct merge_result *result) /* Initialization of various renames fields */ renames = &opt->priv->renames; for (i = MERGE_SIDE1; i <= MERGE_SIDE2; i++) { - strset_init_with_options(&renames->dirs_removed[i], - NULL, 0); + strintmap_init_with_options(&renames->dirs_removed[i], + NOT_RELEVANT, NULL, 0); strmap_init_with_options(&renames->dir_rename_count[i], NULL, 1); strmap_init_with_options(&renames->dir_renames[i], NULL, 0); - strset_init_with_options(&renames->relevant_sources[i], - NULL, 0); + strintmap_init_with_options(&renames->relevant_sources[i], + 0, NULL, 0); } /*