From: "Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: Elijah Newren <newren@gmail.com>, Elijah Newren <newren@gmail.com>
Subject: [PATCH v2 05/10] diffcore-rename: compute dir_rename_counts in stages
Date: Tue, 23 Feb 2021 23:44:02 +0000 [thread overview]
Message-ID: <3a29cf9e526fba0227a7eec92c0c6bd58a7850f0.1614123848.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.844.v2.git.1614123848.gitgitgadget@gmail.com>
From: Elijah Newren <newren@gmail.com>
We want to first compute dir_rename_counts based just on exact renames
to start, as that can provide us useful information in
find_basename_matches(). That will give us an incomplete result, which
we can then later augment as basename and inexact rename matches are
found.
Signed-off-by: Elijah Newren <newren@gmail.com>
---
diffcore-rename.c | 76 ++++++++++++++++++++++++++++++++++++++---------
1 file changed, 62 insertions(+), 14 deletions(-)
diff --git a/diffcore-rename.c b/diffcore-rename.c
index aa21d4e7175c..489e9cb0871e 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -411,6 +411,28 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
char new_dir_first_char = new_dir[0];
int first_time_in_loop = 1;
+ if (!info->setup)
+ /*
+ * info->setup is 0 here in two cases: (1) all auxiliary
+ * vars (like dirs_removed) were NULL so
+ * initialize_dir_rename_info() returned early, or (2)
+ * either break detection or copy detection are active so
+ * that we never called initialize_dir_rename_info(). In
+ * the former case, we don't have enough info to know if
+ * directories were renamed (because dirs_removed lets us
+ * know about a necessary prerequisite, namely if they were
+ * removed), and in the latter, we don't care about
+ * directory renames or find_basename_matches.
+ *
+ * This matters because both basename and inexact matching
+ * will also call update_dir_rename_counts(). In either of
+ * the above two cases info->dir_rename_counts will not
+ * have been properly initialized which prevents us from
+ * updating it, but in these two cases we don't care about
+ * dir_rename_counts anyway, so we can just exit early.
+ */
+ return;
+
while (1) {
dirname_munge(old_dir);
dirname_munge(new_dir);
@@ -471,14 +493,22 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
free(new_dir);
}
-static void compute_dir_rename_counts(struct dir_rename_info *info,
- struct strset *dirs_removed,
- struct strmap *dir_rename_count)
+static void initialize_dir_rename_info(struct dir_rename_info *info,
+ struct strset *dirs_removed,
+ struct strmap *dir_rename_count)
{
int i;
+ info->setup = 0;
+ if (!dirs_removed)
+ return;
info->setup = 1;
+
info->dir_rename_count = dir_rename_count;
+ if (!info->dir_rename_count) {
+ info->dir_rename_count = xmalloc(sizeof(*dir_rename_count));
+ strmap_init(info->dir_rename_count);
+ }
for (i = 0; i < rename_dst_nr; ++i) {
/*
@@ -506,7 +536,6 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count)
strmap_partial_clear(dir_rename_count, 1);
}
-MAYBE_UNUSED
static void cleanup_dir_rename_info(struct dir_rename_info *info,
struct strset *dirs_removed,
int keep_dir_rename_count)
@@ -561,7 +590,9 @@ static const char *get_basename(const char *filename)
}
static int find_basename_matches(struct diff_options *options,
- int minimum_score)
+ int minimum_score,
+ struct dir_rename_info *info,
+ struct strset *dirs_removed)
{
/*
* When I checked in early 2020, over 76% of file renames in linux
@@ -669,6 +700,8 @@ static int find_basename_matches(struct diff_options *options,
continue;
record_rename_pair(dst_index, src_index, score);
renames++;
+ update_dir_rename_counts(info, dirs_removed,
+ one->path, two->path);
/*
* Found a rename so don't need text anymore; if we
@@ -752,7 +785,12 @@ static int too_many_rename_candidates(int num_destinations, int num_sources,
return 1;
}
-static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies)
+static int find_renames(struct diff_score *mx,
+ int dst_cnt,
+ int minimum_score,
+ int copies,
+ struct dir_rename_info *info,
+ struct strset *dirs_removed)
{
int count = 0, i;
@@ -769,6 +807,9 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
continue;
record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
count++;
+ update_dir_rename_counts(info, dirs_removed,
+ rename_src[mx[i].src].p->one->path,
+ rename_dst[mx[i].dst].p->two->path);
}
return count;
}
@@ -840,6 +881,8 @@ void diffcore_rename_extended(struct diff_options *options,
info.setup = 0;
assert(!dir_rename_count || strmap_empty(dir_rename_count));
want_copies = (detect_rename == DIFF_DETECT_COPY);
+ if (dirs_removed && (break_idx || want_copies))
+ BUG("dirs_removed incompatible with break/copy detection");
if (!minimum_score)
minimum_score = DEFAULT_RENAME_SCORE;
@@ -931,10 +974,17 @@ void diffcore_rename_extended(struct diff_options *options,
remove_unneeded_paths_from_src(want_copies);
trace2_region_leave("diff", "cull after exact", options->repo);
+ /* Preparation for basename-driven matching. */
+ trace2_region_enter("diff", "dir rename setup", options->repo);
+ initialize_dir_rename_info(&info,
+ dirs_removed, dir_rename_count);
+ trace2_region_leave("diff", "dir rename setup", options->repo);
+
/* Utilize file basenames to quickly find renames. */
trace2_region_enter("diff", "basename matches", options->repo);
rename_count += find_basename_matches(options,
- min_basename_score);
+ min_basename_score,
+ &info, dirs_removed);
trace2_region_leave("diff", "basename matches", options->repo);
/*
@@ -1020,18 +1070,15 @@ void diffcore_rename_extended(struct diff_options *options,
/* cost matrix sorted by most to least similar pair */
STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare);
- rename_count += find_renames(mx, dst_cnt, minimum_score, 0);
+ rename_count += find_renames(mx, dst_cnt, minimum_score, 0,
+ &info, dirs_removed);
if (want_copies)
- rename_count += find_renames(mx, dst_cnt, minimum_score, 1);
+ rename_count += find_renames(mx, dst_cnt, minimum_score, 1,
+ &info, dirs_removed);
free(mx);
trace2_region_leave("diff", "inexact renames", options->repo);
cleanup:
- /*
- * Now that renames have been computed, compute dir_rename_count */
- if (dirs_removed && dir_rename_count)
- compute_dir_rename_counts(&info, dirs_removed, dir_rename_count);
-
/* At this point, we have found some renames and copies and they
* are recorded in rename_dst. The original list is still in *q.
*/
@@ -1103,6 +1150,7 @@ void diffcore_rename_extended(struct diff_options *options,
if (rename_dst[i].filespec_to_free)
free_filespec(rename_dst[i].filespec_to_free);
+ cleanup_dir_rename_info(&info, dirs_removed, dir_rename_count != NULL);
FREE_AND_NULL(rename_dst);
rename_dst_nr = rename_dst_alloc = 0;
FREE_AND_NULL(rename_src);
--
gitgitgadget
next prev parent reply other threads:[~2021-02-24 0:09 UTC|newest]
Thread overview: 61+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-02-14 7:58 [PATCH 00/10] Optimization batch 8: use file basenames even more Elijah Newren via GitGitGadget
2021-02-14 7:58 ` [PATCH 01/10] Move computation of dir_rename_count from merge-ort to diffcore-rename Elijah Newren via GitGitGadget
2021-02-14 7:58 ` [PATCH 02/10] diffcore-rename: add functions for clearing dir_rename_count Elijah Newren via GitGitGadget
2021-02-14 7:58 ` [PATCH 03/10] diffcore-rename: move dir_rename_counts into a dir_rename_info struct Elijah Newren via GitGitGadget
2021-02-14 7:58 ` [PATCH 04/10] diffcore-rename: extend cleanup_dir_rename_info() Elijah Newren via GitGitGadget
2021-02-14 7:58 ` [PATCH 05/10] diffcore-rename: compute dir_rename_counts in stages Elijah Newren via GitGitGadget
2021-02-14 7:58 ` [PATCH 06/10] diffcore-rename: add a mapping of destination names to their indices Elijah Newren via GitGitGadget
2021-02-14 7:59 ` [PATCH 07/10] diffcore-rename: add a dir_rename_guess field to dir_rename_info Elijah Newren via GitGitGadget
2021-02-14 7:59 ` [PATCH 08/10] diffcore-rename: add a new idx_possible_rename function Elijah Newren via GitGitGadget
2021-02-14 7:59 ` [PATCH 09/10] diffcore-rename: limit dir_rename_counts computation to relevant dirs Elijah Newren via GitGitGadget
2021-02-14 7:59 ` [PATCH 10/10] diffcore-rename: use directory rename guided basename comparisons Elijah Newren via GitGitGadget
2021-02-23 23:43 ` [PATCH v2 00/10] Optimization batch 8: use file basenames even more Elijah Newren via GitGitGadget
2021-02-23 23:43 ` [PATCH v2 01/10] Move computation of dir_rename_count from merge-ort to diffcore-rename Elijah Newren via GitGitGadget
2021-02-24 15:25 ` Derrick Stolee
2021-02-24 18:50 ` Elijah Newren
2021-02-23 23:43 ` [PATCH v2 02/10] diffcore-rename: add functions for clearing dir_rename_count Elijah Newren via GitGitGadget
2021-02-23 23:44 ` [PATCH v2 03/10] diffcore-rename: move dir_rename_counts into a dir_rename_info struct Elijah Newren via GitGitGadget
2021-02-23 23:44 ` [PATCH v2 04/10] diffcore-rename: extend cleanup_dir_rename_info() Elijah Newren via GitGitGadget
2021-02-24 15:37 ` Derrick Stolee
2021-02-25 2:16 ` Ævar Arnfjörð Bjarmason
2021-02-25 2:26 ` Ævar Arnfjörð Bjarmason
2021-02-25 2:34 ` Junio C Hamano
2021-02-23 23:44 ` Elijah Newren via GitGitGadget [this message]
2021-02-24 15:43 ` [PATCH v2 05/10] diffcore-rename: compute dir_rename_counts in stages Derrick Stolee
2021-02-23 23:44 ` [PATCH v2 06/10] diffcore-rename: add a mapping of destination names to their indices Elijah Newren via GitGitGadget
2021-02-23 23:44 ` [PATCH v2 07/10] diffcore-rename: add a dir_rename_guess field to dir_rename_info Elijah Newren via GitGitGadget
2021-02-23 23:44 ` [PATCH v2 08/10] diffcore-rename: add a new idx_possible_rename function Elijah Newren via GitGitGadget
2021-02-24 17:35 ` Derrick Stolee
2021-02-25 1:13 ` Elijah Newren
2021-02-23 23:44 ` [PATCH v2 09/10] diffcore-rename: limit dir_rename_counts computation to relevant dirs Elijah Newren via GitGitGadget
2021-02-23 23:44 ` [PATCH v2 10/10] diffcore-rename: use directory rename guided basename comparisons Elijah Newren via GitGitGadget
2021-02-24 17:44 ` Derrick Stolee
2021-02-24 17:50 ` [PATCH v2 00/10] Optimization batch 8: use file basenames even more Derrick Stolee
2021-02-25 1:38 ` Elijah Newren
2021-02-26 1:58 ` [PATCH v3 " Elijah Newren via GitGitGadget
2021-02-26 1:58 ` [PATCH v3 01/10] diffcore-rename: use directory rename guided basename comparisons Elijah Newren via GitGitGadget
2021-02-26 1:58 ` [PATCH v3 02/10] diffcore-rename: add a new idx_possible_rename function Elijah Newren via GitGitGadget
2021-02-26 15:52 ` Derrick Stolee
2021-02-26 1:58 ` [PATCH v3 03/10] diffcore-rename: add a mapping of destination names to their indices Elijah Newren via GitGitGadget
2021-02-26 1:58 ` [PATCH v3 04/10] Move computation of dir_rename_count from merge-ort to diffcore-rename Elijah Newren via GitGitGadget
2021-02-26 15:55 ` Derrick Stolee
2021-02-26 1:58 ` [PATCH v3 05/10] diffcore-rename: add function for clearing dir_rename_count Elijah Newren via GitGitGadget
2021-02-26 1:58 ` [PATCH v3 06/10] diffcore-rename: move dir_rename_counts into dir_rename_info struct Elijah Newren via GitGitGadget
2021-02-26 1:58 ` [PATCH v3 07/10] diffcore-rename: extend cleanup_dir_rename_info() Elijah Newren via GitGitGadget
2021-02-26 1:58 ` [PATCH v3 08/10] diffcore-rename: compute dir_rename_counts in stages Elijah Newren via GitGitGadget
2021-02-26 1:58 ` [PATCH v3 09/10] diffcore-rename: limit dir_rename_counts computation to relevant dirs Elijah Newren via GitGitGadget
2021-02-26 1:58 ` [PATCH v3 10/10] diffcore-rename: compute dir_rename_guess from dir_rename_counts Elijah Newren via GitGitGadget
2021-02-26 16:34 ` [PATCH v3 00/10] Optimization batch 8: use file basenames even more Derrick Stolee
2021-02-26 19:28 ` Elijah Newren
2021-02-27 0:30 ` [PATCH v4 " Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 01/10] diffcore-rename: use directory rename guided basename comparisons Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 02/10] diffcore-rename: provide basic implementation of idx_possible_rename() Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 03/10] diffcore-rename: add a mapping of destination names to their indices Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 04/10] Move computation of dir_rename_count from merge-ort to diffcore-rename Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 05/10] diffcore-rename: add function for clearing dir_rename_count Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 06/10] diffcore-rename: move dir_rename_counts into dir_rename_info struct Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 07/10] diffcore-rename: extend cleanup_dir_rename_info() Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 08/10] diffcore-rename: compute dir_rename_counts in stages Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 09/10] diffcore-rename: limit dir_rename_counts computation to relevant dirs Elijah Newren via GitGitGadget
2021-02-27 0:30 ` [PATCH v4 10/10] diffcore-rename: compute dir_rename_guess from dir_rename_counts Elijah Newren via GitGitGadget
2021-03-09 21:52 ` [PATCH v4 00/10] Optimization batch 8: use file basenames even more Derrick Stolee
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=3a29cf9e526fba0227a7eec92c0c6bd58a7850f0.1614123848.git.gitgitgadget@gmail.com \
--to=gitgitgadget@gmail.com \
--cc=git@vger.kernel.org \
--cc=newren@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).