From: Barret Rhoden <brho@google.com>
To: git@vger.kernel.org
Cc: "Ævar Arnfjörð Bjarmason" <avarab@gmail.com>,
"David Kastrup" <dak@gnu.org>, "Jeff King" <peff@peff.net>,
"Jeff Smith" <whydoubt@gmail.com>,
"Johannes Schindelin" <Johannes.Schindelin@gmx.de>,
"Junio C Hamano" <gitster@pobox.com>,
"René Scharfe" <l.s.r@web.de>,
"Stefan Beller" <stefanbeller@gmail.com>,
"Michael Platings" <michael@platin.gs>
Subject: [PATCH v7 8/8] blame: use the fingerprint heuristic to match ignored lines
Date: Wed, 15 May 2019 17:45:03 -0400 [thread overview]
Message-ID: <20190515214503.77162-9-brho@google.com> (raw)
In-Reply-To: <20190515214503.77162-1-brho@google.com>
This commit integrates the fuzzy fingerprint heuristic into
guess_line_blames().
We actually make two passes. The first pass uses the fuzzy algorithm to
find a match within the current diff chunk. If that fails, the second
pass searches the entire parent file for the best match.
For an example of scanning the entire parent for a match, consider:
commit-a 30) #include <sys/header_a.h>
commit-b 31) #include <header_b.h>
commit-c 32) #include <header_c.h>
Then commit X alphabetizes them:
commit-X 30) #include <header_b.h>
commit-X 31) #include <header_c.h>
commit-X 32) #include <sys/header_a.h>
If we just check the parent's chunk (i.e. the first pass), we'd get:
commit-b 30) #include <header_b.h>
commit-c 31) #include <header_c.h>
commit-X 32) #include <sys/header_a.h>
That's because commit X actually consists of two chunks: one chunk is
removing sys/header_a.h, then some context, and the second chunk is
adding sys/header_a.h.
If we scan the entire parent file, we get:
commit-b 30) #include <header_b.h>
commit-c 31) #include <header_c.h>
commit-a 32) #include <sys/header_a.h>
Signed-off-by: Barret Rhoden <brho@google.com>
---
blame.c | 60 ++++++++++++++++++++++++++++++++---
t/t8014-blame-ignore-fuzzy.sh | 3 --
2 files changed, 55 insertions(+), 8 deletions(-)
diff --git a/blame.c b/blame.c
index 7ef283166f29..e20d88dd74d8 100644
--- a/blame.c
+++ b/blame.c
@@ -998,12 +998,19 @@ static void fill_origin_fingerprints(struct blame_origin *o, mmfile_t *file)
return;
o->num_lines = find_line_starts(&line_starts, o->file.ptr,
o->file.size);
- /* TODO: Will fill in fingerprints in a future commit */
+ o->fingerprints = xcalloc(sizeof(struct fingerprint), o->num_lines);
+ get_line_fingerprints(o->fingerprints, o->file.ptr, line_starts,
+ 0, o->num_lines);
free(line_starts);
}
static void drop_origin_fingerprints(struct blame_origin *o)
{
+ if (o->fingerprints) {
+ free_line_fingerprints(o->fingerprints, o->num_lines);
+ o->num_lines = 0;
+ FREE_AND_NULL(o->fingerprints);
+ }
}
/*
@@ -1581,9 +1588,34 @@ static int are_lines_adjacent(struct blame_line_tracker *first,
first->s_lno + 1 == second->s_lno;
}
+static int scan_parent_range(struct fingerprint *p_fps,
+ struct fingerprint *t_fps, int t_idx,
+ int from, int nr_lines)
+{
+ int sim, p_idx;
+ #define FINGERPRINT_FILE_THRESHOLD 10
+ int best_sim_val = FINGERPRINT_FILE_THRESHOLD;
+ int best_sim_idx = -1;
+
+ for (p_idx = from; p_idx < from + nr_lines; p_idx++) {
+ sim = fingerprint_similarity(&t_fps[t_idx], &p_fps[p_idx]);
+ if (sim < best_sim_val)
+ continue;
+ /* Break ties with the closest-to-target line number */
+ if (sim == best_sim_val && best_sim_idx != -1 &&
+ abs(best_sim_idx - t_idx) < abs(p_idx - t_idx))
+ continue;
+ best_sim_val = sim;
+ best_sim_idx = p_idx;
+ }
+ return best_sim_idx;
+}
+
/*
- * This cheap heuristic assigns lines in the chunk to their relative location in
- * the parent's chunk. Any additional lines are left with the target.
+ * The first pass checks the blame entry (from the target) against the parent's
+ * diff chunk. If that fails for a line, the second pass tries to match that
+ * line to any part of parent file. That catches cases where a change was
+ * broken into two chunks by 'context.'
*/
static void guess_line_blames(struct blame_origin *parent,
struct blame_origin *target,
@@ -1592,11 +1624,22 @@ static void guess_line_blames(struct blame_origin *parent,
{
int i, best_idx, target_idx;
int parent_slno = tlno + offset;
+ int *fuzzy_matches;
+ fuzzy_matches = fuzzy_find_matching_lines(parent, target,
+ tlno, parent_slno, same,
+ parent_len);
for (i = 0; i < same - tlno; i++) {
target_idx = tlno + i;
- best_idx = target_idx + offset;
- if (best_idx < parent_slno + parent_len) {
+ if (fuzzy_matches && fuzzy_matches[i] >= 0) {
+ best_idx = fuzzy_matches[i];
+ } else {
+ best_idx = scan_parent_range(parent->fingerprints,
+ target->fingerprints,
+ target_idx, 0,
+ parent->num_lines);
+ }
+ if (best_idx >= 0) {
line_blames[i].is_parent = 1;
line_blames[i].s_lno = best_idx;
} else {
@@ -1604,6 +1647,7 @@ static void guess_line_blames(struct blame_origin *parent,
line_blames[i].s_lno = target_idx;
}
}
+ free(fuzzy_matches);
}
/*
@@ -2380,6 +2424,12 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
if (!porigin)
continue;
pass_blame_to_parent(sb, origin, porigin, 1);
+ /*
+ * Preemptively drop porigin so we can refresh the
+ * fingerprints if we use the parent again, which can
+ * occur if you ignore back-to-back commits.
+ */
+ drop_origin_blob(porigin);
if (!origin->suspects)
goto finish;
}
diff --git a/t/t8014-blame-ignore-fuzzy.sh b/t/t8014-blame-ignore-fuzzy.sh
index 7c09aeb9e14d..c49971909a27 100755
--- a/t/t8014-blame-ignore-fuzzy.sh
+++ b/t/t8014-blame-ignore-fuzzy.sh
@@ -3,9 +3,6 @@
test_description='git blame ignore fuzzy heuristic'
. ./test-lib.sh
-# short circuit until blame has the fuzzy capabilities
-test_done
-
pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/'
# Each test is composed of 4 variables:
--
2.21.0.1020.gf2820cf01a-goog
prev parent reply other threads:[~2019-05-15 21:45 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-05-15 21:44 [PATCH v7 0/8] blame: add the ability to ignore commits Barret Rhoden
2019-05-15 21:44 ` [PATCH v7 1/8] fsck: rename and touch up init_skiplist() Barret Rhoden
2019-05-15 21:44 ` [PATCH v7 2/8] Move oidset_parse_file() to oidset.c Barret Rhoden
2019-05-15 21:44 ` [PATCH v7 3/8] blame: use a helper function in blame_chunk() Barret Rhoden
2019-05-15 21:44 ` [PATCH v7 4/8] blame: add the ability to ignore commits and their changes Barret Rhoden
2019-05-15 21:45 ` [PATCH v7 5/8] blame: add config options for the output of ignored or unblamable lines Barret Rhoden
2019-05-15 21:45 ` [PATCH v7 6/8] blame: optionally track line fingerprints during fill_blame_origin() Barret Rhoden
2019-05-15 21:45 ` [PATCH v7 7/8] blame: add a fingerprint heuristic to match ignored lines Barret Rhoden
2019-05-16 7:49 ` Junio C Hamano
2019-05-16 21:57 ` [PATCH v7 7/8 (edit)] " Barret Rhoden
2019-05-17 5:12 ` Junio C Hamano
2019-05-20 20:32 ` Michael Platings
2019-05-20 20:34 ` Barret Rhoden
2019-05-28 16:39 ` Junio C Hamano
2019-05-15 21:45 ` Barret Rhoden [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190515214503.77162-9-brho@google.com \
--to=brho@google.com \
--cc=Johannes.Schindelin@gmx.de \
--cc=avarab@gmail.com \
--cc=dak@gnu.org \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=l.s.r@web.de \
--cc=michael@platin.gs \
--cc=peff@peff.net \
--cc=stefanbeller@gmail.com \
--cc=whydoubt@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).