git@vger.kernel.org list mirror (unofficial, one of many)
 help / color / Atom feed
From: Barret Rhoden <brho@google.com>
To: git@vger.kernel.org
Cc: "Ævar Arnfjörð Bjarmason"  <avarab@gmail.com>,
	David Kastrup <dak@gnu.org>, Jeff King <peff@peff.net>,
	Jeff Smith <whydoubt@gmail.com>,
	Johannes Schindelin <Johannes.Schindelin@gmx.de>,
	Junio C Hamano <gitster@pobox.com>, "René Scharfe" <l.s.r@web.de>,
	Stefan Beller <stefanbeller@gmail.com>,
	Michael Platings <michael@platin.gs>
Subject: [PATCH v8 8/9] blame: use the fingerprint heuristic to match ignored lines
Date: Mon, 10 Jun 2019 11:30:13 -0400
Message-ID: <20190610153014.42055-9-brho@google.com> (raw)
In-Reply-To: <20190610153014.42055-1-brho@google.com>

This commit integrates the fuzzy fingerprint heuristic into
guess_line_blames().

We actually make two passes.  The first pass uses the fuzzy algorithm to
find a match within the current diff chunk.  If that fails, the second
pass searches the entire parent file for the best match.

For an example of scanning the entire parent for a match, consider:

	commit-a 30) #include <sys/header_a.h>
	commit-b 31) #include <header_b.h>
	commit-c 32) #include <header_c.h>

Then commit X alphabetizes them:

	commit-X 30) #include <header_b.h>
	commit-X 31) #include <header_c.h>
	commit-X 32) #include <sys/header_a.h>

If we just check the parent's chunk (i.e. the first pass), we'd get:

	commit-b 30) #include <header_b.h>
	commit-c 31) #include <header_c.h>
	commit-X 32) #include <sys/header_a.h>

That's because commit X actually consists of two chunks: one chunk is
removing sys/header_a.h, then some context, and the second chunk is
adding sys/header_a.h.

If we scan the entire parent file, we get:

	commit-b 30) #include <header_b.h>
	commit-c 31) #include <header_c.h>
	commit-a 32) #include <sys/header_a.h>

Signed-off-by: Barret Rhoden <brho@google.com>
---
 blame.c                       | 60 ++++++++++++++++++++++++++++++++---
 t/t8014-blame-ignore-fuzzy.sh |  3 --
 2 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/blame.c b/blame.c
index 103838546e07..f81ec9a8cf80 100644
--- a/blame.c
+++ b/blame.c
@@ -990,12 +990,19 @@ static void fill_origin_fingerprints(struct blame_origin *o, mmfile_t *file)
 		return;
 	o->num_lines = find_line_starts(&line_starts, o->file.ptr,
 					o->file.size);
-	/* TODO: Will fill in fingerprints in a future commit */
+	o->fingerprints = xcalloc(sizeof(struct fingerprint), o->num_lines);
+	get_line_fingerprints(o->fingerprints, o->file.ptr, line_starts,
+			      0, o->num_lines);
 	free(line_starts);
 }
 
 static void drop_origin_fingerprints(struct blame_origin *o)
 {
+	if (o->fingerprints) {
+		free_line_fingerprints(o->fingerprints, o->num_lines);
+		o->num_lines = 0;
+		FREE_AND_NULL(o->fingerprints);
+	}
 }
 
 /*
@@ -1573,9 +1580,34 @@ static int are_lines_adjacent(struct blame_line_tracker *first,
 	       first->s_lno + 1 == second->s_lno;
 }
 
+static int scan_parent_range(struct fingerprint *p_fps,
+			     struct fingerprint *t_fps, int t_idx,
+			     int from, int nr_lines)
+{
+	int sim, p_idx;
+	#define FINGERPRINT_FILE_THRESHOLD	10
+	int best_sim_val = FINGERPRINT_FILE_THRESHOLD;
+	int best_sim_idx = -1;
+
+	for (p_idx = from; p_idx < from + nr_lines; p_idx++) {
+		sim = fingerprint_similarity(&t_fps[t_idx], &p_fps[p_idx]);
+		if (sim < best_sim_val)
+			continue;
+		/* Break ties with the closest-to-target line number */
+		if (sim == best_sim_val && best_sim_idx != -1 &&
+		    abs(best_sim_idx - t_idx) < abs(p_idx - t_idx))
+			continue;
+		best_sim_val = sim;
+		best_sim_idx = p_idx;
+	}
+	return best_sim_idx;
+}
+
 /*
- * This cheap heuristic assigns lines in the chunk to their relative location in
- * the parent's chunk.  Any additional lines are left with the target.
+ * The first pass checks the blame entry (from the target) against the parent's
+ * diff chunk.  If that fails for a line, the second pass tries to match that
+ * line to any part of parent file.  That catches cases where a change was
+ * broken into two chunks by 'context.'
  */
 static void guess_line_blames(struct blame_origin *parent,
 			      struct blame_origin *target,
@@ -1584,11 +1616,22 @@ static void guess_line_blames(struct blame_origin *parent,
 {
 	int i, best_idx, target_idx;
 	int parent_slno = tlno + offset;
+	int *fuzzy_matches;
 
+	fuzzy_matches = fuzzy_find_matching_lines(parent, target,
+						  tlno, parent_slno, same,
+						  parent_len);
 	for (i = 0; i < same - tlno; i++) {
 		target_idx = tlno + i;
-		best_idx = target_idx + offset;
-		if (best_idx < parent_slno + parent_len) {
+		if (fuzzy_matches && fuzzy_matches[i] >= 0) {
+			best_idx = fuzzy_matches[i];
+		} else {
+			best_idx = scan_parent_range(parent->fingerprints,
+						     target->fingerprints,
+						     target_idx, 0,
+						     parent->num_lines);
+		}
+		if (best_idx >= 0) {
 			line_blames[i].is_parent = 1;
 			line_blames[i].s_lno = best_idx;
 		} else {
@@ -1596,6 +1639,7 @@ static void guess_line_blames(struct blame_origin *parent,
 			line_blames[i].s_lno = target_idx;
 		}
 	}
+	free(fuzzy_matches);
 }
 
 /*
@@ -2372,6 +2416,12 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
 			if (!porigin)
 				continue;
 			pass_blame_to_parent(sb, origin, porigin, 1);
+			/*
+			 * Preemptively drop porigin so we can refresh the
+			 * fingerprints if we use the parent again, which can
+			 * occur if you ignore back-to-back commits.
+			 */
+			drop_origin_blob(porigin);
 			if (!origin->suspects)
 				goto finish;
 		}
diff --git a/t/t8014-blame-ignore-fuzzy.sh b/t/t8014-blame-ignore-fuzzy.sh
index 1d8fa1da74c9..1ff59624e915 100755
--- a/t/t8014-blame-ignore-fuzzy.sh
+++ b/t/t8014-blame-ignore-fuzzy.sh
@@ -3,9 +3,6 @@
 test_description='git blame ignore fuzzy heuristic'
 . ./test-lib.sh
 
-# short circuit until blame has the fuzzy capabilities
-test_done
-
 pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/'
 
 # Each test is composed of 4 variables:
-- 
2.22.0.rc2.383.gf4fbbf30c2-goog


  parent reply index

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-10 15:30 [PATCH v8 0/9] blame: add the ability to ignore commits Barret Rhoden
2019-06-10 15:30 ` [PATCH v8 1/9] fsck: rename and touch up init_skiplist() Barret Rhoden
2019-06-10 15:30 ` [PATCH v8 2/9] Move oidset_parse_file() to oidset.c Barret Rhoden
2019-06-10 15:30 ` [PATCH v8 3/9] blame: use a helper function in blame_chunk() Barret Rhoden
2019-06-10 15:30 ` [PATCH v8 4/9] blame: add the ability to ignore commits and their changes Barret Rhoden
2019-06-10 15:30 ` [PATCH v8 5/9] blame: add config options for the output of ignored or unblamable lines Barret Rhoden
2019-06-10 15:30 ` [PATCH v8 6/9] blame: optionally track line fingerprints during fill_blame_origin() Barret Rhoden
2019-06-10 15:30 ` [PATCH v8 7/9] blame: add a fingerprint heuristic to match ignored lines Barret Rhoden
2019-06-13 15:17   ` SZEDER Gábor
2019-06-13 16:38     ` Junio C Hamano
2019-06-16 20:44     ` [PATCH] t8014: avoid git command in upstream pipe michael
2019-06-17 15:03       ` Barret Rhoden
2019-06-10 15:30 ` Barret Rhoden [this message]
2019-06-10 15:30 ` [PATCH v8 9/9] blame: add a test to cover blame_coalesce() Barret Rhoden

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190610153014.42055-9-brho@google.com \
    --to=brho@google.com \
    --cc=Johannes.Schindelin@gmx.de \
    --cc=avarab@gmail.com \
    --cc=dak@gnu.org \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=l.s.r@web.de \
    --cc=michael@platin.gs \
    --cc=peff@peff.net \
    --cc=stefanbeller@gmail.com \
    --cc=whydoubt@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

git@vger.kernel.org list mirror (unofficial, one of many)

Archives are clonable:
	git clone --mirror https://public-inbox.org/git
	git clone --mirror http://ou63pmih66umazou.onion/git
	git clone --mirror http://czquwvybam4bgbro.onion/git
	git clone --mirror http://hjrcffqmbrq6wope.onion/git

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.version-control.git
	nntp://ou63pmih66umazou.onion/inbox.comp.version-control.git
	nntp://czquwvybam4bgbro.onion/inbox.comp.version-control.git
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.version-control.git
	nntp://news.gmane.org/gmane.comp.version-control.git

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git