git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Junio C Hamano <junkio@cox.net>
To: Linus Torvalds <torvalds@osdl.org>
Cc: Git Mailing List <git@vger.kernel.org>
Subject: [PATCH 12/12] Optimize diff-tree -[CM] --stdin
Date: Fri, 27 May 2005 15:56:38 -0700	[thread overview]
Message-ID: <7vhdgo2s7t.fsf_-_@assigned-by-dhcp.cox.net> (raw)
In-Reply-To: <7vk6lk5lxt.fsf_-_@assigned-by-dhcp.cox.net> (Junio C. Hamano's message of "Fri, 27 May 2005 15:43:58 -0700")

This attempts to optimize "diff-tree -[CM] --stdin", which
compares successible tree pairs.  This optimization does not
make much sense for other commands in the diff-* brothers.

When reading from --stdin and using rename/copy detection, the
patch makes diff-tree to read the current index file first.
This is done to reuse the optimization used by diff-cache in the
non-cached case.  Similarity estimator can avoid expanding a
blob if the index says what is in the work tree has an exact
copy of that blob already expanded.

Another optimization the patch makes is to check only file sizes
first to terminate similarity estimation early.  In order for
this to work, it needs a way to tell the size of the blob
without expanding it.  Since an obvious way of doing it, which
is to keep all the blobs previously used in the memory, is too
costly, it does so by keeping the filesize for each object it
has already seen in memory.

Signed-off-by: Junio C Hamano <junkio@cox.net>
---

diff-tree.c        |    3 +
diff.c             |   83 +++++++++++++++++++++++++++++++++++++++++++++++++++--
diff.h             |    2 +
diffcore-pickaxe.c |    2 -
diffcore-rename.c  |   19 ++++++++----
diffcore.h         |    2 -
6 files changed, 102 insertions(+), 9 deletions(-)

diff --git a/diff-tree.c b/diff-tree.c
--- a/diff-tree.c
+++ b/diff-tree.c
@@ -578,6 +578,9 @@ int main(int argc, const char **argv)
 	if (!read_stdin)
 		return 0;
 
+	if (detect_rename)
+		diff_setup_opt |= (DIFF_SETUP_USE_SIZE_CACHE |
+				   DIFF_SETUP_USE_CACHE);
 	while (fgets(line, sizeof(line), stdin))
 		diff_tree_stdin(line);
 
diff --git a/diff.c b/diff.c
--- a/diff.c
+++ b/diff.c
@@ -12,6 +12,7 @@ static const char *diff_opts = "-pu";
 static unsigned char null_sha1[20] = { 0, };
 
 static int reverse_diff;
+static int use_size_cache;
 
 static const char *external_diff(void)
 {
@@ -222,12 +223,60 @@ static int work_tree_matches(const char 
 	return 1;
 }
 
+static struct sha1_size_cache {
+	unsigned char sha1[20];
+	unsigned long size;
+} **sha1_size_cache;
+static int sha1_size_cache_nr, sha1_size_cache_alloc;
+
+static struct sha1_size_cache *locate_size_cache(unsigned char *sha1,
+						 unsigned long size)
+{
+	int first, last;
+	struct sha1_size_cache *e;
+
+	first = 0;
+	last = sha1_size_cache_nr;
+	while (last > first) {
+		int next = (last + first) >> 1;
+		e = sha1_size_cache[next];
+		int cmp = memcmp(e->sha1, sha1, 20);
+		if (!cmp)
+			return e;
+		if (cmp < 0) {
+			last = next;
+			continue;
+		}
+		first = next+1;
+	}
+	/* not found */
+	if (size == UINT_MAX)
+		return NULL;
+	/* insert to make it at "first" */
+	if (sha1_size_cache_alloc <= sha1_size_cache_nr) {
+		sha1_size_cache_alloc = alloc_nr(sha1_size_cache_alloc);
+		sha1_size_cache = xrealloc(sha1_size_cache,
+					   sha1_size_cache_alloc *
+					   sizeof(*sha1_size_cache));
+	}
+	sha1_size_cache_nr++;
+	if (first < sha1_size_cache_nr)
+		memmove(sha1_size_cache + first + 1, sha1_size_cache + first,
+			(sha1_size_cache_nr - first - 1) *
+			sizeof(*sha1_size_cache));
+	e = xmalloc(sizeof(struct sha1_size_cache));
+	sha1_size_cache[first] = e;
+	memcpy(e->sha1, sha1, 20);
+	e->size = size;
+	return e;
+}
+
 /*
  * While doing rename detection and pickaxe operation, we may need to
  * grab the data for the blob (or file) for our own in-core comparison.
  * diff_filespec has data and size fields for this purpose.
  */
-int diff_populate_filespec(struct diff_filespec *s)
+int diff_populate_filespec(struct diff_filespec *s, int size_only)
 {
 	int err = 0;
 	if (!DIFF_FILE_VALID(s))
@@ -235,6 +284,9 @@ int diff_populate_filespec(struct diff_f
 	if (S_ISDIR(s->mode))
 		return -1;
 
+	if (!use_size_cache)
+		size_only = 0;
+
 	if (s->data)
 		return err;
 	if (!s->sha1_valid ||
@@ -254,6 +306,8 @@ int diff_populate_filespec(struct diff_f
 		s->size = st.st_size;
 		if (!s->size)
 			goto empty;
+		if (size_only)
+			return 0;
 		if (S_ISLNK(st.st_mode)) {
 			int ret;
 			s->data = xmalloc(s->size);
@@ -273,9 +327,21 @@ int diff_populate_filespec(struct diff_f
 		close(fd);
 	}
 	else {
+		/* We cannot do size only for SHA1 blobs */
 		char type[20];
+		struct sha1_size_cache *e;
+
+		if (size_only) {
+			e = locate_size_cache(s->sha1, UINT_MAX);
+			if (e) {
+				s->size = e->size;
+				return 0;
+			}
+		}
 		s->data = read_sha1_file(s->sha1, type, &s->size);
 		s->should_free = 1;
+		if (s->data && size_only)
+			locate_size_cache(s->sha1, s->size);
 	}
 	return 0;
 }
@@ -361,7 +427,7 @@ static void prepare_temp_file(const char
 		return;
 	}
 	else {
-		if (diff_populate_filespec(one))
+		if (diff_populate_filespec(one, 0))
 			die("cannot read data blob for %s", one->path);
 		prep_temp_blob(temp, one->data, one->size,
 			       one->sha1, one->mode);
@@ -496,6 +562,19 @@ void diff_setup(int flags)
 {
 	if (flags & DIFF_SETUP_REVERSE)
 		reverse_diff = 1;
+	if (flags & DIFF_SETUP_USE_CACHE) {
+		if (!active_cache)
+			/* read-cache does not die even when it fails
+			 * so it is safe for us to do this here.  Also
+			 * it does not smudge active_cache or active_nr
+			 * when it fails, so we do not have to worry about
+			 * cleaning it up oufselves either.
+			 */
+			read_cache();
+	}
+	if (flags & DIFF_SETUP_USE_SIZE_CACHE)
+		use_size_cache = 1;
+	
 }
 
 struct diff_queue_struct diff_queued_diff;
diff --git a/diff.h b/diff.h
--- a/diff.h
+++ b/diff.h
@@ -29,6 +29,8 @@ extern void diff_unmerge(const char *pat
 extern int diff_scoreopt_parse(const char *opt);
 
 #define DIFF_SETUP_REVERSE      	1
+#define DIFF_SETUP_USE_CACHE		2
+#define DIFF_SETUP_USE_SIZE_CACHE	4
 extern void diff_setup(int flags);
 
 #define DIFF_DETECT_RENAME	1
diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -11,7 +11,7 @@ static int contains(struct diff_filespec
 {
 	unsigned long offset, sz;
 	const char *data;
-	if (diff_populate_filespec(one))
+	if (diff_populate_filespec(one, 0))
 		return 0;
 	sz = one->size;
 	data = one->data;
diff --git a/diffcore-rename.c b/diffcore-rename.c
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -99,8 +99,11 @@ static int is_exact_match(struct diff_fi
 	if (src->sha1_valid && dst->sha1_valid &&
 	    !memcmp(src->sha1, dst->sha1, 20))
 		return 1;
-	if (diff_populate_filespec(src) || diff_populate_filespec(dst))
-		/* this is an error but will be caught downstream */
+	if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
+		return 0;
+	if (src->size != dst->size)
+		return 0;
+	if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
 		return 0;
 	if (src->size == dst->size &&
 	    !memcmp(src->data, dst->data, src->size))
@@ -125,9 +128,11 @@ static int estimate_similarity(struct di
 	 * dst, and then some edit has been applied to dst.
 	 *
 	 * Compare them and return how similar they are, representing
-	 * the score as an integer between 0 and 10000, except
-	 * where they match exactly it is considered better than anything
-	 * else.
+	 * the score as an integer between 0 and MAX_SCORE.
+	 *
+	 * When there is an exact match, it is considered a better
+	 * match than anything else; the destination does not even
+	 * call into this function in that case.
 	 */
 	void *delta;
 	unsigned long delta_size, base_size;
@@ -147,6 +152,7 @@ static int estimate_similarity(struct di
 	/* We would not consider edits that change the file size so
 	 * drastically.  delta_size must be smaller than
 	 * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
+	 *
 	 * Note that base_size == 0 case is handled here already
 	 * and the final score computation below would not have a
 	 * divide-by-zero issue.
@@ -154,6 +160,9 @@ static int estimate_similarity(struct di
 	if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
 		return 0;
 
+	if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
+		return 0; /* error but caught downstream */
+
 	delta = diff_delta(src->data, src->size,
 			   dst->data, dst->size,
 			   &delta_size);
diff --git a/diffcore.h b/diffcore.h
--- a/diffcore.h
+++ b/diffcore.h
@@ -33,7 +33,7 @@ extern struct diff_filespec *alloc_files
 extern void fill_filespec(struct diff_filespec *, const unsigned char *,
 			  unsigned short);
 
-extern int diff_populate_filespec(struct diff_filespec *);
+extern int diff_populate_filespec(struct diff_filespec *, int);
 extern void diff_free_filespec_data(struct diff_filespec *);
 
 struct diff_filepair {
------------------------------------------------


  parent reply	other threads:[~2005-05-27 22:55 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-05-27  0:41 Broken directory pathname pruning Linus Torvalds
2005-05-27  0:42 ` Junio C Hamano
2005-05-27  0:49 ` [PATCH] allow pathspec to end with a slash Junio C Hamano
2005-05-27  0:52   ` [PATCH] allow pathspec to end with a slash (take #2) Junio C Hamano
2005-05-27  6:41 ` [PATCH] Diff updates, fixing pathspec and rename/copy interaction Junio C Hamano
2005-05-27 15:56   ` Linus Torvalds
2005-05-27 18:22     ` Junio C Hamano
2005-05-27 22:43     ` [PATCH 00/12] Diff updates Junio C Hamano
2005-05-27 22:49       ` [PATCH 01/12] Fix math thinko in similarity estimator Junio C Hamano
2005-05-27 22:50       ` [PATCH 02/12] Introduce diff_free_filepair() funcion Junio C Hamano
2005-05-27 22:51       ` [PATCH 03/12] Make pathspec only care about the detination tree Junio C Hamano
2005-05-27 22:52       ` [PATCH 04/12] Remove unused rank field from diff_core structure Junio C Hamano
2005-05-27 22:53       ` [PATCH 05/12] Do not expose internal scaling to diff-helper Junio C Hamano
2005-05-27 22:54       ` [PATCH 06/12] Remove final newline from the value of xfrm_msg variable Junio C Hamano
2005-05-27 22:54       ` [PATCH 07/12] Clean up diff_setup() to make it more extensible Junio C Hamano
2005-05-27 22:55       ` [PATCH 08/12] Remove a function not used anymore Junio C Hamano
2005-05-27 22:55       ` [PATCH 09/12] Add --pickaxe-all to diff-* brothers Junio C Hamano
2005-05-27 22:55       ` [PATCH 10/12] Fix the way diffcore-rename records unremoved source Junio C Hamano
2005-05-27 22:56       ` [PATCH 11/12] Move pathspec to the beginning of the diffcore chain Junio C Hamano
2005-05-27 22:56       ` Junio C Hamano [this message]
2005-06-04  2:17         ` [PATCH 12/12] Optimize diff-tree -[CM] --stdin Yoichi Yuasa
2005-05-27 23:03       ` [PATCH 00/12] Diff updates Junio C Hamano
2005-05-28 10:11         ` [PATCH] Do not show empty diff in diff-cache uncached Junio C Hamano
2005-05-28 19:22           ` [PATCH] Diff: two fixes Junio C Hamano
2005-05-29  4:20             ` [PATCH] diff-helper: fix R/C score parsing under -z flag Junio C Hamano
2005-05-29  5:23               ` [PATCH] diff-cache: diff-patch (-p) format fixes Junio C Hamano
2005-05-29  9:10                 ` [PATCH] diff: code clean-up Junio C Hamano
2005-05-29 18:53           ` [PATCH] Do not show empty diff in diff-cache uncached Linus Torvalds
2005-05-29 20:09             ` Junio C Hamano
2005-05-29 21:52               ` Junio C Hamano
2005-05-29 23:41                 ` [PATCH 0/3] Leftover bits after 12-series Junio C Hamano
2005-05-29 23:54                   ` [PATCH 1/3] diff-helper: Fix R/C score parsing under -z flag Junio C Hamano
2005-05-29 23:56                   ` [PATCH 2/3] diff: consolidate various calls into diffcore Junio C Hamano
2005-05-29 23:56                   ` [PATCH 3/3] diff: code clean-up and removal of rename hack Junio C Hamano
2005-05-30  6:58                   ` [PATCH 0/4] Junio C Hamano
2005-05-30  7:07                     ` [PATCH 1/4] diff: further cleanup Junio C Hamano
2005-05-30  7:08                     ` [PATCH 2/4] diff: fix the culling of unneeded delete record Junio C Hamano
2005-05-30  7:08                     ` [PATCH 3/4] Add -B flag to diff-* brothers Junio C Hamano
2005-05-30  7:09                     ` [PATCH 4/4] Add -O<orderfile> option " Junio C Hamano
2005-05-30  5:34                 ` [PATCH] Do not show empty diff in diff-cache uncached Linus Torvalds
2005-05-30  5:53                   ` Junio C Hamano
2005-06-11 23:27             ` [PATCH] apply.c: tolerate diff from a dirty but unchanged path Junio C Hamano
2005-06-12 16:14               ` Linus Torvalds
2005-06-12 17:05                 ` Linus Torvalds
2005-06-12 18:34                   ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=7vhdgo2s7t.fsf_-_@assigned-by-dhcp.cox.net \
    --to=junkio@cox.net \
    --cc=git@vger.kernel.org \
    --cc=torvalds@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).