git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Linus Torvalds <torvalds@osdl.org>
To: Junio C Hamano <junkio@cox.net>
Cc: Fredrik Kuivinen <freku045@student.liu.se>,
	Git Mailing List <git@vger.kernel.org>
Subject: Re: Handling large files with GIT
Date: Wed, 15 Feb 2006 19:25:32 -0800 (PST)	[thread overview]
Message-ID: <Pine.LNX.4.64.0602151915010.916@g5.osdl.org> (raw)
In-Reply-To: <7vd5hpj6ab.fsf@assigned-by-dhcp.cox.net>



Btw, here's one last gasp on this thread: it generalizes the notion of 
traversing several trees in sync, which could be used to do the n-way diff 
for the "-c" and "--cc" style merge diffs a lot more efficiently.

I didn't check, but I'm pretty sure that this would bring the cost of 
doing the 12-way diff down to way under a second. Right now:

	[torvalds@g5 linux]$ time git-diff-tree -c 9fdb62a > /dev/null 

	real    0m1.279s
	user    0m1.272s
	sys     0m0.008s

and that's a bit too much. I'd really have expected us to be able to do
better.

It should be possible to do this as a 

	traverse_trees(12, &trees, "", combined_diff_callback);

fairly cheaply (and quickly throw away anything where any of the parents 
was the same as the result).

Junio, that "traverse_trees()" logic is totally independent of whether we 
actually do "git-merge-tree" or not, so if you want to, I could split up 
the patches the other way (and merge "traverse_trees()" first as a new 
interface, independently).

		Linus

----
git-merge-tree: generalize the "traverse <n> trees in sync" functionality

It's actually very useful for other things too. Notably, we could do the
combined diff a lot more efficiently with this.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>

diff --git a/merge-tree.c b/merge-tree.c
index 6381118..2a9a013 100644
--- a/merge-tree.c
+++ b/merge-tree.c
@@ -125,44 +125,19 @@ static void unresolved(const char *base,
 		printf("3 %06o %s %s%s\n", n[2].mode, sha1_to_hex(n[2].sha1), base, n[2].path);
 }
 
-/*
- * Merge two trees together (t[1] and t[2]), using a common base (t[0])
- * as the origin.
- *
- * This walks the (sorted) trees in lock-step, checking every possible
- * name. Note that directories automatically sort differently from other
- * files (see "base_name_compare"), so you'll never see file/directory
- * conflicts, because they won't ever compare the same.
- *
- * IOW, if a directory changes to a filename, it will automatically be
- * seen as the directory going away, and the filename being created.
- *
- * Think of this as a three-way diff.
- *
- * The output will be either:
- *  - successful merge
- *	 "0 mode sha1 filename"
- *    NOTE NOTE NOTE! FIXME! We really really need to walk the index
- *    in parallel with this too!
- * 
- *  - conflict:
- *	"1 mode sha1 filename"
- *	"2 mode sha1 filename"
- *	"3 mode sha1 filename"
- *    where not all of the 1/2/3 lines may exist, of course.
- *
- * The successful merge rules are the same as for the three-way merge
- * in git-read-tree.
- */
-static void merge_trees(struct tree_desc t[3], const char *base)
+typedef void (*traverse_callback_t)(int n, unsigned long mask, struct name_entry *entry, const char *base);
+
+static void traverse_trees(int n, struct tree_desc *t, const char *base, traverse_callback_t callback)
 {
+	struct name_entry *entry = xmalloc(n*sizeof(*entry));
+
 	for (;;) {
 		struct name_entry entry[3];
-		unsigned int mask = 0;
+		unsigned long mask = 0;
 		int i, last;
 
 		last = -1;
-		for (i = 0; i < 3; i++) {
+		for (i = 0; i < n; i++) {
 			if (!t[i].size)
 				continue;
 			entry_extract(t+i, entry+i);
@@ -182,7 +157,7 @@ static void merge_trees(struct tree_desc
 				if (cmp < 0)
 					mask = 0;
 			}
-			mask |= 1u << i;
+			mask |= 1ul << i;
 			last = i;
 		}
 		if (!mask)
@@ -192,38 +167,77 @@ static void merge_trees(struct tree_desc
 		 * Update the tree entries we've walked, and clear
 		 * all the unused name-entries.
 		 */
-		for (i = 0; i < 3; i++) {
-			if (mask & (1u << i)) {
+		for (i = 0; i < n; i++) {
+			if (mask & (1ul << i)) {
 				update_tree_entry(t+i);
 				continue;
 			}
 			entry_clear(entry + i);
 		}
+		callback(n, mask, entry, base);
+	}
+	free(entry);
+}
 
-		/* Same in both? */
-		if (same_entry(entry+1, entry+2)) {
-			if (entry[0].sha1) {
-				resolve(base, NULL, entry+1);
-				continue;
-			}
+/*
+ * Merge two trees together (t[1] and t[2]), using a common base (t[0])
+ * as the origin.
+ *
+ * This walks the (sorted) trees in lock-step, checking every possible
+ * name. Note that directories automatically sort differently from other
+ * files (see "base_name_compare"), so you'll never see file/directory
+ * conflicts, because they won't ever compare the same.
+ *
+ * IOW, if a directory changes to a filename, it will automatically be
+ * seen as the directory going away, and the filename being created.
+ *
+ * Think of this as a three-way diff.
+ *
+ * The output will be either:
+ *  - successful merge
+ *	 "0 mode sha1 filename"
+ *    NOTE NOTE NOTE! FIXME! We really really need to walk the index
+ *    in parallel with this too!
+ * 
+ *  - conflict:
+ *	"1 mode sha1 filename"
+ *	"2 mode sha1 filename"
+ *	"3 mode sha1 filename"
+ *    where not all of the 1/2/3 lines may exist, of course.
+ *
+ * The successful merge rules are the same as for the three-way merge
+ * in git-read-tree.
+ */
+static void threeway_callback(int n, unsigned long mask, struct name_entry *entry, const char *base)
+{
+	/* Same in both? */
+	if (same_entry(entry+1, entry+2)) {
+		if (entry[0].sha1) {
+			resolve(base, NULL, entry+1);
+			return;
 		}
+	}
 
-		if (same_entry(entry+0, entry+1)) {
-			if (entry[2].sha1 && !S_ISDIR(entry[2].mode)) {
-				resolve(base, entry+1, entry+2);
-				continue;
-			}
+	if (same_entry(entry+0, entry+1)) {
+		if (entry[2].sha1 && !S_ISDIR(entry[2].mode)) {
+			resolve(base, entry+1, entry+2);
+			return;
 		}
+	}
 
-		if (same_entry(entry+0, entry+2)) {
-			if (entry[1].sha1 && !S_ISDIR(entry[1].mode)) {
-				resolve(base, NULL, entry+1);
-				continue;
-			}
+	if (same_entry(entry+0, entry+2)) {
+		if (entry[1].sha1 && !S_ISDIR(entry[1].mode)) {
+			resolve(base, NULL, entry+1);
+			return;
 		}
-
-		unresolved(base, entry);
 	}
+
+	unresolved(base, entry);
+}
+
+static void merge_trees(struct tree_desc t[3], const char *base)
+{
+	traverse_trees(3, t, base, threeway_callback);
 }
 
 static void *get_tree_descriptor(struct tree_desc *desc, const char *rev)

  parent reply	other threads:[~2006-02-16  3:25 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-02-08  9:14 Handling large files with GIT Martin Langhoff
2006-02-08 11:54 ` Johannes Schindelin
2006-02-08 16:34   ` Linus Torvalds
2006-02-08 17:01     ` Linus Torvalds
2006-02-08 20:11       ` Junio C Hamano
2006-02-08 21:20 ` Florian Weimer
2006-02-08 22:35   ` Martin Langhoff
2006-02-13  1:26     ` Ben Clifford
2006-02-13  3:42       ` Linus Torvalds
2006-02-13  4:57         ` Linus Torvalds
2006-02-13  5:05           ` Linus Torvalds
2006-02-13 23:17             ` Ian Molton
2006-02-13 23:19               ` Martin Langhoff
2006-02-14 18:56               ` Johannes Schindelin
2006-02-14 19:52                 ` Linus Torvalds
2006-02-14 21:21                   ` Sam Vilain
2006-02-14 22:01                     ` Linus Torvalds
2006-02-14 22:30                       ` Junio C Hamano
2006-02-15  0:40                         ` Sam Vilain
2006-02-15  1:39                           ` Junio C Hamano
2006-02-15  4:03                             ` Sam Vilain
2006-02-15  2:07                           ` Martin Langhoff
2006-02-15  2:05                         ` Linus Torvalds
2006-02-15  2:18                           ` Linus Torvalds
2006-02-15  2:33                             ` Linus Torvalds
2006-02-15  3:58                               ` Linus Torvalds
2006-02-15  9:54                                 ` Junio C Hamano
2006-02-15 15:44                                   ` Linus Torvalds
2006-02-15 17:16                                     ` Linus Torvalds
2006-02-16  3:25                                   ` Linus Torvalds [this message]
2006-02-16  3:29                                     ` Junio C Hamano
2006-02-16 20:32                                 ` Fredrik Kuivinen
2006-02-13  5:55           ` Jeff Garzik
2006-02-13  6:07             ` Keith Packard
2006-02-14  0:07               ` Martin Langhoff
2006-02-13 16:19             ` Linus Torvalds
2006-02-13  4:40       ` Martin Langhoff
2006-02-09  4:54   ` Greg KH
2006-02-09  5:38     ` Martin Langhoff

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Pine.LNX.4.64.0602151915010.916@g5.osdl.org \
    --to=torvalds@osdl.org \
    --cc=freku045@student.liu.se \
    --cc=git@vger.kernel.org \
    --cc=junkio@cox.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).