git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [WIP PATCH] Manual rename correction
@ 2012-07-31 14:15 Nguyen Thai Ngoc Duy
  2012-07-31 16:32 ` Junio C Hamano
  0 siblings, 1 reply; 35+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-07-31 14:15 UTC (permalink / raw
  To: git

Git's rename detection is good but still not perfect. There have been
a few times I wanted to correct git for better output but I
couldn't. This PoC WIP patch attempts to address that. It allows to
break/rearrange any file pairs. We can do something crazy like this:

 attr.c => dir.c      | 1786 ++++++++++++++++++++++++++++++++-----------------
 dir.c => attr.c      | 1788 +++++++++++++++++---------------------------------
 t/t1306-xdg-files.sh |   39 ++
 t/test-lib.sh        |    1 +
 4 files changed, 1828 insertions(+), 1786 deletions(-)

The above output is done with "git diff --manual-rename=foo A B"
and "foo" contains (probably not in the best format though)

-- 8< --
attr.c dir.c
dir.c attr.c
-- 8< --

The plan is to use git-notes to record rename corrections like above
so that "git log --patch" for example can make use of them. I'm not
sure what to do with merge commits yet (can we track renames in a
merge?). We can generate rename file from "diff -CM", then users can
edit and save it.

If you want to diff between two arbitrary trees, you'll have to feed
rename corrections via command line as git-notes are for commit diff
only.

In some cases, manual rename may be cheaper than --find-copies-harder,
so this feature could help reduce cpu usage. Though that's not my main
aim.

Oh and I think rename detection in diff other than tree-tree does not
work. Maybe I tested it the wrong way?

Comments?

-- 8< --
diff --git a/diff.c b/diff.c
index 62cbe14..c8d55d2 100644
--- a/diff.c
+++ b/diff.c
@@ -3547,6 +3547,12 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
 		DIFF_OPT_SET(options, RENAME_EMPTY);
 	else if (!strcmp(arg, "--no-rename-empty"))
 		DIFF_OPT_CLR(options, RENAME_EMPTY);
+	else if (!prefixcmp(arg, "--manual-rename=")) {
+		int ret = strbuf_read_file(&options->renames, arg + 16, 0);
+		if (ret == -1)
+			die("unable to read %s", arg + 16);
+		DIFF_OPT_SET(options, MANUAL_RENAME);
+	}
 	else if (!strcmp(arg, "--relative"))
 		DIFF_OPT_SET(options, RELATIVE_NAME);
 	else if (!prefixcmp(arg, "--relative=")) {
@@ -4621,6 +4627,8 @@ void diffcore_std(struct diff_options *options)
 	if (options->skip_stat_unmatch)
 		diffcore_skip_stat_unmatch(options);
 	if (!options->found_follow) {
+		if (DIFF_OPT_TST(options, MANUAL_RENAME))
+			diffcore_manual_rename(options);
 		/* See try_to_follow_renames() in tree-diff.c */
 		if (options->break_opt != -1)
 			diffcore_break(options->break_opt);
diff --git a/diff.h b/diff.h
index e027650..60d104e 100644
--- a/diff.h
+++ b/diff.h
@@ -61,6 +61,7 @@ typedef struct strbuf *(*diff_prefix_fn_t)(struct diff_options *opt, void *data)
 #define DIFF_OPT_FIND_COPIES_HARDER  (1 <<  6)
 #define DIFF_OPT_FOLLOW_RENAMES      (1 <<  7)
 #define DIFF_OPT_RENAME_EMPTY        (1 <<  8)
+#define DIFF_OPT_MANUAL_RENAME       (1 <<  9)
 /* (1 <<  9) unused */
 #define DIFF_OPT_HAS_CHANGES         (1 << 10)
 #define DIFF_OPT_QUICK               (1 << 11)
@@ -147,6 +148,7 @@ struct diff_options {
 	int close_file;
 
 	struct pathspec pathspec;
+	struct strbuf renames;
 	change_fn_t change;
 	add_remove_fn_t add_remove;
 	diff_format_fn_t format_callback;
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 216a7a4..05da99f 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -722,3 +722,148 @@ void diffcore_rename(struct diff_options *options)
 	rename_src_nr = rename_src_alloc = 0;
 	return;
 }
+
+struct rename {
+	char *one, *two;
+	struct rename *next_one, *next_two;
+	struct diff_filespec *spec_one;
+	struct diff_filespec *spec_two;
+};
+
+static unsigned int string_hash(const char *s)
+{
+	unsigned int v = 1;
+	while (s && *s)
+		v += (unsigned char)*s++;
+	return v;
+}
+
+void diffcore_manual_rename(struct diff_options *options)
+{
+	struct rename *renames = NULL;
+	int i, nr = 0, alloc = 0;
+	const char *next, *p, *end;
+	struct hash_table hash_one, hash_two;
+	struct diff_queue_struct *q = &diff_queued_diff;
+	struct diff_queue_struct outq;
+
+	/* parse rename instructions */
+	end = options->renames.buf + options->renames.len;
+	for (p = options->renames.buf; p < end; p = next) {
+		struct rename *r;
+		const char *sep, *nl, *next_sep;
+
+		nl = strchr(p, '\n');
+		if (!nl)
+			nl = next = end;
+		else {
+			next = nl + 1;
+			if (p == nl)
+				continue;
+		}
+
+		/* one space to separate two paths (for now, quoting can come later) */
+		sep = strchr(p, ' ');
+		if (!sep || sep >= nl)
+			die("invalid syntax");
+		next_sep = strchr(sep + 1, ' ');
+		if (next_sep && next_sep < nl)
+			die("invalid syntax");
+
+		ALLOC_GROW(renames, nr + 1, alloc);
+		r = renames + nr++;
+		memset(r, 0, sizeof(*r));
+		if (p < sep)
+			r->one = xstrndup(p, sep - p);
+		if (sep < nl)
+			r->two = xstrndup(sep + 1, nl - (sep + 1));
+	}
+
+
+	/* initialize hash tables */
+	init_hash(&hash_one);
+	init_hash(&hash_two);
+	for (i = 0; i < nr; i++) {
+		struct rename *r = renames + i;
+		void** p;
+		p = insert_hash(string_hash(r->one), r, &hash_one);
+		if (p)  {
+			r->next_one = *p;
+			*p = r;
+		}
+		p = insert_hash(string_hash(r->two), r, &hash_two);
+		if (p) {
+			r->next_two = *p;
+			*p = r;
+		}
+	}
+
+	/* rename */
+	DIFF_QUEUE_CLEAR(&outq);
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		struct rename *r1 = NULL, *r2 = NULL;
+		int hash, skip = 0;
+		if (DIFF_PAIR_UNMERGED(p))
+			continue;
+		if (DIFF_FILE_VALID(p->one)) {
+			hash = string_hash(p->one->path);
+			r1 = lookup_hash(hash, &hash_one);
+			while (r1) {
+				if (!strcmp(p->one->path, r1->one)) {
+					r1->spec_one = p->one;
+					skip = 1;
+				}
+				r1 = r1->next_one;
+			}
+		}
+		if (DIFF_FILE_VALID(p->two)) {
+			hash = string_hash(p->two->path);
+			r2 = lookup_hash(hash, &hash_two);
+			while (r2) {
+				if (!strcmp(p->two->path, r2->two)) {
+					r2->spec_two = p->two;
+					skip = 1;
+				}
+				r2 = r2->next_two;
+			}
+		}
+
+		/* This pair has nothing to do with manual renames,
+		   reinsert it */
+		if (!skip)
+			diff_q(&outq, p);
+	}
+	free(q->queue);
+
+	for (i = 0; i < nr; i++) {
+		struct rename *r = renames + i;
+		struct diff_filepair *dp;
+		if (r->spec_one && r->spec_two) {
+			dp = diff_queue(&outq, r->spec_one, r->spec_two);
+			dp->renamed_pair = 1;
+			dp->score = MAX_SCORE;
+		} else if (r->spec_one && !r->two) {
+			dp = diff_queue(&outq, r->spec_one,
+					alloc_filespec(r->one));
+		} else if (!r->one && r->spec_two) {
+			dp = diff_queue(&outq, alloc_filespec(r->two),
+					r->spec_two);
+		} else {
+			die("incorrect rename %s %s", r->one, r->two);
+		}
+	}
+	*q = outq;
+	/* required? */
+	diffcore_fix_diff_index(options);
+
+	/* cleanup */
+	for (i = 0; i < nr; i++) {
+		struct rename *r = renames + i;
+		free(r->one);
+		free(r->two);
+	}
+	free(renames);
+	free_hash(&hash_one);
+	free_hash(&hash_two);
+}
diff --git a/diffcore.h b/diffcore.h
index be0739c..193bc67 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -107,6 +107,7 @@ extern void diff_q(struct diff_queue_struct *, struct diff_filepair *);
 
 extern void diffcore_break(int);
 extern void diffcore_rename(struct diff_options *);
+extern void diffcore_manual_rename(struct diff_options *);
 extern void diffcore_merge_broken(void);
 extern void diffcore_pickaxe(struct diff_options *);
 extern void diffcore_order(const char *orderfile);
-- 8< --
-- 
Duy

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-07-31 14:15 [WIP PATCH] Manual rename correction Nguyen Thai Ngoc Duy
@ 2012-07-31 16:32 ` Junio C Hamano
  2012-07-31 19:23   ` Jeff King
  0 siblings, 1 reply; 35+ messages in thread
From: Junio C Hamano @ 2012-07-31 16:32 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: git

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> The above output is done with "git diff --manual-rename=foo A B"
> and "foo" contains (probably not in the best format though)
>
> -- 8< --
> attr.c dir.c
> dir.c attr.c
> -- 8< --
> ...
> Comments?

It is a good direction to go in, I would think, to give users a way
to explicitly tell that "in comparison between these two trees, I
know path B in the postimage corresponds to path A in the preimage".

I however wonder why you did this as a separate function that only
does the explicitly marked ones.  Probably it was easier as a POC to
do it this way, and that is fine.

The real version should do this in the same diffcore_rename()
function, by excluding the paths that the user explicitly told you
about from the the automatic matching logic, and instead matching
them up manually; then you can let the remainder of the paths be
paired by the existing code.

Notice how the non-nullness of rename_dst[i].pair is used as a cue
to skip the similarity computation in the expensive matrix part of
diffcore_rename()?  That comes from find_exact_renames() that is
called earlier in the function.  I would imagine that your logic
would fit _before_ we call find_exact_renames() as a call to a new
helper function (e.g. "record_explicit_renames()" perhaps).
Anything that reduces the cost in the matrix part should come
earlier, as that reduces the number of pairs we would need to try
matching up.

We might want to introduce a way to express the similarity score for
such a filepair that was manually constructed when showing the
result, though.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-07-31 16:32 ` Junio C Hamano
@ 2012-07-31 19:23   ` Jeff King
  2012-07-31 20:20     ` Junio C Hamano
  2012-08-01  1:10     ` Nguyen Thai Ngoc Duy
  0 siblings, 2 replies; 35+ messages in thread
From: Jeff King @ 2012-07-31 19:23 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Nguyen Thai Ngoc Duy, git

On Tue, Jul 31, 2012 at 09:32:49AM -0700, Junio C Hamano wrote:

> Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:
> 
> > The above output is done with "git diff --manual-rename=foo A B"
> > and "foo" contains (probably not in the best format though)
> >
> > -- 8< --
> > attr.c dir.c
> > dir.c attr.c
> > -- 8< --
> > ...
> > Comments?
> 
> It is a good direction to go in, I would think, to give users a way
> to explicitly tell that "in comparison between these two trees, I
> know path B in the postimage corresponds to path A in the preimage".

I do not think that is the right direction. Let's imagine that I have a
commit "A" and I annotate it (via notes or whatever) to say "between
A^^{tree} and A^{tree}, foo.c became bar.c". That will help me when
doing "git show" or "git log". But it will not help me when I later try
to merge "A" (or its descendent). In that case, I will compute the diff
between "A" and the merge-base (or worse, some descendent of "A" and the
merge-base), and I will miss this hint entirely.

A much better hint is to annotate pairs of sha1s, to say "do not bother
doing inexact rename correlation on this pair; I promise that they have
value N". Then it will find that pair no matter which trees or commits
are being diffed, and it will do so relatively inexpensively[1].

That is not fool-proof, of course. You might have a manual rename from
sha1 X to sha1 Y, and then a slight modification to Y to make Z. So you
would want some kind of transitivity to notice that X and Z correlate.
I think you could model it as a graph problem; sha1s are nodes, and each
"this is a rename" pair of annotated sha1s has an edge between them.
They are the "same file" if there is a path.

Of course that gives you bizarre and counter-intuitive results, because
X and Z might not actually be that similar. And that is why we have
rename detection in the first place. The idea of file identity (which
this fundamentally is) leads to these sorts of weird results.

I'm sure you could get better results by weakening the transitivity
according to the rename score, or something like that. But now you are
getting pretty complex.

-Peff

[1] We could actually cache rename results by storing pairs of sha1s
    along with their rename score, and should be able to get a good
    speedup (we are still src*dst in comparing, but now the comparison
    is a simple table lookup rather than loading the blobs and computing
    the differences). If we had such a cache, then manually marking a
    rename would just be a matter of priming the cache with your manual
    entries.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-07-31 19:23   ` Jeff King
@ 2012-07-31 20:20     ` Junio C Hamano
  2012-08-01  0:42       ` Jeff King
  2012-08-01  1:10     ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 35+ messages in thread
From: Junio C Hamano @ 2012-07-31 20:20 UTC (permalink / raw
  To: Jeff King; +Cc: Nguyen Thai Ngoc Duy, git

Jeff King <peff@peff.net> writes:

> A much better hint is to annotate pairs of sha1s, to say "do not bother
> doing inexact rename correlation on this pair; I promise that they have
> value N".

Surely.  And I suspect that the patch to the current codebase to do
so would be much less impact if you go that way.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-07-31 20:20     ` Junio C Hamano
@ 2012-08-01  0:42       ` Jeff King
  2012-08-01  6:01         ` Junio C Hamano
  0 siblings, 1 reply; 35+ messages in thread
From: Jeff King @ 2012-08-01  0:42 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Nguyen Thai Ngoc Duy, git

On Tue, Jul 31, 2012 at 01:20:42PM -0700, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > A much better hint is to annotate pairs of sha1s, to say "do not bother
> > doing inexact rename correlation on this pair; I promise that they have
> > value N".
> 
> Surely.  And I suspect that the patch to the current codebase to do
> so would be much less impact if you go that way.

Yes. You may remember I wrote a generic caching subsystem last summer
when we were talking about caching commit generations. Plugging in a new
map type to map sha1 pairs to 32-bit integers was pretty simple, and
that gives the basis for a rename cache.

It's fairly unimpressive on git.git. My best-of-five for "git log
--format=%H --raw -M" went from 5.83s to 5.74s, which is pretty much
within the run-to-run noise. The resulting cache was 155K.

However, it's easy to come up with much more pathological cases. I have
a really ugly rename-and-tweak-tags commit on my photo repository, and
those blobs are relatively big. My timings for "git show" on that were:

  before: 49.724s
  after, run 1: 54.904s
  after, run 2:  0.117s

Which is pretty darn nice. The resulting cache is 5.3M (the repository
itself is in the gigabytes, but that's not really relevant; the cache
will obviously scale with the number of paths, not with the size of the
blobs).

It would also work for copies, too, of course. Here are the results of
"git log --format=%H --raw -M -C -C" on git.git:

  before: 1m35s
  after, run 1: 39.7s
  after, run 2: 39.5s

So it does make much more of a difference for copies, which is obvious;
git is doing a lot more work for us to cache. At the same time, our
cache is much bigger: 32M. Yikes.

My cache is fairly naive, in that it literally stores 44 bytes of
<src_sha1, dst_sha1, score> for each entry. At the cost of more
complexity, you could store each src_sha1 once, followed by a set of
<dst_sha1, score> pairs. I also didn't take any special care to avoid
duplicates of <X, Y> and <Y, X> (since presumably these renames would be
commutative). I'm not sure it is necessary, though; I think the copy
machinery already suppresses this when entries are in both source and
destination lists.

So I don't know. It can definitely speed up some operations, but at the
cost of a non-trivial cache on disk. I'll spare you all of the generic
caching infrastructure, but the actual patch to rename looks like this
(just to give a sense of where the hooks go):

diff --git a/diffcore-rename.c b/diffcore-rename.c
index 216a7a4..db70878 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -6,6 +6,7 @@
 #include "diffcore.h"
 #include "hash.h"
 #include "progress.h"
+#include "metadata-cache.h"
 
 /* Table of rename/copy destinations */
 
@@ -137,7 +138,8 @@ static int estimate_similarity(struct diff_filespec *src,
 	 */
 	unsigned long max_size, delta_size, base_size, src_copied, literal_added;
 	unsigned long delta_limit;
-	int score;
+	uint32_t score;
+	struct sha1pair pair;
 
 	/* We deal only with regular files.  Symlink renames are handled
 	 * only when they are exact matches --- in other words, no edits
@@ -175,6 +177,11 @@ static int estimate_similarity(struct diff_filespec *src,
 	if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
 		return 0;
 
+	hashcpy(pair.one, src->sha1);
+	hashcpy(pair.two, dst->sha1);
+	if (rename_cache_get(&pair, &score))
+		return score;
+
 	if (!src->cnt_data && diff_populate_filespec(src, 0))
 		return 0;
 	if (!dst->cnt_data && diff_populate_filespec(dst, 0))
@@ -195,6 +202,7 @@ static int estimate_similarity(struct diff_filespec *src,
 		score = 0; /* should not happen */
 	else
 		score = (int)(src_copied * MAX_SCORE / max_size);
+	rename_cache_set(&pair, &score);
 	return score;
 }
 
-Peff

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-07-31 19:23   ` Jeff King
  2012-07-31 20:20     ` Junio C Hamano
@ 2012-08-01  1:10     ` Nguyen Thai Ngoc Duy
  2012-08-01  2:01       ` Jeff King
  1 sibling, 1 reply; 35+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-08-01  1:10 UTC (permalink / raw
  To: Jeff King; +Cc: Junio C Hamano, git

On Wed, Aug 1, 2012 at 2:23 AM, Jeff King <peff@peff.net> wrote:
>> It is a good direction to go in, I would think, to give users a way
>> to explicitly tell that "in comparison between these two trees, I
>> know path B in the postimage corresponds to path A in the preimage".
>
> I do not think that is the right direction. Let's imagine that I have a
> commit "A" and I annotate it (via notes or whatever) to say "between
> A^^{tree} and A^{tree}, foo.c became bar.c". That will help me when
> doing "git show" or "git log". But it will not help me when I later try
> to merge "A" (or its descendent). In that case, I will compute the diff
> between "A" and the merge-base (or worse, some descendent of "A" and the
> merge-base), and I will miss this hint entirely.
>
> A much better hint is to annotate pairs of sha1s, to say "do not bother
> doing inexact rename correlation on this pair; I promise that they have
> value N".

I haven't had time to think it through yet but I throw my thoughts in
any way. I actually went with your approach first. But it's more
difficult to control the renaming. Assume we want to tell git to
rename SHA-1 "A" to SHA-1 "B". What happens if we have two As in the
source tree and two Bs in the target tree? What happens if two As and
one B, or one A and two Bs? What if a user defines A -> B and A -> C,
and we happen to have two As in source tree and B and C in target
tree?

There's also the problem with transferring this information. With
git-notes I think I can transfer it (though not automatically). How do
we transfer sha1 map (that you mentioned in the commit generation mail
in this thread)?

> Then it will find that pair no matter which trees or commits
> are being diffed, and it will do so relatively inexpensively[1].

But does that happen often in practice? I mean diff-ing two arbitrary
trees and expect rename correction. I disregarded it as "git log" is
my main case, but I'm just a single user..
-- 
Duy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01  1:10     ` Nguyen Thai Ngoc Duy
@ 2012-08-01  2:01       ` Jeff King
  2012-08-01  4:36         ` Nguyen Thai Ngoc Duy
  0 siblings, 1 reply; 35+ messages in thread
From: Jeff King @ 2012-08-01  2:01 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

On Wed, Aug 01, 2012 at 08:10:12AM +0700, Nguyen Thai Ngoc Duy wrote:

> > I do not think that is the right direction. Let's imagine that I have a
> > commit "A" and I annotate it (via notes or whatever) to say "between
> > A^^{tree} and A^{tree}, foo.c became bar.c". That will help me when
> > doing "git show" or "git log". But it will not help me when I later try
> > to merge "A" (or its descendent). In that case, I will compute the diff
> > between "A" and the merge-base (or worse, some descendent of "A" and the
> > merge-base), and I will miss this hint entirely.
> >
> > A much better hint is to annotate pairs of sha1s, to say "do not bother
> > doing inexact rename correlation on this pair; I promise that they have
> > value N".
> 
> I haven't had time to think it through yet but I throw my thoughts in
> any way. I actually went with your approach first. But it's more
> difficult to control the renaming. Assume we want to tell git to
> rename SHA-1 "A" to SHA-1 "B". What happens if we have two As in the
> source tree and two Bs in the target tree? What happens if two As and
> one B, or one A and two Bs? What if a user defines A -> B and A -> C,
> and we happen to have two As in source tree and B and C in target
> tree?

Yes, it disregards path totally. But if you had the exact same movement
of content from one path to another in one instance, and it is
considered a rename, wouldn't it also be a rename in a second instance?

> There's also the problem with transferring this information. With
> git-notes I think I can transfer it (though not automatically). How do
> we transfer sha1 map (that you mentioned in the commit generation mail
> in this thread)?

That is orthogonal to the issue of what is being stored. I chose my
mmap'd disk implementation because it is very fast, which makes it nice
for a performance cache. But you could store the same thing in git-notes
(indexed by dst sha1, I guess, and then pointing to a blob of (src,
score) pairs.

If you want to include path-based hints in a commit, I'd say that using
some micro-format in the commit message would be the simplest thing. But
that has been discussed before; ultimately the problem is that it only
covers _one_ diff that we do with that commit (it is probably the most
common, of course, but it doesn't cover them all).

> > Then it will find that pair no matter which trees or commits
> > are being diffed, and it will do so relatively inexpensively[1].
> 
> But does that happen often in practice? I mean diff-ing two arbitrary
> trees and expect rename correction. I disregarded it as "git log" is
> my main case, but I'm just a single user..

It happens every time merge-recursive does rename detection, which
includes "git merge" but also things like "cherry-pick".

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01  2:01       ` Jeff King
@ 2012-08-01  4:36         ` Nguyen Thai Ngoc Duy
  2012-08-01  6:09           ` Junio C Hamano
  2012-08-01 21:27           ` Jeff King
  0 siblings, 2 replies; 35+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-08-01  4:36 UTC (permalink / raw
  To: Jeff King; +Cc: Junio C Hamano, git

On Wed, Aug 1, 2012 at 9:01 AM, Jeff King <peff@peff.net> wrote:
> On Wed, Aug 01, 2012 at 08:10:12AM +0700, Nguyen Thai Ngoc Duy wrote:
>
>> > I do not think that is the right direction. Let's imagine that I have a
>> > commit "A" and I annotate it (via notes or whatever) to say "between
>> > A^^{tree} and A^{tree}, foo.c became bar.c". That will help me when
>> > doing "git show" or "git log". But it will not help me when I later try
>> > to merge "A" (or its descendent). In that case, I will compute the diff
>> > between "A" and the merge-base (or worse, some descendent of "A" and the
>> > merge-base), and I will miss this hint entirely.
>> >
>> > A much better hint is to annotate pairs of sha1s, to say "do not bother
>> > doing inexact rename correlation on this pair; I promise that they have
>> > value N".
>>
>> I haven't had time to think it through yet but I throw my thoughts in
>> any way. I actually went with your approach first. But it's more
>> difficult to control the renaming. Assume we want to tell git to
>> rename SHA-1 "A" to SHA-1 "B". What happens if we have two As in the
>> source tree and two Bs in the target tree? What happens if two As and
>> one B, or one A and two Bs? What if a user defines A -> B and A -> C,
>> and we happen to have two As in source tree and B and C in target
>> tree?
>
> Yes, it disregards path totally. But if you had the exact same movement
> of content from one path to another in one instance, and it is
> considered a rename, wouldn't it also be a rename in a second instance?

Yes. This is probably cosmetics only, but without path information, we
leave it to chance to decide which A to pair with B and C (in the
A->B, A->C case above). Wrong path might lead to funny effects (i'm
thinking of git log --follow).

>> There's also the problem with transferring this information. With
>> git-notes I think I can transfer it (though not automatically). How do
>> we transfer sha1 map (that you mentioned in the commit generation mail
>> in this thread)?

I wasn't clear. This is about transferring info across repositories.

> That is orthogonal to the issue of what is being stored. I chose my
> mmap'd disk implementation because it is very fast, which makes it nice
> for a performance cache. But you could store the same thing in git-notes
> (indexed by dst sha1, I guess, and then pointing to a blob of (src,
> score) pairs.
>
> If you want to include path-based hints in a commit, I'd say that using
> some micro-format in the commit message would be the simplest thing.

Rename correction is after the commit is created. I don't think we can
recreate commits.

> But
> that has been discussed before; ultimately the problem is that it only
> covers _one_ diff that we do with that commit (it is probably the most
> common, of course, but it doesn't cover them all).

How about we generate sha1 mapping from commit hints? We try to take
advantage of path hints when we can. Else we fall back to sha-1
mapping. This way we can transfer commit hints as git-notes to another
repo, then regenerate sha-1 mapping there. No need to transfer sha1
maps.

>> > Then it will find that pair no matter which trees or commits
>> > are being diffed, and it will do so relatively inexpensively[1].
>>
>> But does that happen often in practice? I mean diff-ing two arbitrary
>> trees and expect rename correction. I disregarded it as "git log" is
>> my main case, but I'm just a single user..
>
> It happens every time merge-recursive does rename detection, which
> includes "git merge" but also things like "cherry-pick".

Thanks. I'll look into merge/cherry-pick.
-- 
Duy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01  0:42       ` Jeff King
@ 2012-08-01  6:01         ` Junio C Hamano
  2012-08-01 21:54           ` Jeff King
  0 siblings, 1 reply; 35+ messages in thread
From: Junio C Hamano @ 2012-08-01  6:01 UTC (permalink / raw
  To: Jeff King; +Cc: Nguyen Thai Ngoc Duy, git

Jeff King <peff@peff.net> writes:

> @@ -175,6 +177,11 @@ static int estimate_similarity(struct diff_filespec *src,
>  	if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
>  		return 0;
>  
> +	hashcpy(pair.one, src->sha1);
> +	hashcpy(pair.two, dst->sha1);
> +	if (rename_cache_get(&pair, &score))
> +		return score;
> +

Random thoughts.

Even though your "rename cache" could be used to reject pairing that
the similarity estimator would otherwise give high score, I would
imagine that in practice, people would always use the mechanism to
boost the similarity score of desired pairing.  This conjecture has
a few interesting implications.

 - As we track of only the top NUM_CANDIDATE_PER_DST rename src for
   each dst (see record_if_better()), you should be able to first
   see if pairs that have dst exist in your rename cache, and
   iterate over the <src,dst> pairs, filling m[] with srcs that
   appear in this particular invocation of diff.

 - If you find NUM_CANDIDATE_PER_DST srcs from your rename cache,
   you wouldn't have to run estimate_similarity() at all, but that
   is very unlikely.  We could however declare that user configured
   similarity boost always wins computed ones, and skip estimation
   for a dst for which you find an entry in the rename cache.

 - As entries in rename cache that record high scores have names of
   "similar" blobs, pack-objects may be able to take advantage of
   this information.

 - If you declare blobs A and B are similar, it is likely that blobs
   C, D, E, ... that are created by making a series of small tweaks
   to B are also similar.  Would it make more sense to introduce a
   concept of "set of similar blobs" instead of recording pairwise
   scores for (A,B), (A,C), (A,D), ... (B,C), (B,D), ...?  If so,
   the body of per-dst loop in diffcore_rename() may become:

	if (we know where dst came from)
		continue;
	if (dst belongs to a known blob family) {
		for (each src in rename_src[]) {
			if (src belongs to the same blob family as dst)
				record it in m[];
                }
	}
	if (the above didn't record anything in m[]) {
        	... existing estimate_similarity() code ...
	}

Regarding your rename-and-tweak-exif photo sets, is the issue that
there are too many rename src/dst candidates and filling a large
matrix takes a lot of time, or tweaking exif makes the contents
unnecessarily dissimilar and causes the similarity detection to
fail?  As we still have the pathname in this codepath, I am
wondering if we would benefit from custom "content hash" that knows
the nature of payload than the built-in similarity estimator, driven
by the attribute mechanism (if the latter is the case, that is).

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01  4:36         ` Nguyen Thai Ngoc Duy
@ 2012-08-01  6:09           ` Junio C Hamano
  2012-08-01  6:34             ` Nguyen Thai Ngoc Duy
  2012-08-01 21:27           ` Jeff King
  1 sibling, 1 reply; 35+ messages in thread
From: Junio C Hamano @ 2012-08-01  6:09 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Jeff King, git

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> Yes. This is probably cosmetics only, but without path information, we
> leave it to chance to decide which A to pair with B and C (in the
> A->B, A->C case above). Wrong path might lead to funny effects (i'm
> thinking of git log --follow).

Isn't that why <A,B> and <A,C> can have different scores per object
name pair?  And if you mean by B and C the paths not object names,
and the blob at B and C are indeed identical, why would it matter?

>>> There's also the problem with transferring this information. With
>>> git-notes I think I can transfer it (though not automatically). How do
>>> we transfer sha1 map (that you mentioned in the commit generation mail
>>> in this thread)?
>
> I wasn't clear. This is about transferring info across repositories.

It is simple to define a portable format to record Jeff's rename
cache database (it could be a text blob full of "x40 x40 score"
lines), point such a blob with a special ref, and write a Porcelain
to transfer and merge across repositories, e.g.

	git fetch $there refs/rename-cache
	new=$(
	(
            git cat-file blob refs/rename-cache
            git cat-file blob FETCH_HEAD
	) | sort | uniq | git hash-object -w --stdin)
	git update-ref refs/rename-cache $new

And as Jeff said, that is orthogonal to the issue of what is being
stored.  The contents of refs/rename-cache blob could be compiled
into a binary form for quicker access at runtime in the Porcelain.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01  6:09           ` Junio C Hamano
@ 2012-08-01  6:34             ` Nguyen Thai Ngoc Duy
  2012-08-01 21:32               ` Jeff King
  0 siblings, 1 reply; 35+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-08-01  6:34 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Jeff King, git

On Wed, Aug 1, 2012 at 1:09 PM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:
>
>> Yes. This is probably cosmetics only, but without path information, we
>> leave it to chance to decide which A to pair with B and C (in the
>> A->B, A->C case above). Wrong path might lead to funny effects (i'm
>> thinking of git log --follow).
>
> Isn't that why <A,B> and <A,C> can have different scores per object
> name pair?  And if you mean by B and C the paths not object names,
> and the blob at B and C are indeed identical, why would it matter?

I don't see how scores affect that. Suppose I have two trees and a
rename-cache file:

$ git ls-tree -r HEAD^
100644 blob d00491    path1/foo
100644 blob d00491    path2/bar
$ git ls-tree -r HEAD
100644 blob 0cfbf0    path1/fooo
100644 blob 00750e    path2/barr
$ cat rename-cache
d00491 0cfbf0 <score 1>
d00491 00750e <score 2>

How can I be sure "git diff HEAD^!" will rename path1/foo =>
path1/fooo and path2/bar => path2/barr, not path1/foo => path2/barr
and path2/bar => path1/fooo?
-- 
Duy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01  4:36         ` Nguyen Thai Ngoc Duy
  2012-08-01  6:09           ` Junio C Hamano
@ 2012-08-01 21:27           ` Jeff King
  2012-08-02 12:08             ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 35+ messages in thread
From: Jeff King @ 2012-08-01 21:27 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

On Wed, Aug 01, 2012 at 11:36:00AM +0700, Nguyen Thai Ngoc Duy wrote:

> > That is orthogonal to the issue of what is being stored. I chose my
> > mmap'd disk implementation because it is very fast, which makes it nice
> > for a performance cache. But you could store the same thing in git-notes
> > (indexed by dst sha1, I guess, and then pointing to a blob of (src,
> > score) pairs.
> >
> > If you want to include path-based hints in a commit, I'd say that using
> > some micro-format in the commit message would be the simplest thing.
> 
> Rename correction is after the commit is created. I don't think we can
> recreate commits.

Yes, if you go with a commit-based approach, you can do either notes or
in-commit messages. In other words, I would break the solutions down as:

  1. Store sha1+sha1 -> score mapping (i.e., what I suggested). This is
     fundamentally a global store, not a per-commit store. For storage,
     you can do one (or a combination) of:

     a. Store the mapping in some local file. Fast, but can't be shared.

     b. Store the mapping in a note (probably indexed by the destination
        blob sha1). Less efficient, but easy to share.

  2. Store path -> path mapping. This is fundamentally a per-commit
     store, since it is valid only in the diff between two particular
     trees. For storage, you can do one (or a combination) of:

     a. Store the mapping inside the commit object. Simple to share, but
        hard to change later.

     b. Store it in a note indexed by the commit object. Slightly harder
        to share, but can change later.

I implemented (1a). Implementing (1b) would be easy, but for a full-on
cache (especially for "-C"), I think the resulting size might be
prohibitive.

All solutions under (2) suffer from the same problem: they are accurate
only for a single diff. For other diffs, you would either have to not
use the feature, or you would be stuck traversing the history and
assigning a temporary file identity (e.g., given commits A->B->C, and in
A->B we rename "foo" to "bar", the diff between A and C could discover
that A's "foo" corresponds to C's "bar").

That is slower than what we do now, but that is only one problem with
it. Another is that the history relationship between the commits might
be complex. In the example above, we had a direct-descendent
relationship. But what about one with branching, and one side does the
rename and the other does not? Or even cherry-picking from an unrelated
part of history entirely?

And I think all forms of (2) will suffer from this, no matter how you
store the data. The reason I mentioned specifically putting it in the
commit message (i.e., 2a) because that is what people suggested in the
very early days of git (e.g., saying that rename detection is OK, but
could we supplement it with hints at commit-time?). And all of the
arguments against it back then apply equally now. And apply equally to
storing it in notes, because it is just another form of the same thing.

> > But that has been discussed before; ultimately the problem is that
> > it only covers _one_ diff that we do with that commit (it is
> > probably the most common, of course, but it doesn't cover them all).
> 
> How about we generate sha1 mapping from commit hints? We try to take
> advantage of path hints when we can. Else we fall back to sha-1
> mapping. This way we can transfer commit hints as git-notes to another
> repo, then regenerate sha-1 mapping there. No need to transfer sha1
> maps.

Yes. You could definitely pre-seed the sha1 mapping from the per-commit
hints. However, if you are interested in overriding git's rename
detection, then per-commit hints will only let you do part of what you
want. For example, consider a linear history like this:

   A--B--C

And imagine that commit B renamed a file from "foo" to "bar", and that
commit C further modified "bar". Let's say that git did not detect the
rename in B, but you manually annotated it. As a result, we seed the
sha1 mapping to show that the sha1 of A:foo and the sha1 of B:bar has a
rename score of 100%.

Now I want to do a diff between A and C (e.g., for rename detection on
one side of a merge). But their sha1s are not in my mapping (because
C:bar is not the same as B:bar), and I don't find the rename. To find
it, I cannot seed a per-commit path mapping. I must seed the sha1
mapping itself.

So to adapt your per-commit mapping into a sha1 mapping, the seeding
process would have to actually walk the history, transitively creating
sha1 mapping entries (i.e., seeing that "bar" changed between B and C,
and therefore mapping A:foo to C:bar, as well).

For this reason, I'm not sure that stored overrides like this are
generally useful in the long run. I think storage is useful for
_caching_ the results, because it doesn't have to be perfect; it just
helps with some repetitive queries. Whereas for overriding, I think it
is much more interesting to override _particular_ diff. E.g., to say "I
am merging X and Y, and please pretend that Y renamed "foo" to "bar"
when you do rename detection.

And in that sense, your "git log" example can be considered a
special-case of this: you are saying that the diff from $commit to
$commit^ is done frequently, so rather than saying "please pretend..."
each time, you would like to store the information forever. And storing
it in the commit message or a note is one way of doing that.

I don't think there's anything fundamentally _wrong_ with that, but I
kind of question its usefulness. In other words, what is the point in
doing so? If it is inform the user that semantically the commit did a
rename, even though the content changed enough that rename detection
does not find it, then I would argue that you should simply state it in
the commit message (or in a human-readable git-note, if it was only
realized after the fact).

But there is not much point in making it machine-readable, since the
interesting machine-readable things we do with renames are:

  1. Show the diff against the rename src, which can often be easier to
     read. Except that if rename detection did not find it, it is
     probably _not_ going to be easier to read.

  2. Applying content to the destination of a merge. But you're almost
     never doing the diff between a commit and its parent, so the
     information would be useless.

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01  6:34             ` Nguyen Thai Ngoc Duy
@ 2012-08-01 21:32               ` Jeff King
  0 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-01 21:32 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

On Wed, Aug 01, 2012 at 01:34:23PM +0700, Nguyen Thai Ngoc Duy wrote:

> On Wed, Aug 1, 2012 at 1:09 PM, Junio C Hamano <gitster@pobox.com> wrote:
> > Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:
> >
> >> Yes. This is probably cosmetics only, but without path information, we
> >> leave it to chance to decide which A to pair with B and C (in the
> >> A->B, A->C case above). Wrong path might lead to funny effects (i'm
> >> thinking of git log --follow).
> >
> > Isn't that why <A,B> and <A,C> can have different scores per object
> > name pair?  And if you mean by B and C the paths not object names,
> > and the blob at B and C are indeed identical, why would it matter?
> 
> I don't see how scores affect that. Suppose I have two trees and a
> rename-cache file:
> 
> $ git ls-tree -r HEAD^
> 100644 blob d00491    path1/foo
> 100644 blob d00491    path2/bar
> $ git ls-tree -r HEAD
> 100644 blob 0cfbf0    path1/fooo
> 100644 blob 00750e    path2/barr
> $ cat rename-cache
> d00491 0cfbf0 <score 1>
> d00491 00750e <score 2>
> 
> How can I be sure "git diff HEAD^!" will rename path1/foo =>
> path1/fooo and path2/bar => path2/barr, not path1/foo => path2/barr
> and path2/bar => path1/fooo?

You can't. A pathless mapping can never represent that. I think my (and
Junio's argument) is more like: why would you care? And that gets back
to the statement I just made elsewhere in the thread. Which is that you
need to consider the purpose of the rename detection. For generating
diffs and for merging, it doesn't matter in the above case. If the point
is to communicate some semantics of the change (e.g., if your change was
"double the final letter of each filename, and also make a small change),
then that is what the commit message is for.

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01  6:01         ` Junio C Hamano
@ 2012-08-01 21:54           ` Jeff King
  2012-08-01 22:10             ` Junio C Hamano
  2012-08-02  5:33             ` Junio C Hamano
  0 siblings, 2 replies; 35+ messages in thread
From: Jeff King @ 2012-08-01 21:54 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Nguyen Thai Ngoc Duy, git

On Tue, Jul 31, 2012 at 11:01:27PM -0700, Junio C Hamano wrote:

> > @@ -175,6 +177,11 @@ static int estimate_similarity(struct diff_filespec *src,
> >  	if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
> >  		return 0;
> >  
> > +	hashcpy(pair.one, src->sha1);
> > +	hashcpy(pair.two, dst->sha1);
> > +	if (rename_cache_get(&pair, &score))
> > +		return score;
> > +
> 
> Random thoughts.
> 
> Even though your "rename cache" could be used to reject pairing that
> the similarity estimator would otherwise give high score, I would
> imagine that in practice, people would always use the mechanism to
> boost the similarity score of desired pairing.  This conjecture has
> a few interesting implications.

I agree that is the probable use. But low scores might also have a
purpose in preventing useless work from being done (IOW, storing a "0"
score is unlikely to happen manually, but it does let us avoid repeating
the calculation).

As you might have guessed, I am really more interested in caching and
performance gains than in manual overrides.

>  - As we track of only the top NUM_CANDIDATE_PER_DST rename src for
>    each dst (see record_if_better()), you should be able to first
>    see if pairs that have dst exist in your rename cache, and
>    iterate over the <src,dst> pairs, filling m[] with srcs that
>    appear in this particular invocation of diff.

Yes, although I don't know if it makes a big difference. We still run
estimate_similarity on the candidates to see if they make it into
record_if_better, and that is the expensive part. I don't think you want
to change that either (e.g., by assuming that cached scores will always
be in the top N; it would depend on the particular tree that the cached
scores came from).

Of course, that changes if it is not a cache at all, and simply a manual
override. Or if manual overrides are stored separately and take effect
first.

But I do not think you even want to care about NUM_CANDIDATE_PER_DST for
manual overrides. You simply want to take the highest scored one,
consider the it done, and not run estimate_similarity at all. Even if
you only had 1 (and NUM_CANDIDATE_PER_DST were 4), there is still no
point in looking at the others. The manual override should beat it,
regardless of score.

>  - If you find NUM_CANDIDATE_PER_DST srcs from your rename cache,
>    you wouldn't have to run estimate_similarity() at all, but that
>    is very unlikely.  We could however declare that user configured
>    similarity boost always wins computed ones, and skip estimation
>    for a dst for which you find an entry in the rename cache.

Right. See above.

>  - As entries in rename cache that record high scores have names of
>    "similar" blobs, pack-objects may be able to take advantage of
>    this information.

Yeah, although I suspect it is not as big a win as you might hope.  In
practice, you're going to diff a lot fewer objects than pack-objects
would consider, so most of the pack-objects candidate pairs will not
have a cache entry.  Which means that you really need to not rely on the
cache (since it is very likely that the best candidate is still to be
found), and you still do lots of computation.

You could cache the result of comparisons done by pack-objects, but that
cache ends up being large. You might remember that one of my very first
patches was a cache for recording negative delta finds (so we try to
delta A against B and find that it is not a good match, and record that
information). Even that cache was huge, and we ended up discarding it in
favor of Linus's much more sensible "if two things are in the same pack
and not delta'd, then either they are not a good match, or something
else is in better" heuristic. But one with full-on similarity scores
would be even bigger.

>  - If you declare blobs A and B are similar, it is likely that blobs
>    C, D, E, ... that are created by making a series of small tweaks
>    to B are also similar.  Would it make more sense to introduce a
>    concept of "set of similar blobs" instead of recording pairwise
>    scores for (A,B), (A,C), (A,D), ... (B,C), (B,D), ...?  If so,
>    the body of per-dst loop in diffcore_rename() may become:

Yes, this is the transitive bit I mentioned elsewhere. I think there are
dragons lurking there, as you end up with the edges of the set not
_really_ being that close to each other. You'd probably want some kind
of weighted association, like if A->B had 80% similarity, and B->C had
90% similarity, then A->C would be 72%. But even that is not accurate;
it is just a lower bound. The differences between A->B and B->C might
overlap, and A->C might be much more similar.

You can't know the true value without actually finding the score for
A->C.  And since we output the scores, they should probably be accurate.

> Regarding your rename-and-tweak-exif photo sets, is the issue that
> there are too many rename src/dst candidates and filling a large
> matrix takes a lot of time, or tweaking exif makes the contents
> unnecessarily dissimilar and causes the similarity detection to
> fail?

The former. We do find the renames; it is just that there are a lot of
them (and worse, they are large blobs, so estimate_similarity is slow to
run). I have to use "-l0" just to get the rename to run at all.

> As we still have the pathname in this codepath, I am wondering if we
> would benefit from custom "content hash" that knows the nature of
> payload than the built-in similarity estimator, driven by the
> attribute mechanism (if the latter is the case, that is).

Hmm. Interesting. But I don't think that attributes are a good fit here.
They are pathname based, so how do I apply anything related to
similarity of a particular version by pathname? IOW, how does it apply
in one tree but not another?

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01 21:54           ` Jeff King
@ 2012-08-01 22:10             ` Junio C Hamano
  2012-08-02 22:37               ` Jeff King
  2012-08-02  5:33             ` Junio C Hamano
  1 sibling, 1 reply; 35+ messages in thread
From: Junio C Hamano @ 2012-08-01 22:10 UTC (permalink / raw
  To: Jeff King; +Cc: Nguyen Thai Ngoc Duy, git

Jeff King <peff@peff.net> writes:

> On Tue, Jul 31, 2012 at 11:01:27PM -0700, Junio C Hamano wrote:
> ...
>> As we still have the pathname in this codepath, I am wondering if we
>> would benefit from custom "content hash" that knows the nature of
>> payload than the built-in similarity estimator, driven by the
>> attribute mechanism (if the latter is the case, that is).
>
> Hmm. Interesting. But I don't think that attributes are a good fit here.
> They are pathname based, so how do I apply anything related to
> similarity of a particular version by pathname? IOW, how does it apply
> in one tree but not another?

When you move porn/0001.jpg in the preimage to naughty/00001.jpg in
the postimage, they both can hit "*.jpg contentid=jpeg" line in the
top-level .gitattribute file, and the contentid driver for jpeg type
may strip exif and hash the remainder bits in the image to come up
with a token you can use in a similar way as object ID is used in
the exact rename detection phase.

Just thinking aloud.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01 21:54           ` Jeff King
  2012-08-01 22:10             ` Junio C Hamano
@ 2012-08-02  5:33             ` Junio C Hamano
  1 sibling, 0 replies; 35+ messages in thread
From: Junio C Hamano @ 2012-08-02  5:33 UTC (permalink / raw
  To: Jeff King; +Cc: Nguyen Thai Ngoc Duy, git

Jeff King <peff@peff.net> writes:

> On Tue, Jul 31, 2012 at 11:01:27PM -0700, Junio C Hamano wrote:
>
>>  - As entries in rename cache that record high scores have names of
>>    "similar" blobs, pack-objects may be able to take advantage of
>>    this information.
>
> Yeah, although I suspect it is not as big a win as you might hope.  In
> practice, you're going to diff a lot fewer objects than pack-objects
> would consider, so most of the pack-objects candidate pairs will not
> have a cache entry.  Which means that you really need to not rely on the
> cache (since it is very likely that the best candidate is still to be
> found), and you still do lots of computation.
>
> You could cache the result of comparisons done by pack-objects, but that
> cache ends up being large. You might remember that one of my very first
> patches was a cache for recording negative delta finds (so we try to
> delta A against B and find that it is not a good match, and record that
> information). Even that cache was huge, and we ended up discarding it in
> favor of Linus's much more sensible "if two things are in the same pack
> and not delta'd, then either they are not a good match, or something
> else is in better" heuristic. But one with full-on similarity scores
> would be even bigger.

Yeah, I forgot about your negative cache.  Also the older companion
heuristics "if a delta and its base are in an existing pack, use
that delta" works well enough to make a positive cache unnecessary
(and the "rename similarity cache" can only be a subset of such a
cache).

C.f. http://thread.gmane.org/gmane.comp.version-control.git/16223/focus=16267

>>  - If you declare blobs A and B are similar, it is likely that blobs
>>    C, D, E, ... that are created by making a series of small tweaks
>>    to B are also similar.  Would it make more sense to introduce a
>>    concept of "set of similar blobs" instead of recording pairwise
>>    scores for (A,B), (A,C), (A,D), ... (B,C), (B,D), ...?  If so,
>>    the body of per-dst loop in diffcore_rename() may become:
>
> Yes, this is the transitive bit I mentioned elsewhere. I think there are
> dragons lurking there, as you end up with the edges of the set not
> _really_ being that close to each other. You'd probably want some kind
> of weighted association, like if A->B had 80% similarity, and B->C had
> 90% similarity, then A->C would be 72%.

I recall we discussed the transitivity when we were designing the
basic rename thing, and rejected it after we realized that it was
unworkable.  You may have a blob A, perform a large-ish edit to
produce B, and A and B may be similar by only 60%.  You then may
start from the same A, perform another large-ish edit to produce C,
and A and C may be similar by only 55%.  Depending on the nature of
these two large-ish changes, B and C may be totally dissimilar, or
if the changes are more or less in the same "direction", they may be
95% similar.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01 21:27           ` Jeff King
@ 2012-08-02 12:08             ` Nguyen Thai Ngoc Duy
  2012-08-02 22:41               ` Jeff King
  0 siblings, 1 reply; 35+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-08-02 12:08 UTC (permalink / raw
  To: Jeff King; +Cc: Junio C Hamano, git

On Thu, Aug 2, 2012 at 4:27 AM, Jeff King <peff@peff.net> wrote:
> Yes, if you go with a commit-based approach, you can do either notes or
> in-commit messages. In other words, I would break the solutions down as:
>
>   1. Store sha1+sha1 -> score mapping (i.e., what I suggested). This is
>      fundamentally a global store, not a per-commit store. For storage,
>      you can do one (or a combination) of:
>
>      a. Store the mapping in some local file. Fast, but can't be shared.
>
>      b. Store the mapping in a note (probably indexed by the destination
>         blob sha1). Less efficient, but easy to share.
>
> I implemented (1a). Implementing (1b) would be easy, but for a full-on
> cache (especially for "-C"), I think the resulting size might be
> prohibitive.

(1a) is good regardless rename overrides. Why don't you polish and
submit it? We can set some criteria to limit the cache size while
keeping computation reasonably low. Caching rename scores for file
pairs that has file size larger than a limit is one. Rename matrix
size could also be a candidate. We could even cache just rename scores
for recent commits (i.e. close to heads) only with the assumption that
people diff/apply recent commits more often.


> All solutions under (2) suffer from the same problem: they are accurate
> only for a single diff. For other diffs, you would either have to not
> use the feature, or you would be stuck traversing the history and
> assigning a temporary file identity (e.g., given commits A->B->C, and in
> A->B we rename "foo" to "bar", the diff between A and C could discover
> that A's "foo" corresponds to C's "bar").

Yeah. If we go with manual overrides, I expect users to deal with
these manually too. IOW they'll need to create a mapping for A->C
themselves. We can help detect that there are manual overrides in some
cases, like merge, and let users know that manual overrides are
ignored. For merge, I think we can just check for all commits while
traversing looking for bases.

> For this reason, I'm not sure that stored overrides like this are
> generally useful in the long run. I think storage is useful for
> _caching_ the results, because it doesn't have to be perfect; it just
> helps with some repetitive queries. Whereas for overriding, I think it
> is much more interesting to override _particular_ diff. E.g., to say "I
> am merging X and Y, and please pretend that Y renamed "foo" to "bar"
> when you do rename detection.
>
> And in that sense, your "git log" example can be considered a
> special-case of this: you are saying that the diff from $commit to
> $commit^ is done frequently, so rather than saying "please pretend..."
> each time, you would like to store the information forever. And storing
> it in the commit message or a note is one way of doing that.

Yep, specifying rename overrides between two trees is probably better.

> I don't think there's anything fundamentally _wrong_ with that, but I
> kind of question its usefulness. In other words, what is the point in
> doing so? If it is inform the user that semantically the commit did a
> rename, even though the content changed enough that rename detection
> does not find it, then I would argue that you should simply state it in
> the commit message (or in a human-readable git-note, if it was only
> realized after the fact).
>
> But there is not much point in making it machine-readable, since the
> interesting machine-readable things we do with renames are:
>
>   1. Show the diff against the rename src, which can often be easier to
>      read. Except that if rename detection did not find it, it is
>      probably _not_ going to be easier to read.

Probably. Still it helps "git log --follow" to follow the correct
track in the 1% case that rename detection does go wrong.

>
>   2. Applying content to the destination of a merge. But you're almost
>      never doing the diff between a commit and its parent, so the
>      information would be useless.

Having a way to interfere rename detection, even manually, could be
good in this case if it reduces conflicts. We could feed rename
overrides using command line.
-- 
Duy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-01 22:10             ` Junio C Hamano
@ 2012-08-02 22:37               ` Jeff King
  2012-08-02 22:51                 ` Junio C Hamano
  0 siblings, 1 reply; 35+ messages in thread
From: Jeff King @ 2012-08-02 22:37 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Nguyen Thai Ngoc Duy, git

On Wed, Aug 01, 2012 at 03:10:55PM -0700, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > On Tue, Jul 31, 2012 at 11:01:27PM -0700, Junio C Hamano wrote:
> > ...
> >> As we still have the pathname in this codepath, I am wondering if we
> >> would benefit from custom "content hash" that knows the nature of
> >> payload than the built-in similarity estimator, driven by the
> >> attribute mechanism (if the latter is the case, that is).
> >
> > Hmm. Interesting. But I don't think that attributes are a good fit here.
> > They are pathname based, so how do I apply anything related to
> > similarity of a particular version by pathname? IOW, how does it apply
> > in one tree but not another?
> 
> When you move porn/0001.jpg in the preimage to naughty/00001.jpg in
> the postimage, they both can hit "*.jpg contentid=jpeg" line in the
> top-level .gitattribute file, and the contentid driver for jpeg type
> may strip exif and hash the remainder bits in the image to come up
> with a token you can use in a similar way as object ID is used in
> the exact rename detection phase.
> 
> Just thinking aloud.

Ah, I see. That still feels like way too specific a use case to me. A
much more general use case to me would be a contentid driver which
splits the file into multiple chunks (which can be concatenated to
arrive at the original content), and marks chunks as "OK to delta" or
"not able to delta".  In other words, a content-specific version of the
bup-style splitting that people have proposed.

Assuming we split a jpeg into its EXIF bits (+delta) and its image bits
(-delta), then you could do a fast rename or pack-objects comparison
between two such files (in fact, with chunked object storage,
pack-objects can avoid looking at the image parts at all).

However, it may be the case that such "smart" splitting is not
necessary, as stupid and generic bup-style splitting may be enough. I
really need to start playing with the patches you wrote last year that
started in that direction.

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-02 12:08             ` Nguyen Thai Ngoc Duy
@ 2012-08-02 22:41               ` Jeff King
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
  0 siblings, 1 reply; 35+ messages in thread
From: Jeff King @ 2012-08-02 22:41 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

On Thu, Aug 02, 2012 at 07:08:25PM +0700, Nguyen Thai Ngoc Duy wrote:

> > I implemented (1a). Implementing (1b) would be easy, but for a full-on
> > cache (especially for "-C"), I think the resulting size might be
> > prohibitive.
> 
> (1a) is good regardless rename overrides. Why don't you polish and
> submit it? We can set some criteria to limit the cache size while
> keeping computation reasonably low. Caching rename scores for file
> pairs that has file size larger than a limit is one. Rename matrix
> size could also be a candidate. We could even cache just rename scores
> for recent commits (i.e. close to heads) only with the assumption that
> people diff/apply recent commits more often.

I'll polish and share it. I'm still not 100% sure it's a good idea,
because introducing an on-disk cache means we need to _manage_ that
cache. How big will it be? Who will prune it when it gets too big? By
what criteria? And so on.

But if it's all hidden behind a config option, then it won't hurt people
who don't use it. And people who do use it can gather data on how the
caches grow.

> > All solutions under (2) suffer from the same problem: they are accurate
> > only for a single diff. For other diffs, you would either have to not
> > use the feature, or you would be stuck traversing the history and
> > assigning a temporary file identity (e.g., given commits A->B->C, and in
> > A->B we rename "foo" to "bar", the diff between A and C could discover
> > that A's "foo" corresponds to C's "bar").
> 
> Yeah. If we go with manual overrides, I expect users to deal with
> these manually too. IOW they'll need to create a mapping for A->C
> themselves. We can help detect that there are manual overrides in some
> cases, like merge, and let users know that manual overrides are
> ignored. For merge, I think we can just check for all commits while
> traversing looking for bases.

Yeah, merges are a special case, in that we know the diff we perform
will always have a direct-ancestor relationship (since it is always
between a tip and the merge base).

> > But there is not much point in making it machine-readable, since the
> > interesting machine-readable things we do with renames are:
> >
> >   1. Show the diff against the rename src, which can often be easier to
> >      read. Except that if rename detection did not find it, it is
> >      probably _not_ going to be easier to read.
> 
> Probably. Still it helps "git log --follow" to follow the correct
> track in the 1% case that rename detection does go wrong.

Thanks. I didn't think of --follow, but that is a good counterpoint to
my argument.

> >   2. Applying content to the destination of a merge. But you're almost
> >      never doing the diff between a commit and its parent, so the
> >      information would be useless.
> 
> Having a way to interfere rename detection, even manually, could be
> good in this case if it reduces conflicts. We could feed rename
> overrides using command line.

Yeah. I think I'd start with letting you feed pairs to diff_options,
give it a command-line option to see how useful it is, and then later on
consider a mechanism for extracting those pairs automatically from
commits or notes.

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-02 22:37               ` Jeff King
@ 2012-08-02 22:51                 ` Junio C Hamano
  2012-08-02 22:58                   ` Jeff King
  0 siblings, 1 reply; 35+ messages in thread
From: Junio C Hamano @ 2012-08-02 22:51 UTC (permalink / raw
  To: Jeff King; +Cc: Nguyen Thai Ngoc Duy, git

Jeff King <peff@peff.net> writes:

> On Wed, Aug 01, 2012 at 03:10:55PM -0700, Junio C Hamano wrote:
> ...
>> When you move porn/0001.jpg in the preimage to naughty/00001.jpg in
>> the postimage, they both can hit "*.jpg contentid=jpeg" line in the
>> top-level .gitattribute file, and the contentid driver for jpeg type
>> may strip exif and hash the remainder bits in the image to come up
>> with a token you can use in a similar way as object ID is used in
>> the exact rename detection phase.
>> 
>> Just thinking aloud.
>
> Ah, I see. That still feels like way too specific a use case to me. A
> much more general use case to me would be a contentid driver which
> splits the file into multiple chunks (which can be concatenated to
> arrive at the original content), and marks chunks as "OK to delta" or
> "not able to delta".  In other words, a content-specific version of the
> bup-style splitting that people have proposed.
>
> Assuming we split a jpeg into its EXIF bits (+delta) and its image bits
> (-delta), then you could do a fast rename or pack-objects comparison
> between two such files (in fact, with chunked object storage,
> pack-objects can avoid looking at the image parts at all).
>
> However, it may be the case that such "smart" splitting is not
> necessary, as stupid and generic bup-style splitting may be enough. I
> really need to start playing with the patches you wrote last year that
> started in that direction.

I wasn't interested in "packing split object representation",
actually.  The idea was still within the context of "rename".

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP PATCH] Manual rename correction
  2012-08-02 22:51                 ` Junio C Hamano
@ 2012-08-02 22:58                   ` Jeff King
  0 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-02 22:58 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Nguyen Thai Ngoc Duy, git

On Thu, Aug 02, 2012 at 03:51:17PM -0700, Junio C Hamano wrote:

> > On Wed, Aug 01, 2012 at 03:10:55PM -0700, Junio C Hamano wrote:
> > ...
> >> When you move porn/0001.jpg in the preimage to naughty/00001.jpg in
> >> the postimage, they both can hit "*.jpg contentid=jpeg" line in the
> >> top-level .gitattribute file, and the contentid driver for jpeg type
> >> may strip exif and hash the remainder bits in the image to come up
> >> with a token you can use in a similar way as object ID is used in
> >> the exact rename detection phase.
> >> 
> >> Just thinking aloud.
> >
> > Ah, I see. That still feels like way too specific a use case to me. A
> > much more general use case to me would be a contentid driver which
> > splits the file into multiple chunks (which can be concatenated to
> > arrive at the original content), and marks chunks as "OK to delta" or
> > "not able to delta".  In other words, a content-specific version of the
> > bup-style splitting that people have proposed.
> >
> > Assuming we split a jpeg into its EXIF bits (+delta) and its image bits
> > (-delta), then you could do a fast rename or pack-objects comparison
> > between two such files (in fact, with chunked object storage,
> > pack-objects can avoid looking at the image parts at all).
> >
> > However, it may be the case that such "smart" splitting is not
> > necessary, as stupid and generic bup-style splitting may be enough. I
> > really need to start playing with the patches you wrote last year that
> > started in that direction.
> 
> I wasn't interested in "packing split object representation",
> actually.  The idea was still within the context of "rename".

But it would work for rename, too. If you want to compare two files, the
driver would give you back { sha1_exif (+delta), sha1_image (-delta) }
for each file. You know the size of each chunk and the size of the total
file.

Then you would just compare sha1_image for each entry. If they match,
then you have a lower bound on similarity of image_chunk_size /
total_size. If they don't, then you have an upper bound of similarity of
1-(image_chunk_size/total_size). In the former case, you can get the
exact similarity by doing a real delta on the sha1_exif content. In the
latter case, you can either exit early (if you are already below the
similarity threshold, which is likely), or possibly do the delta on the
sha1_exif content to get an exact value.

But either way, you never had to do a direct comparison between the big
image data; you only needed to know the sha1s. And as a bonus, if you
did want to cache results, you can have an O(# of blobs) cache of the
chunked sha1s of the chunked form (because the information is immutable
for a given sha1 and content driver). Whereas by caching the result of
estimate_similarity, our worst-case cache is the square of that (because
we are storing sha1 pairs).

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 0/8] caching rename results
  2012-08-02 22:41               ` Jeff King
@ 2012-08-04 17:09                 ` Jeff King
  2012-08-04 17:10                   ` [PATCH 1/8] implement generic key/value map Jeff King
                                     ` (7 more replies)
  0 siblings, 8 replies; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:09 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

On Thu, Aug 02, 2012 at 06:41:55PM -0400, Jeff King wrote:

> > (1a) is good regardless rename overrides. Why don't you polish and
> > submit it? We can set some criteria to limit the cache size while
> > keeping computation reasonably low. Caching rename scores for file
> > pairs that has file size larger than a limit is one. Rename matrix
> > size could also be a candidate. We could even cache just rename scores
> > for recent commits (i.e. close to heads) only with the assumption that
> > people diff/apply recent commits more often.
> 
> I'll polish and share it. I'm still not 100% sure it's a good idea,
> because introducing an on-disk cache means we need to _manage_ that
> cache. How big will it be? Who will prune it when it gets too big? By
> what criteria? And so on.
> 
> But if it's all hidden behind a config option, then it won't hurt people
> who don't use it. And people who do use it can gather data on how the
> caches grow.

Here it is, all polished up. I'm still a little lukewarm on it for two
reasons:

  1. The whole idea. For the reasons above, I'm a little iffy on doing
     this cache at all. It does yield speedups, but only in some
     specific cases. So it's hidden behind a diff.renamecaches option
     and off by default.

  2. The implementation is a little...gross. Long ago, I had written a
     type-generic map class for git using void pointers. It ended up
     complex and had problems with unaligned accesses. So I rewrote it
     using preprocessor macro expansion (e.g., you'd call
     IMPLEMENT_MAP(foo, const char *, int) or similar). But that wasn't
     quite powerful enough, as I really want conditional compilation
     inside the macro expansion, but you can't #ifdef.

     So I really wanted some kind of code generation that could do
     conditionals. Which you can do with the C preprocessor, but rather
     than expanding macros, you have to #include templates that expand
     based on parameters you've set. Which is kind of ugly and
     non-intuitive, but it does work. Look at patch 1 to see what I
     mean.

     Also, this sort of pre-processor hackery to create type-generic
     data structures is the first step on the road that eventually led
     to C++ being developed. And that scares me a little.

So yeah. Here it is. I'm not sure yet if it's a good idea or not.

  [1/8]: implement generic key/value map

Infrastructure.

  [2/8]: map: add helper functions for objects as keys
  [3/8]: fast-export: use object to uint32 map instead of "decorate"
  [4/8]: decorate: use "map" for the underlying implementation

These ones are optional for this series, but since we are introducing
the infrastructure anyway (which is really just a generalized form of
what "decorate" does), it offsets the code bloat.

  [5/8]: map: implement persistent maps
  [6/8]: implement metadata cache subsystem

More infrastructure.

  [7/8]: implement rename cache
  [8/8]: diff: optionally use rename cache

And these are the actual rename cache.

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/8] implement generic key/value map
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
@ 2012-08-04 17:10                   ` Jeff King
  2012-08-04 22:58                     ` Junio C Hamano
  2012-08-04 17:10                   ` [PATCH 2/8] map: add helper functions for objects as keys Jeff King
                                     ` (6 subsequent siblings)
  7 siblings, 1 reply; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:10 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

It is frequently useful to have a fast, generic data
structure mapping keys to values. We already have something
like this in the "decorate" API, but it has two downsides:

  1. The key type must always be a "struct object *".

  2. The value type is a void pointer, which means it is
     inefficient and cumbersome for storing small values.
     One must either encode their value inside the void
     pointer, or allocate additional storage for the pointer
     to point to.

This patch introduces a generic map data structure, mapping
keys of arbitrary type to values of arbitrary type.

One possible strategy for implementation is to have a struct
that points to a sequence of bytes for each of the key and
the value, and to try to treat them as opaque in the code.
However, this code gets complex, has a lot of casts, and
runs afoul of violating alignment and strict aliasing rules.

This patch takes a different approach. We parameterize the
types in each map and putting the declarations and
implementations inside macros. This lets the compiler see
the actual code, with its real types, and figure out things
like struct packing and alignment itself.

Signed-off-by: Jeff King <peff@peff.net>
---
This is the one with the pre-processor grossness. Don't be _too_ scared
off by the ugliness of test-map.c; it's also trying to be polymorphic
with respect to different maps, which makes it even uglier. See patches
2 and 3 for a more reasonable application.

 .gitignore                          |   1 +
 Documentation/technical/api-map.txt | 214 ++++++++++++++++++++++++++++++++++++
 Makefile                            |   5 +
 map-decl.h                          |  22 ++++
 map-done.h                          |  19 ++++
 map-impl.h                          |  94 ++++++++++++++++
 map-init.h                          |  24 ++++
 t/t0007-map.sh                      |  50 +++++++++
 test-map.c                          | 182 ++++++++++++++++++++++++++++++
 9 files changed, 611 insertions(+)
 create mode 100644 Documentation/technical/api-map.txt
 create mode 100644 map-decl.h
 create mode 100644 map-done.h
 create mode 100644 map-impl.h
 create mode 100644 map-init.h
 create mode 100755 t/t0007-map.sh
 create mode 100644 test-map.c

diff --git a/.gitignore b/.gitignore
index bb5c91e..c2bad89 100644
--- a/.gitignore
+++ b/.gitignore
@@ -184,6 +184,7 @@
 /test-genrandom
 /test-index-version
 /test-line-buffer
+/test-map
 /test-match-trees
 /test-mergesort
 /test-mktemp
diff --git a/Documentation/technical/api-map.txt b/Documentation/technical/api-map.txt
new file mode 100644
index 0000000..f86b785
--- /dev/null
+++ b/Documentation/technical/api-map.txt
@@ -0,0 +1,214 @@
+map API
+=======
+
+The map API is a system for efficiently mapping keys to values in memory. Items
+are stored in a hash table for fast lookup; storage efficiency is achieved
+through macro-based code generation, which lets the compiler store values
+compactly in memory.
+
+Due to the code generation, there are two different facets of this API: macros
+to build new types of mappings (i.e., generate new function and struct
+definitions), and generated functions to store and retrieve values from a
+particular mapping.
+
+
+Related APIs
+------------
+
+The hash API provides a similar key/value store. However, it does not deal with
+hash collisions itself, leaving the caller to handle bucket management (but
+this is a feature if you are interested in using the collisions as part of an
+algorithm).  Furthermore, it can store only void pointers, making storage of
+small values inefficient and cumbersome.
+
+The decorate API provides a similar interface to map, but is restricted to
+using "struct object" as the key, and a void pointer as the value.
+
+
+Defining New Map Types
+----------------------
+
+A map type is uniquely defined by the pair of its key and value types. To
+define a new type, you must set up some preprocessor defines to specify
+the key and values types, along with any special options for the
+implementation. Then to instantiate the declaration of a map (i.e., the
+bits that would go in a header file), include "map-decl.h". To
+instantiate the implementation, include "map-impl.h". To clean up your
+preprocessor options, include "map-done.h".
+
+The following map defines are available:
+
+`NAME`::
+
+	Required. The name of the map. This should syntactically be a C
+	identifier (alphanumeric and underscore), and should describe
+	the types involved in the map (e.g., `object_uint32` to map
+	objects to 32-bit integers).
+
+`KEY_TYPE`::
+
+	Required. The C type of the key, as it will be stored in the
+	hash (e.g., `struct object *` to store an object pointer).
+
+`PASS_KEY_BY_REF`::
+
+	Optional. If defined, indicates that keys are a complex type
+	that should be passed between functions using pointers.
+	Otherwise, keys are passed by value.
+
+`HASH`::
+
+	Required. A function that will convert an object of type
+	`KEY_TYPE` into an integer hash value.
+
+`KEY_EQUAL`::
+
+	Required. A function that will compare two keys, and return
+	non-zero if and only if they are equal.
+
+`VALUE_TYPE`::
+
+	Required. The C type of the value, as it will be stored in the
+	hash (e.g., `uint32_t` to store a 32-bit integer).
+
+`SENTINEL_NULL`::
+
+	Optional. If defined, indicates that keys can store an all-zero
+	sentinel value (e.g., if the key is a pointer). This enables an
+	optimization to shrink the size of each map entry, at the cost
+	of not being able to store `NULL` key pointers in the map.
+
+
+Data Structures
+---------------
+
+Each defined map type will have its own structure (e.g., `map_object_uint32`).
+
+`struct map_NAME`::
+
+	A single map object. This struct should be initialized to all-zeroes.
+	The `nr` field specifies the number of items stored in the map. The
+	`size` field specifies the number of hash buckets allocated. The `hash`
+	field stores the actual data. Callers should never need to look at
+	these fields unless they are enumerating all elements of the map (see
+	the example below).
+
+`struct map_entry_NAME`::
+
+	A single key/value entry in the hash, which may or may not
+	contain valid data. If `SENTINEL_NULL` is defined, then an empty
+	entry will have a NULL key; otherwise, there is a `used` field
+	which will be zero in an empty entry (in which case the contents
+	of the `key` field are undefined). If the entry is empty, the
+	contents of the `value` field is undefined.  You should never
+	need to use this type directly, unless you are enumerating all
+	elements of a map.
+
+
+Functions
+---------
+
+Each defined map type will have its own set of access functions (e.g.,
+`map_get_object_uint32`).
+
+`map_get_NAME(struct map_NAME *, KEY_TYPE key, VALUE_TYPE *value)`::
+
+	Retrieve the value corresponding to `key`, returning it via the
+	pointer `value`. Returns 1 if an item was found, zero otherwise
+	(in which case `value` is unchanged). If `PASS_KEY_BY_REF` is
+	defined, the key is passed in as a `const KEY_TYPE *`.
+
+`map_set_NAME(struct map_NAME *, KEY_TYPE key, VALUE_TYPE value, VALUE_TYPE *old)`::
+
+	Insert a mapping from `key` to `value`. If a mapping for `key`
+	already existed, the previous value is copied into `old` (if it
+	is non-NULL) and the function returns 1. Otherwise, the function
+	returns 0.  If `PASS_KEY_BY_REF` or `PASS_VALUE_BY_REF` is
+	defined, the key and value are passed in as `const KEY_TYPE *`
+	and `const VALUE_TYPE *`, respectively.
+
+
+Examples
+--------
+
+Declare a new mapping type of strings to integers:
+
+-------------------------------------------------------------------
+/* in map-string-int.h */
+#define NAME string_int
+#define KEY_TYPE const char *
+#define VALUE_TYPE int
+#include "map-decl.h"
+#include "map-done.h"
+-------------------------------------------------------------------
+
+Implement the mapping:
+
+-------------------------------------------------------------------
+/* in map-string-int.c */
+
+static unsigned int hash_string(const char * const *strp, unsigned int n)
+{
+	unsigned long hash = 0;
+	const char *p;
+
+	for (p = *strp; *p; p++)
+		hash = (hash << 5) + *p;
+	return hash % n;
+}
+
+static unsigned int string_equal(const char * const *a, const char * const *b)
+{
+	return !strcmp(*a, *b);
+}
+
+#define NAME string_int
+#define KEY_TYPE const char *
+#define VALUE_TYPE int
+#include "map-impl.h"
+#include "map-done.h"
+-------------------------------------------------------------------
+
+Store and retrieve integers by string (note that the map will not
+duplicate the strings; the type is defined to merely store the
+pointer values).
+
+-------------------------------------------------------------------
+#include "map-string-int.h"
+
+static struct map_string_int foos;
+
+void store_foo(const char *s, int foo)
+{
+	int old;
+	if (map_set_object_int(&foos, xstrdup(s), foo, &old))
+		printf("old value was %d\n", old);
+}
+
+void print_foo(const char *s)
+{
+	int v;
+
+	if (map_get_object_int(&foos, s, &v))
+		printf("foo: %d\n", v);
+	else
+		printf("no such foo\n");
+}
+-------------------------------------------------------------------
+
+Iterate over all map entries:
+
+-------------------------------------------------------------------
+void dump_foos(void)
+{
+	int i;
+
+	printf("there are %u foos:\n", foos.nr);
+
+	for (i = 0; i < foos.size; i++) {
+		struct map_entry_string_int *e = foos.hash + i;
+		if (e->used)
+			printf("%s -> %d\n", e->key, e->value);
+	}
+}
+-------------------------------------------------------------------
diff --git a/Makefile b/Makefile
index 4b58b91..d512f27 100644
--- a/Makefile
+++ b/Makefile
@@ -491,6 +491,7 @@ TEST_PROGRAMS_NEED_X += test-dump-cache-tree
 TEST_PROGRAMS_NEED_X += test-genrandom
 TEST_PROGRAMS_NEED_X += test-index-version
 TEST_PROGRAMS_NEED_X += test-line-buffer
+TEST_PROGRAMS_NEED_X += test-map
 TEST_PROGRAMS_NEED_X += test-match-trees
 TEST_PROGRAMS_NEED_X += test-mergesort
 TEST_PROGRAMS_NEED_X += test-mktemp
@@ -628,6 +629,10 @@ LIB_H += list-objects.h
 LIB_H += ll-merge.h
 LIB_H += log-tree.h
 LIB_H += mailmap.h
+LIB_H += map-decl.h
+LIB_H += map-done.h
+LIB_H += map-impl.h
+LIB_H += map-init.h
 LIB_H += merge-file.h
 LIB_H += merge-recursive.h
 LIB_H += mergesort.h
diff --git a/map-decl.h b/map-decl.h
new file mode 100644
index 0000000..7d4daaf
--- /dev/null
+++ b/map-decl.h
@@ -0,0 +1,22 @@
+#include "map-init.h"
+
+struct THIS(map_entry) {
+	KEY_TYPE key;
+	VALUE_TYPE value;
+#ifndef SENTINEL_NULL
+	unsigned used:1;
+#endif
+};
+
+struct THIS(map) {
+	unsigned int size, nr;
+	struct THIS(map_entry) *hash;
+};
+
+extern int THIS(map_get)(struct THIS(map) *,
+			 KEY_ARG key,
+			 VALUE_TYPE *value);
+extern int THIS(map_set)(struct THIS(map) *,
+			 KEY_ARG key,
+			 VALUE_ARG value,
+			 VALUE_TYPE *old);
diff --git a/map-done.h b/map-done.h
new file mode 100644
index 0000000..65983d3
--- /dev/null
+++ b/map-done.h
@@ -0,0 +1,19 @@
+#undef NAME
+#undef KEY_TYPE
+#undef KEY_ARG
+#undef PASS_KEY_BY_REF
+#undef key_ref
+#undef key_deref
+#undef VALUE_TYPE
+#undef VALUE_ARG
+#undef PASS_VALUE_BY_REF
+#undef value_ref
+#undef value_deref
+#undef HASH
+#undef KEY_EQUAL
+#undef SENTINEL_NULL
+#undef hash_used
+#undef PASTE2
+#undef PASTE
+#undef THIS
+#undef MAP_UTIL_INITIALIZED
diff --git a/map-impl.h b/map-impl.h
new file mode 100644
index 0000000..844b7f1
--- /dev/null
+++ b/map-impl.h
@@ -0,0 +1,94 @@
+#include "map-init.h"
+
+#ifdef PASS_KEY_BY_REF
+#define key_ref(x) (&(x))
+#define key_deref(x) (*(x))
+#else
+#define key_ref(x) (x)
+#define key_deref(x) (x)
+#endif
+
+#ifdef PASS_VALUE_BY_REF
+#define value_ref(x) (&(x))
+#define value_deref(x) (*(x))
+#else
+#define value_ref(x) (x)
+#define value_deref(x) (x)
+#endif
+
+#ifdef SENTINEL_NULL
+#define hash_used(x) (x)->key
+#else
+#define hash_used(x) (x)->used
+#endif
+
+static int THIS(map_insert)(struct THIS(map) *m,
+			    KEY_ARG key,
+			    VALUE_ARG value,
+			    VALUE_TYPE *old)
+{
+	unsigned int j;
+
+	for (j = HASH(key, m->size); hash_used(&m->hash[j]); j = (j+1) % m->size) {
+		if (KEY_EQUAL(key_ref(m->hash[j].key), key)) {
+			if (old)
+				*old = m->hash[j].value;
+			m->hash[j].value = value_deref(value);
+			return 1;
+		}
+	}
+
+	m->hash[j].key = key_deref(key);
+	m->hash[j].value = value_deref(value);
+#ifndef SENTINEL_NULL
+	m->hash[j].used = 1;
+#endif
+	m->nr++;
+	return 0;
+}
+
+static void THIS(map_grow)(struct THIS(map) *m)
+{
+	struct THIS(map_entry) *old_hash = m->hash;
+	unsigned int old_size = m->size;
+	unsigned int i;
+
+	m->size = (old_size + 1000) * 3 / 2;
+	m->hash = xcalloc(m->size, sizeof(*m->hash));
+	m->nr = 0;
+
+	for (i = 0; i < old_size; i++) {
+		if (!hash_used(&old_hash[i]))
+			continue;
+		THIS(map_insert)(m, key_ref(old_hash[i].key), value_ref(old_hash[i].value), NULL);
+	}
+	free(old_hash);
+}
+
+int THIS(map_set)(struct THIS(map) *m,
+		   KEY_ARG key,
+		   VALUE_ARG value,
+		   VALUE_TYPE *old)
+{
+	if (m->nr >= m->size * 2 / 3)
+		THIS(map_grow)(m);
+	return THIS(map_insert)(m, key, value, old);
+}
+
+int THIS(map_get)(struct THIS(map) *m,
+		  KEY_ARG key,
+		  VALUE_TYPE *value)
+{
+	unsigned int j;
+
+	if (!m->size)
+		return 0;
+
+	for (j = HASH(key, m->size); hash_used(&m->hash[j]); j = (j+1) % m->size) {
+		if (KEY_EQUAL(key_ref(m->hash[j].key), key)) {
+			*value = m->hash[j].value;
+			return 1;
+		}
+	}
+	return 0;
+}
diff --git a/map-init.h b/map-init.h
new file mode 100644
index 0000000..1d2c4b3
--- /dev/null
+++ b/map-init.h
@@ -0,0 +1,24 @@
+#ifndef MAP_UTIL_INITIALIZED
+#define MAP_UTIL_INITIALIZED
+
+/*
+ * C preprocessor recursion hackery. Macros are not expanded next
+ * to ## tokens, so we have to add an extra level of indirection.
+ */
+#define PASTE2(x,y) x ## _ ## y
+#define PASTE(x,y) PASTE2(x,y)
+#define THIS(x) PASTE(x, NAME)
+
+#ifdef PASS_KEY_BY_REF
+#define KEY_ARG const KEY_TYPE *
+#else
+#define KEY_ARG KEY_TYPE
+#endif
+
+#ifdef PASS_VALUE_BY_REF
+#define VALUE_ARG const VALUE_TYPE *
+#else
+#define VALUE_ARG VALUE_TYPE
+#endif
+
+#endif
diff --git a/t/t0007-map.sh b/t/t0007-map.sh
new file mode 100755
index 0000000..b5845f4
--- /dev/null
+++ b/t/t0007-map.sh
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+test_description='basic tests for the map implementation'
+. ./test-lib.sh
+
+test_expect_success 'setup input' '
+	cat >input <<-\EOF
+	f 6
+	b 2
+	a 1
+	e 5
+	i 9
+	g 7
+	d 4
+	h 8
+	c 3
+	EOF
+'
+
+for type in pointer struct; do
+	test_expect_success "look up elements ($type)" "
+		cat >expect <<-\EOF &&
+		a: 1
+		i: 9
+		d: 4
+		EOF
+		test-map $type find a i d <input >actual &&
+		test_cmp expect actual
+	"
+
+	test_expect_success "iterate over elements ($type)" "
+		cat >expect <<-\EOF &&
+		a: 1
+		b: 2
+		c: 3
+		d: 4
+		e: 5
+		f: 6
+		g: 7
+		h: 8
+		i: 9
+		EOF
+		test-map $type print <input >actual &&
+		# iteration order is hash-dependent, so we must sort
+		sort <actual >actual.sorted &&
+		test_cmp expect actual.sorted
+	"
+done
+
+test_done
diff --git a/test-map.c b/test-map.c
new file mode 100644
index 0000000..b5f74a1
--- /dev/null
+++ b/test-map.c
@@ -0,0 +1,182 @@
+#include "git-compat-util.h"
+
+static const char usage_msg[] =
+"test-map <pointer|struct> <find|print> [args]";
+
+static inline unsigned int hash_string(const char *str, unsigned int n)
+{
+	unsigned long hash = 0;
+
+	for (; *str; str++)
+		hash = (hash << 5) + *str;
+	return hash % n;
+}
+
+static inline unsigned int string_equal(const char *a, const char *b)
+{
+	return !strcmp(a, b);
+}
+
+#define NAME string_int
+#define KEY_TYPE const char *
+#define VALUE_TYPE int
+#define HASH hash_string
+#define KEY_EQUAL string_equal
+#define SENTINEL_NULL
+#include "map-decl.h"
+#include "map-impl.h"
+#include "map-done.h"
+
+struct foo {
+	char buf[10];
+};
+
+static inline unsigned int hash_foo(const struct foo *f, unsigned int n)
+{
+	return hash_string(f->buf, n);
+}
+
+static inline unsigned int foo_equal(const struct foo *a, const struct foo *b)
+{
+	return string_equal(a->buf, b->buf);
+}
+
+#define NAME foo_foo
+#define KEY_TYPE struct foo
+#define VALUE_TYPE struct foo
+#define HASH hash_foo
+#define KEY_EQUAL foo_equal
+#define PASS_KEY_BY_REF
+#define PASS_VALUE_BY_REF
+#include "map-decl.h"
+#include "map-impl.h"
+#include "map-done.h"
+
+static void read_map(void *vm,
+		     void (*store)(void *, const char *, const char *))
+{
+	char buf[1024];
+
+	while (fgets(buf, sizeof(buf), stdin)) {
+		char *delim;
+
+		if (strlen(buf) > 0 && buf[strlen(buf)-1] == '\n')
+			buf[strlen(buf)-1] = '\0';
+
+		delim = strchr(buf, ' ');
+		if (!delim)
+			die("invalid input: %s", buf);
+		*delim++ = '\0';
+
+		store(vm, buf, delim);
+	}
+}
+
+static void store_string_int(void *vm, const char *k, const char *v)
+{
+	struct map_string_int *m = vm;
+	map_set_string_int(m, xstrdup(k), atoi(v), NULL);
+}
+
+static void store_foo_foo(void *vm, const char *k, const char *v)
+{
+	struct map_foo_foo *m = vm;
+	struct foo kf, vf;
+	snprintf(kf.buf, sizeof(kf.buf), "%s", k);
+	snprintf(vf.buf, sizeof(vf.buf), "%s", v);
+	map_set_foo_foo(m, &kf, &vf, NULL);
+}
+
+static int fetch_string_int(void *vm, const char *k)
+{
+	struct map_string_int *m = vm;
+	int value;
+	if (map_get_string_int(m, k, &value))
+		return value;
+	return 0;
+}
+
+static int fetch_foo_foo(void *vm, const char *k)
+{
+	struct map_foo_foo *m = vm;
+	struct foo key, value;
+	snprintf(key.buf, sizeof(key.buf), "%s", k);
+	if (map_get_foo_foo(m, &key, &value))
+		return atoi(value.buf);
+	return 0;
+}
+
+static void print_string_int(void *vm)
+{
+	struct map_string_int *m = vm;
+	int i;
+	for (i = 0; i < m->size; i++) {
+		struct map_entry_string_int *e = m->hash + i;
+
+		if (e->key)
+			printf("%s: %d\n", e->key, e->value);
+	}
+}
+
+static void print_foo_foo(void *vm)
+{
+	struct map_foo_foo *m = vm;
+	int i;
+	for (i = 0; i < m->size; i++) {
+		struct map_entry_foo_foo *e = m->hash + i;
+
+		if (e->used)
+			printf("%s: %s\n", e->key.buf, e->value.buf);
+	}
+}
+
+static void do_op(void *vm,
+		  void (*print)(void *),
+		  int (*fetch)(void *, const char *),
+		  const char **argv)
+{
+	const char *op = *argv++;
+	if (!op)
+		usage(usage_msg);
+
+	if (!strcmp(op, "print"))
+		print(vm);
+	else if (!strcmp(op, "find")) {
+		for (; *argv; argv++) {
+			const char *key = *argv;
+			int value = fetch(vm, key);
+
+			if (value)
+				printf("%s: %d\n", key, value);
+			else
+				printf("%s: not found\n", key);
+		}
+	}
+	else
+		usage(usage_msg);
+}
+
+int main(int argc, const char **argv)
+{
+	const char *type;
+
+	argv++;
+	type = *argv++;
+	if (!type)
+	       usage(usage_msg);
+
+	if (!strcmp(type, "pointer")) {
+		struct map_string_int m = {0};
+		read_map(&m, store_string_int);
+		do_op(&m, print_string_int, fetch_string_int, argv);
+	}
+	else if (!strcmp(type, "struct")) {
+		struct map_foo_foo m = {0};
+		read_map(&m, store_foo_foo);
+		do_op(&m, print_foo_foo, fetch_foo_foo, argv);
+	}
+	else
+		usage(usage_msg);
+
+	return 0;
+}
-- 
1.7.12.rc1.7.g7a223a6

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 2/8] map: add helper functions for objects as keys
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
  2012-08-04 17:10                   ` [PATCH 1/8] implement generic key/value map Jeff King
@ 2012-08-04 17:10                   ` Jeff King
  2012-08-04 17:11                   ` [PATCH 3/8] fast-export: use object to uint32 map instead of "decorate" Jeff King
                                     ` (5 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:10 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

These functions can be used as HASH and KEY_EQUAL functions
when defining new maps with "struct object *" as their key.

Signed-off-by: Jeff King <peff@peff.net>
---
 Makefile     |  1 +
 map-object.h | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)
 create mode 100644 map-object.h

diff --git a/Makefile b/Makefile
index d512f27..1d29a95 100644
--- a/Makefile
+++ b/Makefile
@@ -633,6 +633,7 @@ LIB_H += map-decl.h
 LIB_H += map-done.h
 LIB_H += map-impl.h
 LIB_H += map-init.h
+LIB_H += map-object.h
 LIB_H += merge-file.h
 LIB_H += merge-recursive.h
 LIB_H += mergesort.h
diff --git a/map-object.h b/map-object.h
new file mode 100644
index 0000000..4f98413
--- /dev/null
+++ b/map-object.h
@@ -0,0 +1,19 @@
+#ifndef MAP_OBJECT_H
+#define MAP_OBJECT_H
+
+#include "object.h"
+
+static unsigned int hash_obj(const struct object *obj, unsigned int n)
+{
+	unsigned int hash;
+
+	memcpy(&hash, obj->sha1, sizeof(unsigned int));
+	return hash % n;
+}
+
+static int obj_equal(const struct object *a, const struct object *b)
+{
+	return a == b;
+}
+
+#endif /* MAP_OBJECT_H */
-- 
1.7.12.rc1.7.g7a223a6

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 3/8] fast-export: use object to uint32 map instead of "decorate"
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
  2012-08-04 17:10                   ` [PATCH 1/8] implement generic key/value map Jeff King
  2012-08-04 17:10                   ` [PATCH 2/8] map: add helper functions for objects as keys Jeff King
@ 2012-08-04 17:11                   ` Jeff King
  2012-08-04 17:11                   ` [PATCH 4/8] decorate: use "map" for the underlying implementation Jeff King
                                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:11 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

Previously we encoded the "mark" mapping inside the "void *"
field of a "struct decorate". It's a little more natural for
us to do so using a data structure made for holding actual
values.

Signed-off-by: Jeff King <peff@peff.net>
---
 builtin/fast-export.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index 9ab6db3..03e8f5f 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -12,7 +12,7 @@
 #include "diffcore.h"
 #include "log-tree.h"
 #include "revision.h"
-#include "decorate.h"
+#include "map-object.h"
 #include "string-list.h"
 #include "utf8.h"
 #include "parse-options.h"
@@ -61,7 +61,17 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
 	return 0;
 }
 
-static struct decoration idnums;
+#define NAME object_uint32
+#define KEY_TYPE struct object *
+#define VALUE_TYPE uint32_t
+#define HASH hash_obj
+#define KEY_EQUAL obj_equal
+#define SENTINEL_NULL
+#include "map-decl.h"
+#include "map-impl.h"
+#include "map-done.h"
+
+static struct map_object_uint32 idnums;
 static uint32_t last_idnum;
 
 static int has_unshown_parent(struct commit *commit)
@@ -75,20 +85,9 @@ static int has_unshown_parent(struct commit *commit)
 	return 0;
 }
 
-/* Since intptr_t is C99, we do not use it here */
-static inline uint32_t *mark_to_ptr(uint32_t mark)
-{
-	return ((uint32_t *)NULL) + mark;
-}
-
-static inline uint32_t ptr_to_mark(void * mark)
-{
-	return (uint32_t *)mark - (uint32_t *)NULL;
-}
-
 static inline void mark_object(struct object *object, uint32_t mark)
 {
-	add_decoration(&idnums, object, mark_to_ptr(mark));
+	map_set_object_uint32(&idnums, object, mark, NULL);
 }
 
 static inline void mark_next_object(struct object *object)
@@ -98,10 +97,9 @@ static inline void mark_next_object(struct object *object)
 
 static int get_object_mark(struct object *object)
 {
-	void *decoration = lookup_decoration(&idnums, object);
-	if (!decoration)
-		return 0;
-	return ptr_to_mark(decoration);
+	uint32_t ret = 0;
+	map_get_object_uint32(&idnums, object, &ret);
+	return ret;
 }
 
 static void show_progress(void)
@@ -557,8 +555,6 @@ static void handle_tags_and_duplicates(struct string_list *extra_refs)
 static void export_marks(char *file)
 {
 	unsigned int i;
-	uint32_t mark;
-	struct object_decoration *deco = idnums.hash;
 	FILE *f;
 	int e = 0;
 
@@ -567,15 +563,15 @@ static void export_marks(char *file)
 		die_errno("Unable to open marks file %s for writing.", file);
 
 	for (i = 0; i < idnums.size; i++) {
-		if (deco->base && deco->base->type == 1) {
-			mark = ptr_to_mark(deco->decoration);
-			if (fprintf(f, ":%"PRIu32" %s\n", mark,
-				sha1_to_hex(deco->base->sha1)) < 0) {
+		const struct map_entry_object_uint32 *m = idnums.hash + i;
+
+		if (m->key && m->key->type == 1) {
+			if (fprintf(f, ":%"PRIu32" %s\n", m->value,
+				sha1_to_hex(m->key->sha1)) < 0) {
 			    e = 1;
 			    break;
 			}
 		}
-		deco++;
 	}
 
 	e |= ferror(f);
-- 
1.7.12.rc1.7.g7a223a6

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 4/8] decorate: use "map" for the underlying implementation
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
                                     ` (2 preceding siblings ...)
  2012-08-04 17:11                   ` [PATCH 3/8] fast-export: use object to uint32 map instead of "decorate" Jeff King
@ 2012-08-04 17:11                   ` Jeff King
  2012-08-04 17:11                   ` [PATCH 5/8] map: implement persistent maps Jeff King
                                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:11 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

The decoration API maps objects to void pointers. This is a
subset of what the map API is capable of, so let's get rid
of the duplicate implementation.

We could just fix all callers of decorate to call the map
API directly. However, the map API is very generic since it
is meant to handle any type. In particular, it can't use
sentinel values like "NULL" to indicate "entry not found"
(since it doesn't know whether the type can represent such a
sentinel value), which makes the interface slightly more
complicated.

Instead, let's keep the existing decorate API as a wrapper
on top of map. No callers need to be updated at all.

Signed-off-by: Jeff King <peff@peff.net>
---
 Documentation/technical/api-decorate.txt | 38 +++++++++++++-
 Makefile                                 |  3 ++
 decorate.c                               | 85 +++-----------------------------
 decorate.h                               | 10 ++--
 map-object-void-params.h                 |  6 +++
 map-object-void.c                        |  7 +++
 map-object-void.h                        |  8 +++
 7 files changed, 70 insertions(+), 87 deletions(-)
 create mode 100644 map-object-void-params.h
 create mode 100644 map-object-void.c
 create mode 100644 map-object-void.h

diff --git a/Documentation/technical/api-decorate.txt b/Documentation/technical/api-decorate.txt
index 1d52a6c..3c1197a 100644
--- a/Documentation/technical/api-decorate.txt
+++ b/Documentation/technical/api-decorate.txt
@@ -1,6 +1,40 @@
 decorate API
 ============
 
-Talk about <decorate.h>
+The decorate API is a system for efficiently mapping objects to values
+in memory. It is slightly slower than an actual member of an object
+struct (because it incurs a hash lookup), but it uses less memory when
+the mapping is not in use, or when the number of decorated objects is
+small compared to the total number of objects.
 
-(Linus)
+The decorate API is a special form of the `map` link:api-map.html[map
+API]. It has slightly simpler calling conventions, but only use objects
+as keys, and can only store void pointers as values.
+
+
+Data Structures
+---------------
+
+`struct decoration`::
+
+	This structure represents a single mapping of objects to values.
+	The `name` field is not used by the decorate API itself, but may
+	be used by calling code. The `map` field represents the actual
+	mapping of objects to void pointers (see the
+	link:api-map.html[map API] for details).
+
+
+Functions
+---------
+
+`add_decoration`::
+
+	Add a mapping from an object to a void pointer. If there was a
+	previous value for this object, the function returns this value;
+	otherwise, the function returns NULL.
+
+`lookup_decoration`::
+
+	Retrieve the stored value pointer for an object from the
+	mapping. The return value is the value pointer, or `NULL` if
+	there is no value for this object.
diff --git a/Makefile b/Makefile
index 1d29a95..7684a76 100644
--- a/Makefile
+++ b/Makefile
@@ -633,6 +633,8 @@ LIB_H += map-decl.h
 LIB_H += map-done.h
 LIB_H += map-impl.h
 LIB_H += map-init.h
+LIB_H += map-object-void-params.h
+LIB_H += map-object-void.h
 LIB_H += map-object.h
 LIB_H += merge-file.h
 LIB_H += merge-recursive.h
@@ -747,6 +749,7 @@ LIB_OBJS += ll-merge.o
 LIB_OBJS += lockfile.o
 LIB_OBJS += log-tree.o
 LIB_OBJS += mailmap.o
+LIB_OBJS += map-object-void.o
 LIB_OBJS += match-trees.o
 LIB_OBJS += merge-file.o
 LIB_OBJS += merge-recursive.o
diff --git a/decorate.c b/decorate.c
index 2f8a63e..31e9656 100644
--- a/decorate.c
+++ b/decorate.c
@@ -1,88 +1,17 @@
-/*
- * decorate.c - decorate a git object with some arbitrary
- * data.
- */
 #include "cache.h"
-#include "object.h"
 #include "decorate.h"
 
-static unsigned int hash_obj(const struct object *obj, unsigned int n)
-{
-	unsigned int hash;
-
-	memcpy(&hash, obj->sha1, sizeof(unsigned int));
-	return hash % n;
-}
-
-static void *insert_decoration(struct decoration *n, const struct object *base, void *decoration)
-{
-	int size = n->size;
-	struct object_decoration *hash = n->hash;
-	unsigned int j = hash_obj(base, size);
-
-	while (hash[j].base) {
-		if (hash[j].base == base) {
-			void *old = hash[j].decoration;
-			hash[j].decoration = decoration;
-			return old;
-		}
-		if (++j >= size)
-			j = 0;
-	}
-	hash[j].base = base;
-	hash[j].decoration = decoration;
-	n->nr++;
-	return NULL;
-}
-
-static void grow_decoration(struct decoration *n)
-{
-	int i;
-	int old_size = n->size;
-	struct object_decoration *old_hash = n->hash;
-
-	n->size = (old_size + 1000) * 3 / 2;
-	n->hash = xcalloc(n->size, sizeof(struct object_decoration));
-	n->nr = 0;
-
-	for (i = 0; i < old_size; i++) {
-		const struct object *base = old_hash[i].base;
-		void *decoration = old_hash[i].decoration;
-
-		if (!base)
-			continue;
-		insert_decoration(n, base, decoration);
-	}
-	free(old_hash);
-}
-
-/* Add a decoration pointer, return any old one */
 void *add_decoration(struct decoration *n, const struct object *obj,
-		void *decoration)
+		     void *decoration)
 {
-	int nr = n->nr + 1;
-
-	if (nr > n->size * 2 / 3)
-		grow_decoration(n);
-	return insert_decoration(n, obj, decoration);
+	void *ret = NULL;
+	map_set_object_void(&n->map, obj, decoration, &ret);
+	return ret;
 }
 
-/* Lookup a decoration pointer */
 void *lookup_decoration(struct decoration *n, const struct object *obj)
 {
-	unsigned int j;
-
-	/* nothing to lookup */
-	if (!n->size)
-		return NULL;
-	j = hash_obj(obj, n->size);
-	for (;;) {
-		struct object_decoration *ref = n->hash + j;
-		if (ref->base == obj)
-			return ref->decoration;
-		if (!ref->base)
-			return NULL;
-		if (++j == n->size)
-			j = 0;
-	}
+	void *ret = NULL;
+	map_get_object_void(&n->map, obj, &ret);
+	return ret;
 }
diff --git a/decorate.h b/decorate.h
index e732804..c58acbc 100644
--- a/decorate.h
+++ b/decorate.h
@@ -1,15 +1,11 @@
 #ifndef DECORATE_H
 #define DECORATE_H
 
-struct object_decoration {
-	const struct object *base;
-	void *decoration;
-};
+#include "map-object-void.h"
 
 struct decoration {
-	const char *name;
-	unsigned int size, nr;
-	struct object_decoration *hash;
+	char *name;
+	struct map_object_void map;
 };
 
 extern void *add_decoration(struct decoration *n, const struct object *obj, void *decoration);
diff --git a/map-object-void-params.h b/map-object-void-params.h
new file mode 100644
index 0000000..471fbdc
--- /dev/null
+++ b/map-object-void-params.h
@@ -0,0 +1,6 @@
+#define NAME object_void
+#define KEY_TYPE const struct object *
+#define VALUE_TYPE void *
+#define HASH hash_obj
+#define KEY_EQUAL obj_equal
+#define SENTINEL_NULL
diff --git a/map-object-void.c b/map-object-void.c
new file mode 100644
index 0000000..4934642
--- /dev/null
+++ b/map-object-void.c
@@ -0,0 +1,7 @@
+#include "cache.h"
+#include "map-object-void.h"
+
+#include "map-object.h"
+#include "map-object-void-params.h"
+#include "map-impl.h"
+#include "map-done.h"
diff --git a/map-object-void.h b/map-object-void.h
new file mode 100644
index 0000000..b4bc64d
--- /dev/null
+++ b/map-object-void.h
@@ -0,0 +1,8 @@
+#ifndef MAP_OBJECT_VOID_H
+#define MAP_OBJECT_VOID_H
+
+#include "map-object-void-params.h"
+#include "map-decl.h"
+#include "map-done.h"
+
+#endif /* MAP_OBJECT_VOID_H */
-- 
1.7.12.rc1.7.g7a223a6

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 5/8] map: implement persistent maps
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
                                     ` (3 preceding siblings ...)
  2012-08-04 17:11                   ` [PATCH 4/8] decorate: use "map" for the underlying implementation Jeff King
@ 2012-08-04 17:11                   ` Jeff King
  2012-08-04 17:11                   ` [PATCH 6/8] implement metadata cache subsystem Jeff King
                                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:11 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

It's sometimes useful to keep a mapping across program
invocations (e.g., because a space/time tradeoff makes it
worth keeping a cache of calculated metadata for some
objects).

This adds a persistent version of the map API which can be
backed by a flat memory store (like an mmap'd file). By
itself, it's not very pleasant to use, as the caller is
responsible for actually opening and mapping files. But it
provides the building blocks for disk caches, which will
come in the next patch.

Signed-off-by: Jeff King <peff@peff.net>
---
 .gitignore                          |   1 +
 Documentation/technical/api-map.txt |  77 +++++++++++++++++++
 Makefile                            |   1 +
 map-decl.h                          |  18 +++++
 map-impl.h                          | 148 ++++++++++++++++++++++++++++++++++++
 t/t0007-map.sh                      |  24 ++++++
 test-map-persist.c                  | 125 ++++++++++++++++++++++++++++++
 7 files changed, 394 insertions(+)
 create mode 100644 test-map-persist.c

diff --git a/.gitignore b/.gitignore
index c2bad89..14350f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -185,6 +185,7 @@
 /test-index-version
 /test-line-buffer
 /test-map
+/test-map-persist
 /test-match-trees
 /test-mergesort
 /test-mktemp
diff --git a/Documentation/technical/api-map.txt b/Documentation/technical/api-map.txt
index f86b785..256bddf 100644
--- a/Documentation/technical/api-map.txt
+++ b/Documentation/technical/api-map.txt
@@ -25,6 +25,21 @@ The decorate API provides a similar interface to map, but is restricted to
 using "struct object" as the key, and a void pointer as the value.
 
 
+Persistent Maps
+---------------
+
+Maps come in two flavors: persistent and in-core. In-core maps are
+represented by a hash table, and can contain any C type. Persistent maps
+are backed by flat storage, such as an mmap'd file, and store values
+between program runs. Key and value types must be serializable to
+fixed-width byte values.
+
+The flat storage is a sorted array of key/value pairs, with no
+delimiters between pairs or between elements of a pair.  Persistent maps
+use an in-core map for newly-added values, and then merge the new
+values into the flat storage on request.
+
+
 Defining New Map Types
 ----------------------
 
@@ -78,6 +93,39 @@ The following map defines are available:
 	optimization to shrink the size of each map entry, at the cost
 	of not being able to store `NULL` key pointers in the map.
 
+`PERSIST`::
+
+	Optional. If defined, functions for storing the map to disk are
+	also created.
+
+`KEY_SIZE`::
+
+	Required by `PERSIST`. The fixed-width size of the on-disk
+	representation of a key, in bytes.
+
+`VALUE_SIZE`::
+
+	Required by `PERSIST`. The fixed-width size of the on-disk
+	representation of a value, in bytes.
+
+`KEY_TO_DISK(KEY_ARG key, unsigned char *out)`::
+
+	Required by `PERSIST`. A function that will convert the
+	in-memory representation of a key into its on-disk
+	representation. Note that the reverse transformation does not
+	have to be possible.
+
+`VALUE_TO_DISK(VALUE_ARG value, unsigned char *out)`::
+
+	Required by `PERSIST`. A function that will convert the
+	in-memory representation of a value into its on-disk
+	representation.
+
+`DISK_TO_VALUE(unsigned char *in, VALUE_TYPE *value)`::
+
+	Required by `PERSIST`. A function that will convert the on-disk
+	representation of a value into its in-memory representation.
+
 
 Data Structures
 ---------------
@@ -104,6 +152,14 @@ Each defined map type will have its own structure (e.g., `map_object_uint32`).
 	need to use this type directly, unless you are enumerating all
 	elements of a map.
 
+`struct map_persist_NAME`::
+
+	A persistent map. This struct should be initialized to
+	all-zeroes. The `map` field contains a complete in-core map. The
+	`disk_entries` and `disk_nr` fields specify the flat storage.
+	These should not be set directly, but rather through the
+	`attach` function.
+
 
 Functions
 ---------
@@ -127,6 +183,27 @@ Each defined map type will have its own set of access functions (e.g.,
 	defined, the key and value are passed in as `const KEY_TYPE *`
 	and `const VALUE_TYPE *`, respectively.
 
+`map_persist_get_NAME(struct map_persist_NAME *, KEY_TYPE key, VALUE_TYPE *value)`::
+
+	Same as `map_get_NAME`, but for a persistent map.
+
+`map_persist_set_NAME(struct map_persist_NAME *, KEY_TYPE key, VALUE_TYPE value)`::
+
+	Same as `map_set_name`, but for a persistent map. It does
+	not provide the "old" value for the key.
+
+`map_persist_attach_NAME`::
+
+	Attach storage from `buf` of size `len` bytes as the flat
+	backing store for the map. The map does not copy the storage;
+	the caller is responsible for making sure it stays around as
+	long as the map does.
+
+`map_persist_flush_NAME`::
+
+	Merge in-core entries with those found in the backing store, and
+	write the result to `fd`. Returns 0 for success, -1 for failure.
+
 
 Examples
 --------
diff --git a/Makefile b/Makefile
index 7684a76..e38ae9e 100644
--- a/Makefile
+++ b/Makefile
@@ -492,6 +492,7 @@ TEST_PROGRAMS_NEED_X += test-genrandom
 TEST_PROGRAMS_NEED_X += test-index-version
 TEST_PROGRAMS_NEED_X += test-line-buffer
 TEST_PROGRAMS_NEED_X += test-map
+TEST_PROGRAMS_NEED_X += test-map-persist
 TEST_PROGRAMS_NEED_X += test-match-trees
 TEST_PROGRAMS_NEED_X += test-mergesort
 TEST_PROGRAMS_NEED_X += test-mktemp
diff --git a/map-decl.h b/map-decl.h
index 7d4daaf..1ba7157 100644
--- a/map-decl.h
+++ b/map-decl.h
@@ -20,3 +20,21 @@ extern int THIS(map_set)(struct THIS(map) *,
 			 KEY_ARG key,
 			 VALUE_ARG value,
 			 VALUE_TYPE *old);
+
+#ifdef PERSIST
+struct THIS(map_persist) {
+	const unsigned char *disk_entries;
+	unsigned int disk_nr;
+	struct THIS(map) mem;
+};
+extern int THIS(map_persist_get)(struct THIS(map_persist) *,
+				 KEY_ARG key,
+				 VALUE_TYPE *value);
+extern int THIS(map_persist_set)(struct THIS(map_persist) *,
+				 KEY_ARG key,
+				 VALUE_ARG value);
+extern void THIS(map_persist_attach)(struct THIS(map_persist) *,
+				     const unsigned char *buf,
+				     unsigned int len);
+extern int THIS(map_persist_flush)(struct THIS(map_persist) *, int fd);
+#endif
diff --git a/map-impl.h b/map-impl.h
index 844b7f1..c44d0a0 100644
--- a/map-impl.h
+++ b/map-impl.h
@@ -92,3 +92,151 @@ int THIS(map_get)(struct THIS(map) *m,
 	}
 	return 0;
 }
+
+#ifdef PERSIST
+static const unsigned char *THIS(disk_lookup)(const unsigned char *buf, int nr,
+					      int ksize, int vsize,
+					      const unsigned char *key)
+{
+	unsigned lo = 0, hi = nr;
+
+	do {
+		unsigned mi = (lo + hi) / 2;
+		const unsigned char *e = buf + mi * (ksize + vsize);
+		int cmp = memcmp(key, e, ksize);
+
+		if (!cmp)
+			return e + ksize;
+		if (cmp < 0)
+			hi = mi;
+		else
+			lo = mi + 1;
+	} while (lo < hi);
+
+	return NULL;
+}
+
+int THIS(map_persist_get)(struct THIS(map_persist) *m,
+			  KEY_ARG key,
+			  VALUE_TYPE *value)
+{
+	unsigned char disk_key[KEY_SIZE];
+	const unsigned char *disk_value;
+
+	if (THIS(map_get)(&m->mem, key, value))
+		return 1;
+
+	if (!m->disk_entries)
+		return 0;
+
+	KEY_TO_DISK(key, disk_key);
+	disk_value = THIS(disk_lookup)(m->disk_entries, m->disk_nr,
+				       KEY_SIZE, VALUE_SIZE, disk_key);
+	if (disk_value) {
+		DISK_TO_VALUE(disk_value, value);
+		return 1;
+	}
+
+	return 0;
+}
+
+int THIS(map_persist_set)(struct THIS(map_persist) *m,
+			  KEY_ARG key,
+			  VALUE_ARG value)
+{
+	return THIS(map_set)(&m->mem, key, value, NULL);
+}
+
+void THIS(map_persist_attach)(struct THIS(map_persist) *m,
+			      const unsigned char *buf,
+			      unsigned int len)
+{
+	m->disk_entries = buf;
+	m->disk_nr = len / (KEY_SIZE + VALUE_SIZE);
+}
+
+static unsigned char *THIS(flatten_mem)(struct THIS(map_persist) *m)
+{
+	unsigned char *ret, *out;
+	int i, nr;
+
+	out = ret = xmalloc(m->mem.nr * (KEY_SIZE + VALUE_SIZE));
+	nr = 0;
+	for (i = 0; i < m->mem.size; i++) {
+		struct THIS(map_entry) *e = m->mem.hash + i;
+
+		if (!hash_used(e))
+			continue;
+
+		if (nr++ == m->mem.nr)
+			die("BUG: map hash contained extra values");
+
+		KEY_TO_DISK(key_ref(e->key), out);
+		out += KEY_SIZE;
+		VALUE_TO_DISK(value_ref(e->value), out);
+		out += VALUE_SIZE;
+	}
+
+	if (nr != m->mem.nr)
+		die("BUG: map hash had fewer values than claimed");
+
+	return ret;
+}
+
+static int THIS(keycmp)(const void *a, const void *b)
+{
+	return memcmp(a, b, KEY_SIZE);
+}
+
+static int THIS(merge_entries)(int fd,
+			       const unsigned char *left, unsigned nr_left,
+			       const unsigned char *right, unsigned nr_right)
+{
+#define ADVANCE(name) \
+	do { \
+		name += KEY_SIZE + VALUE_SIZE; \
+		nr_##name--; \
+	} while (0)
+#define WRITE_ENTRY(name) \
+	do { \
+		if (write_in_full(fd, name, KEY_SIZE + VALUE_SIZE) < 0) \
+			return -1; \
+		ADVANCE(name); \
+	} while (0)
+
+	while (nr_left && nr_right) {
+		int cmp = THIS(keycmp)(left, right);
+
+		/* skip duplicates, preferring left to right */
+		if (cmp == 0)
+			ADVANCE(right);
+		else if (cmp < 0)
+			WRITE_ENTRY(left);
+		else
+			WRITE_ENTRY(right);
+	}
+	while (nr_left)
+		WRITE_ENTRY(left);
+	while (nr_right)
+		WRITE_ENTRY(right);
+
+#undef WRITE_ENTRY
+#undef ADVANCE
+
+	return 0;
+}
+
+int THIS(map_persist_flush)(struct THIS(map_persist) *m, int fd)
+{
+	unsigned char *mem_entries;
+	int r;
+
+	mem_entries = THIS(flatten_mem)(m);
+	qsort(mem_entries, m->mem.nr, KEY_SIZE + VALUE_SIZE, THIS(keycmp));
+
+	r = THIS(merge_entries)(fd, mem_entries, m->mem.nr,
+				m->disk_entries, m->disk_nr);
+	free(mem_entries);
+	return r;
+}
+#endif
diff --git a/t/t0007-map.sh b/t/t0007-map.sh
index b5845f4..14187d6 100755
--- a/t/t0007-map.sh
+++ b/t/t0007-map.sh
@@ -47,4 +47,28 @@ for type in pointer struct; do
 	"
 done
 
+test_expect_success 'put some items in a persistent map' '
+	test-map-persist foo d:4 a:1 c:3 b:2
+'
+
+test_expect_success 'retrieve persistent items' '
+	cat >expect <<-\EOF &&
+	c: 3
+	a: 1
+	e: not found
+	EOF
+	test-map-persist foo c a e >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'new entries override disk entries' '
+	cat >expect <<-\EOF &&
+	c: 3
+	a: 5
+	e: not found
+	EOF
+	test-map-persist foo a:5 c a e >actual &&
+	test_cmp expect actual
+'
+
 test_done
diff --git a/test-map-persist.c b/test-map-persist.c
new file mode 100644
index 0000000..1334cdd
--- /dev/null
+++ b/test-map-persist.c
@@ -0,0 +1,125 @@
+#include "cache.h"
+
+static const char usage_msg[] =
+"test-map-persist <file> [keys]";
+
+static inline unsigned int hash_string(const char *str, unsigned int n)
+{
+	unsigned long hash = 0;
+
+	for (; *str; str++)
+		hash = (hash << 5) + *str;
+	return hash % n;
+}
+
+static inline unsigned int string_equal(const char *a, const char *b)
+{
+	return !strcmp(a, b);
+}
+
+static inline void string_to_disk(const char *s, unsigned char *out)
+{
+	/*
+	 * we need a fixed-width representation, so let's just store 10 bytes,
+	 * padded with zeroes
+	 */
+	int len = strlen(s);
+	if (len > 10)
+		len = 10;
+	memcpy(out, s, len);
+	memset(out + len, 0, 10 - len);
+}
+
+static inline void uint32_to_disk(uint32_t v, unsigned char *out)
+{
+	v = htonl(v);
+	memcpy(out, &v, 4);
+}
+
+static inline void disk_to_uint32(const unsigned char *in, uint32_t *v)
+{
+	memcpy(v, in, 4);
+	*v = ntohl(*v);
+}
+
+#define NAME string_uint32
+#define PERSIST
+#define KEY_TYPE const char *
+#define KEY_SIZE 10
+#define KEY_TO_DISK string_to_disk
+#define KEY_EQUAL string_equal
+#define HASH hash_string
+#define DISK_LOOKUP_FUN lookup_string
+#define SENTINEL_NULL
+#define VALUE_TYPE uint32_t
+#define VALUE_SIZE 4
+#define VALUE_TO_DISK uint32_to_disk
+#define DISK_TO_VALUE disk_to_uint32
+#include "map-decl.h"
+#include "map-impl.h"
+#include "map-done.h"
+
+static void open_map(const char *path, struct map_persist_string_uint32 *m)
+{
+	int fd;
+	struct stat sb;
+	const unsigned char *buf;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return;
+
+	fstat(fd, &sb);
+	buf = xmmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	map_persist_attach_string_uint32(m, buf, sb.st_size);
+}
+
+static void flush_map(const char *path, struct map_persist_string_uint32 *m)
+{
+	char tmp[1024];
+	int fd;
+
+	snprintf(tmp, sizeof(tmp), "%s.tmp", path);
+	fd = open(tmp, O_WRONLY|O_CREAT, 0666);
+	if (fd < 0)
+		die_errno("unable to open '%s' for writing", tmp);
+	if (map_persist_flush_string_uint32(m, fd) < 0 || close(fd) < 0)
+		die_errno("unable to write new map");
+	if (rename(tmp, path) < 0)
+		die_errno("unable to rename new map into place");
+}
+
+int main(int argc, const char **argv)
+{
+	struct map_persist_string_uint32 m = {0};
+	const char *path;
+	const char *spec;
+
+	argv++;
+	path = *argv++;
+	if (!path)
+		usage(usage_msg);
+
+	open_map(path, &m);
+
+	while ((spec = *argv++)) {
+		char *colon = strchr(spec, ':');
+		if (colon) {
+			*colon++ = '\0';
+			map_persist_set_string_uint32(&m, spec, atoi(colon));
+		}
+		else {
+			uint32_t value;
+			if (map_persist_get_string_uint32(&m, spec, &value))
+				printf("%s: %d\n", spec, value);
+			else
+				printf("%s: not found\n", spec);
+		}
+	}
+
+	flush_map(path, &m);
+
+	return 0;
+}
-- 
1.7.12.rc1.7.g7a223a6

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 6/8] implement metadata cache subsystem
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
                                     ` (4 preceding siblings ...)
  2012-08-04 17:11                   ` [PATCH 5/8] map: implement persistent maps Jeff King
@ 2012-08-04 17:11                   ` Jeff King
  2012-08-04 22:49                     ` Junio C Hamano
  2012-08-06 20:38                     ` Jeff King
  2012-08-04 17:12                   ` [PATCH 7/8] implement rename cache Jeff King
  2012-08-04 17:14                   ` [PATCH 8/8] diff: optionally use " Jeff King
  7 siblings, 2 replies; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:11 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

There are some calculations that git makes repeatedly, even
though the results are invariant for a certain input (e.g.,
the patch-id of a certain commit). We can make a space/time
tradeoff by caching these on disk between runs.

Even though these may be immutable for a certain commit, we
don't want to directly store the results in the commit
objects themselves, for a few reasons:

  1. They are not necessarily used by all algorithms, so
     bloating the commit object might slow down other
     algorithms.

  2. Because they can be calculated from the existing
     commits, they are redundant with the existing
     information. Thus they are an implementation detail of
     our current algorithms, and should not be cast in stone
     by including them in the commit sha1.

  3. They may only be immutable under a certain set of
     conditions (e.g., which grafts or replace refs we are
     using). Keeping the storage external means we can
     invalidate and regenerate the cache whenever those
     conditions change.

The persistent map API already provides the storage we need.
This new API takes care of the details of opening and
closing the cache files automatically. Callers need only get
and set values as they see fit.

Signed-off-by: Jeff King <peff@peff.net>
---
 Documentation/technical/api-metadata-cache.txt |  67 ++++++++++
 Makefile                                       |   2 +
 metadata-cache.c                               | 169 +++++++++++++++++++++++++
 metadata-cache.h                               |   8 ++
 4 files changed, 246 insertions(+)
 create mode 100644 Documentation/technical/api-metadata-cache.txt
 create mode 100644 metadata-cache.c
 create mode 100644 metadata-cache.h

diff --git a/Documentation/technical/api-metadata-cache.txt b/Documentation/technical/api-metadata-cache.txt
new file mode 100644
index 0000000..5f4422d
--- /dev/null
+++ b/Documentation/technical/api-metadata-cache.txt
@@ -0,0 +1,67 @@
+metadata cache API
+==================
+
+The metadata cache API provides simple-to-use, persistent key/value
+storage. It is built on the link:api-map.html[map API], so keys and
+values can have any serializable type.
+
+Caches are statically allocated, and no explicit initialization is
+required.  Callers can simply call the "get" and "set" functions for a
+given cache.  At program exit, any new entries in the cache are flushed
+to disk.
+
+
+Defining a New Cache
+--------------------
+
+You need to provide three pieces of information to define a new cache:
+
+name::
+	This name will be used both as part of the C identifier and as
+	part of the filename under which the cache is stored. Restrict
+	the characters used to alphanumerics and underscore.
+
+map::
+	The type of map (declared by `DECLARE_MAP`) that this cache will
+	store.
+
+validity::
+	A function that will generate a 20-byte "validity token"
+	representing the conditions under which the cache is valid.
+	For example, a cache that depended on the structure of the
+	history graph would be valid only under a given set of grafts
+	and replace refs. That set could be stirred into a sha1 and used
+	as a validity token.
+
+You must declare the cache in metadata-cache.h using
+`DECLARE_METADATA_CACHE`, and then implement it in metadata-cache.c
+using `IMPLEMENT_METADATA_CACHE`.
+
+
+Using a Cache
+-------------
+
+Interaction with a cache consists entirely of getting and setting
+values. No initialization or cleanup is required. The get and set
+functions mirror their "map" counterparts; see the
+link:api-map.html[map API] for details.
+
+
+File Format
+-----------
+
+Cache files are stored in the $GIT_DIR/cache directory. Each cache gets
+its own directory, named after the `name` parameter in the cache
+definition. Within each directory is a set of files, one cache per file,
+named after their validity tokens. Caches for multiple sets of
+conditions can simultaneously exist, and git will use whichever is
+appropriate.
+
+The files themselves consist of an 8-byte header. The first four bytes
+are the magic sequence "MTAC" (for "MeTA Cache"), followed by a 4-byte
+version number, in network byte order. This document describes version
+1.
+
+The rest of the file consists of the persistent map data. This is a
+compact, sorted list of keys and values; see the link:api-map.html[map
+API] for details.
diff --git a/Makefile b/Makefile
index e38ae9e..08ddfa7 100644
--- a/Makefile
+++ b/Makefile
@@ -640,6 +640,7 @@ LIB_H += map-object.h
 LIB_H += merge-file.h
 LIB_H += merge-recursive.h
 LIB_H += mergesort.h
+LIB_H += metadata-cache.h
 LIB_H += notes-cache.h
 LIB_H += notes-merge.h
 LIB_H += notes.h
@@ -755,6 +756,7 @@ LIB_OBJS += match-trees.o
 LIB_OBJS += merge-file.o
 LIB_OBJS += merge-recursive.o
 LIB_OBJS += mergesort.o
+LIB_OBJS += metadata-cache.o
 LIB_OBJS += name-hash.o
 LIB_OBJS += notes.o
 LIB_OBJS += notes-cache.o
diff --git a/metadata-cache.c b/metadata-cache.c
new file mode 100644
index 0000000..94eabfb
--- /dev/null
+++ b/metadata-cache.c
@@ -0,0 +1,169 @@
+#include "cache.h"
+#include "metadata-cache.h"
+
+struct metadata_cache {
+	const char *name;
+	void (*attach)(unsigned char *buf, unsigned long len);
+	int (*flush)(int fd);
+	int (*empty)(void);
+	void (*validity)(unsigned char[20]);
+	int fd;
+	unsigned char *buf;
+	unsigned long len;
+	struct metadata_cache *next;
+	int initialized;
+};
+
+struct metadata_cache *caches;
+
+static const char *metadata_cache_path(struct metadata_cache *mc)
+{
+	unsigned char token[20];
+
+	if (mc->validity)
+		mc->validity(token);
+	else
+		hashcpy(token, null_sha1);
+	return git_path("cache/%s/%s", mc->name, sha1_to_hex(token));
+}
+
+static void write_one_cache(struct metadata_cache *mc)
+{
+	const char *path;
+	struct strbuf tempfile = STRBUF_INIT;
+	int fd = -1;
+
+	if (mc->empty())
+		return;
+
+	path = metadata_cache_path(mc);
+	strbuf_addf(&tempfile, "%s.XXXXXX", path);
+
+	if (safe_create_leading_directories(tempfile.buf) < 0)
+		goto fail;
+	fd = git_mkstemp_mode(tempfile.buf, 0444);
+	if (fd < 0)
+		goto fail;
+
+	if (write_in_full(fd, "MTAC\x00\x00\x00\x01", 8) < 0)
+		goto fail;
+	if (mc->flush(fd) < 0)
+		goto fail;
+	if (close(fd) < 0)
+		goto fail;
+	if (rename(tempfile.buf, path) < 0)
+		goto fail;
+
+	strbuf_release(&tempfile);
+	return;
+
+fail:
+	close(fd);
+	unlink(tempfile.buf);
+	strbuf_release(&tempfile);
+}
+
+static void write_all_caches(void)
+{
+	struct metadata_cache *mc;
+	for (mc = caches; mc; mc = mc->next)
+		write_one_cache(mc);
+}
+
+void metadata_cache_init(struct metadata_cache *mc)
+{
+	const char *path;
+	struct stat st;
+	const unsigned char *p;
+	uint32_t version;
+	int fd;
+
+	if (mc->initialized)
+		return;
+
+	if (!caches)
+		atexit(write_all_caches);
+	mc->next = caches;
+	caches = mc;
+	mc->initialized = 1;
+
+	path = metadata_cache_path(mc);
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return;
+
+	if (fstat(fd, &st) < 0)
+		goto fail;
+	mc->len = st.st_size;
+	mc->buf = xmmap(NULL, mc->len, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (!mc->buf)
+		goto fail;
+	close(fd);
+	fd = -1;
+
+	if (mc->len < 8) {
+		warning("cache '%s' is missing header", path);
+		goto fail;
+	}
+	p = mc->buf;
+	if (memcmp(p, "MTAC", 4)) {
+		warning("cache '%s' has invalid magic: %c%c%c%c",
+			path, p[0], p[1], p[2], p[3]);
+		goto fail;
+	}
+	p += 4;
+	memcpy(&version, p, 4);
+	version = ntohl(version);
+	if (version != 1) {
+		warning("cache '%s' has unknown version: %"PRIu32,
+			path, version);
+		goto fail;
+	}
+
+	mc->attach(mc->buf + 8, mc->len - 8);
+	return;
+
+fail:
+	close(fd);
+	if (mc->buf)
+		munmap(mc->buf, mc->len);
+	mc->buf = NULL;
+	mc->len = 0;
+}
+
+#define IMPLEMENT_CACHE(name, maptype, ktype, vtype) \
+static struct map_persist_sha1pair_uint32 name##_map; \
+\
+static void name##_cache_attach(unsigned char *buf, unsigned long len) \
+{ \
+	map_persist_attach_##maptype(&name##_map, buf, len); \
+} \
+\
+static int name##_cache_flush(int fd) \
+{ \
+	return map_persist_flush_##maptype(&name##_map, fd); \
+} \
+\
+static int name##_cache_empty(void) \
+{ \
+	return !name##_map.mem.nr; \
+} \
+\
+static struct metadata_cache name##_cache = { \
+	#name, \
+	name##_cache_attach, \
+	name##_cache_flush, \
+	name##_cache_empty, \
+}; \
+\
+int name##_cache_get(ktype key, vtype *value) \
+{ \
+	metadata_cache_init(&name##_cache); \
+	return map_persist_get_##maptype(&name##_map, key, value); \
+} \
+\
+void name##_cache_set(ktype key, vtype value) \
+{ \
+	metadata_cache_init(&name##_cache); \
+	map_persist_set_##maptype(&name##_map, key, value); \
+}
diff --git a/metadata-cache.h b/metadata-cache.h
new file mode 100644
index 0000000..4354023
--- /dev/null
+++ b/metadata-cache.h
@@ -0,0 +1,8 @@
+#ifndef METADATA_CACHE_H
+#define METADATA_CACHE_H
+
+#define DECLARE_CACHE(name, ktype, vtype) \
+int name##_cache_get(ktype, vtype *); \
+void name##_cache_set(ktype, vtype);
+
+#endif /* METADATA_CACHE_H */
-- 
1.7.12.rc1.7.g7a223a6

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 7/8] implement rename cache
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
                                     ` (5 preceding siblings ...)
  2012-08-04 17:11                   ` [PATCH 6/8] implement metadata cache subsystem Jeff King
@ 2012-08-04 17:12                   ` Jeff King
  2012-08-04 17:14                   ` [PATCH 8/8] diff: optionally use " Jeff King
  7 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:12 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

This just stores pairs of sha1s mapped to their similarity scores.

Signed-off-by: Jeff King <peff@peff.net>
---
 Makefile                     |  5 +++++
 cache.h                      |  5 +++++
 map-sha1pair-uint32-params.h | 12 ++++++++++++
 map-sha1pair-uint32.c        |  8 ++++++++
 map-sha1pair-uint32.h        |  8 ++++++++
 map-sha1pair.h               | 22 ++++++++++++++++++++++
 map-uint32.h                 | 16 ++++++++++++++++
 metadata-cache.c             |  3 +++
 metadata-cache.h             |  2 ++
 9 files changed, 81 insertions(+)
 create mode 100644 map-sha1pair-uint32-params.h
 create mode 100644 map-sha1pair-uint32.c
 create mode 100644 map-sha1pair-uint32.h
 create mode 100644 map-sha1pair.h
 create mode 100644 map-uint32.h

diff --git a/Makefile b/Makefile
index 08ddfa7..f020994 100644
--- a/Makefile
+++ b/Makefile
@@ -637,6 +637,10 @@ LIB_H += map-init.h
 LIB_H += map-object-void-params.h
 LIB_H += map-object-void.h
 LIB_H += map-object.h
+LIB_H += map-sha1pair-uint32-params.h
+LIB_H += map-sha1pair-uint32.h
+LIB_H += map-sha1pair.h
+LIB_H += map-uint32.h
 LIB_H += merge-file.h
 LIB_H += merge-recursive.h
 LIB_H += mergesort.h
@@ -752,6 +756,7 @@ LIB_OBJS += lockfile.o
 LIB_OBJS += log-tree.o
 LIB_OBJS += mailmap.o
 LIB_OBJS += map-object-void.o
+LIB_OBJS += map-sha1pair-uint32.o
 LIB_OBJS += match-trees.o
 LIB_OBJS += merge-file.o
 LIB_OBJS += merge-recursive.o
diff --git a/cache.h b/cache.h
index 67f28b4..23a2f93 100644
--- a/cache.h
+++ b/cache.h
@@ -677,6 +677,11 @@ static inline int is_empty_blob_sha1(const unsigned char *sha1)
 	return !hashcmp(sha1, EMPTY_BLOB_SHA1_BIN);
 }
 
+struct sha1pair {
+	unsigned char one[20];
+	unsigned char two[20];
+};
+
 int git_mkstemp(char *path, size_t n, const char *template);
 
 int git_mkstemps(char *path, size_t n, const char *template, int suffix_len);
diff --git a/map-sha1pair-uint32-params.h b/map-sha1pair-uint32-params.h
new file mode 100644
index 0000000..74c6506
--- /dev/null
+++ b/map-sha1pair-uint32-params.h
@@ -0,0 +1,12 @@
+#define NAME sha1pair_uint32
+#define PERSIST
+#define KEY_TYPE struct sha1pair
+#define PASS_KEY_BY_REF
+#define KEY_EQUAL sha1pair_equal
+#define KEY_SIZE 40
+#define KEY_TO_DISK sha1pair_to_disk
+#define HASH hash_sha1pair
+#define VALUE_TYPE uint32_t
+#define VALUE_SIZE 4
+#define VALUE_TO_DISK uint32_to_disk
+#define DISK_TO_VALUE disk_to_uint32
diff --git a/map-sha1pair-uint32.c b/map-sha1pair-uint32.c
new file mode 100644
index 0000000..93b3c16
--- /dev/null
+++ b/map-sha1pair-uint32.c
@@ -0,0 +1,8 @@
+#include "cache.h"
+#include "map-sha1pair-uint32.h"
+
+#include "map-sha1pair.h"
+#include "map-uint32.h"
+#include "map-sha1pair-uint32-params.h"
+#include "map-impl.h"
+#include "map-done.h"
diff --git a/map-sha1pair-uint32.h b/map-sha1pair-uint32.h
new file mode 100644
index 0000000..6e022fb
--- /dev/null
+++ b/map-sha1pair-uint32.h
@@ -0,0 +1,8 @@
+#ifndef MAP_SHA1PAIR_UINT32_H
+#define MAP_SHA1PAIR_UINT32_H
+
+#include "map-sha1pair-uint32-params.h"
+#include "map-decl.h"
+#include "map-done.h"
+
+#endif /* MAP_SHA1PAIR_UINT32_H */
diff --git a/map-sha1pair.h b/map-sha1pair.h
new file mode 100644
index 0000000..ea7ead8
--- /dev/null
+++ b/map-sha1pair.h
@@ -0,0 +1,22 @@
+#ifndef MAP_SHA1PAIR_H
+#define MAP_SHA1PAIR_H
+
+static inline unsigned int hash_sha1pair(const struct sha1pair *k, unsigned int n)
+{
+	unsigned int hash;
+	memcpy(&hash, k, sizeof(unsigned int));
+	return hash % n;
+}
+
+static inline int sha1pair_equal(const struct sha1pair *a, const struct sha1pair *b)
+{
+	return !hashcmp(a->one, b->one) && !hashcmp(a->two, b->two);
+}
+
+static inline void sha1pair_to_disk(const struct sha1pair *v, unsigned char *out)
+{
+	hashcpy(out, v->one);
+	hashcpy(out+20, v->two);
+}
+
+#endif /* MAP_SHA1PAIR_H */
diff --git a/map-uint32.h b/map-uint32.h
new file mode 100644
index 0000000..ce63dac
--- /dev/null
+++ b/map-uint32.h
@@ -0,0 +1,16 @@
+#ifndef MAP_UINT32_H
+#define MAP_UINT32_H
+
+static inline void uint32_to_disk(uint32_t v, unsigned char *out)
+{
+	v = htonl(v);
+	memcpy(out, &v, 4);
+}
+
+static inline void disk_to_uint32(const unsigned char *in, uint32_t *v)
+{
+	memcpy(v, in, 4);
+	*v = ntohl(*v);
+}
+
+#endif /* MAP_UINT32_H */
diff --git a/metadata-cache.c b/metadata-cache.c
index 94eabfb..d70d116 100644
--- a/metadata-cache.c
+++ b/metadata-cache.c
@@ -1,5 +1,6 @@
 #include "cache.h"
 #include "metadata-cache.h"
+#include "map-sha1pair-uint32.h"
 
 struct metadata_cache {
 	const char *name;
@@ -167,3 +168,5 @@ void name##_cache_set(ktype key, vtype value) \
 	metadata_cache_init(&name##_cache); \
 	map_persist_set_##maptype(&name##_map, key, value); \
 }
+
+IMPLEMENT_CACHE(rename, sha1pair_uint32, const struct sha1pair *, uint32_t)
diff --git a/metadata-cache.h b/metadata-cache.h
index 4354023..f3e6f51 100644
--- a/metadata-cache.h
+++ b/metadata-cache.h
@@ -5,4 +5,6 @@
 int name##_cache_get(ktype, vtype *); \
 void name##_cache_set(ktype, vtype);
 
+DECLARE_CACHE(rename, const struct sha1pair *, uint32_t)
+
 #endif /* METADATA_CACHE_H */
-- 
1.7.12.rc1.7.g7a223a6

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 8/8] diff: optionally use rename cache
  2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
                                     ` (6 preceding siblings ...)
  2012-08-04 17:12                   ` [PATCH 7/8] implement rename cache Jeff King
@ 2012-08-04 17:14                   ` Jeff King
  7 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-04 17:14 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

This speeds up estimate_similarity by caching the similarity
score of pairs of blob sha1s.

Signed-off-by: Jeff King <peff@peff.net>
---
Some interesting things to time with this are:

  - "git log --raw -M" on a repo with a lot of paths or a lot of renames
    (I found on git.git, the speedup was not that impressive)

  - "git log --raw -C -C" on any repo (this speeds up a lot in git.git).

  - "git show -M" on commits with very large blobs

 cache.h           |  1 +
 diff.c            |  6 ++++++
 diffcore-rename.c | 11 ++++++++++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/cache.h b/cache.h
index 23a2f93..7ee1caf 100644
--- a/cache.h
+++ b/cache.h
@@ -1228,6 +1228,7 @@ int add_files_to_cache(const char *prefix, const char **pathspec, int flags);
 
 /* diff.c */
 extern int diff_auto_refresh_index;
+extern int diff_cache_renames;
 
 /* match-trees.c */
 void shift_tree(const unsigned char *, const unsigned char *, unsigned char *, int);
diff --git a/diff.c b/diff.c
index 95706a5..c84e043 100644
--- a/diff.c
+++ b/diff.c
@@ -34,6 +34,7 @@ static int diff_no_prefix;
 static int diff_stat_graph_width;
 static int diff_dirstat_permille_default = 30;
 static struct diff_options default_diff_options;
+int diff_cache_renames;
 
 static char diff_colors[][COLOR_MAXLEN] = {
 	GIT_COLOR_RESET,
@@ -214,6 +215,11 @@ int git_diff_basic_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "diff.cacherenames")) {
+		diff_cache_renames = git_config_bool(var, value);
+		return 0;
+	}
+
 	if (!prefixcmp(var, "submodule."))
 		return parse_submodule_config_option(var, value);
 
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 216a7a4..611e1d3 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -6,6 +6,7 @@
 #include "diffcore.h"
 #include "hash.h"
 #include "progress.h"
+#include "metadata-cache.h"
 
 /* Table of rename/copy destinations */
 
@@ -137,7 +138,8 @@ static int estimate_similarity(struct diff_filespec *src,
 	 */
 	unsigned long max_size, delta_size, base_size, src_copied, literal_added;
 	unsigned long delta_limit;
-	int score;
+	uint32_t score;
+	struct sha1pair pair;
 
 	/* We deal only with regular files.  Symlink renames are handled
 	 * only when they are exact matches --- in other words, no edits
@@ -175,6 +177,11 @@ static int estimate_similarity(struct diff_filespec *src,
 	if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
 		return 0;
 
+	hashcpy(pair.one, src->sha1);
+	hashcpy(pair.two, dst->sha1);
+	if (diff_cache_renames && rename_cache_get(&pair, &score))
+			return score;
+
 	if (!src->cnt_data && diff_populate_filespec(src, 0))
 		return 0;
 	if (!dst->cnt_data && diff_populate_filespec(dst, 0))
@@ -195,6 +202,8 @@ static int estimate_similarity(struct diff_filespec *src,
 		score = 0; /* should not happen */
 	else
 		score = (int)(src_copied * MAX_SCORE / max_size);
+	if (diff_cache_renames)
+		rename_cache_set(&pair, score);
 	return score;
 }
 
-- 
1.7.12.rc1.7.g7a223a6

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 6/8] implement metadata cache subsystem
  2012-08-04 17:11                   ` [PATCH 6/8] implement metadata cache subsystem Jeff King
@ 2012-08-04 22:49                     ` Junio C Hamano
  2012-08-06 20:31                       ` Jeff King
  2012-08-06 20:38                     ` Jeff King
  1 sibling, 1 reply; 35+ messages in thread
From: Junio C Hamano @ 2012-08-04 22:49 UTC (permalink / raw
  To: Jeff King; +Cc: Nguyen Thai Ngoc Duy, git

Jeff King <peff@peff.net> writes:

> There are some calculations that git makes repeatedly, even
> though the results are invariant for a certain input (e.g.,
> the patch-id of a certain commit). We can make a space/time
> tradeoff by caching these on disk between runs.
>
> Even though these may be immutable for a certain commit, we
> don't want to directly store the results in the commit
> objects themselves, for a few reasons:
>
>   1. They are not necessarily used by all algorithms, so
>      bloating the commit object might slow down other
>      algorithms.
>
>   2. Because they can be calculated from the existing
>      commits, they are redundant with the existing
>      information. Thus they are an implementation detail of
>      our current algorithms, and should not be cast in stone
>      by including them in the commit sha1.
>
>   3. They may only be immutable under a certain set of
>      conditions (e.g., which grafts or replace refs we are
>      using). Keeping the storage external means we can
>      invalidate and regenerate the cache whenever those
>      conditions change.

4. The algorithm used to compute such values could improve over
time.  The same advantage argument as 3 applies to this case.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/8] implement generic key/value map
  2012-08-04 17:10                   ` [PATCH 1/8] implement generic key/value map Jeff King
@ 2012-08-04 22:58                     ` Junio C Hamano
  2012-08-06 20:35                       ` Jeff King
  0 siblings, 1 reply; 35+ messages in thread
From: Junio C Hamano @ 2012-08-04 22:58 UTC (permalink / raw
  To: Jeff King; +Cc: Nguyen Thai Ngoc Duy, git

Jeff King <peff@peff.net> writes:

> It is frequently useful to have a fast, generic data
> structure mapping keys to values. We already have something
> like this in the "decorate" API, but it has two downsides:
>
>   1. The key type must always be a "struct object *".
>
>   2. The value type is a void pointer, which means it is
>      inefficient and cumbersome for storing small values.
>      One must either encode their value inside the void
>      pointer, or allocate additional storage for the pointer
>      to point to.
>
> This patch introduces a generic map data structure, mapping
> keys of arbitrary type to values of arbitrary type.

Does the type of keys in a map have to be of the same size, or can a
key of a type with variable size (e.g. struct with a flex member at
the end)?  Same question for the type of values.

Is the type of keys in a map required to have a total order over it,
or is it suffice only to have equality defined?

The latter might matter once we start talking about a huge map that
we may not want to hold in-core.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 6/8] implement metadata cache subsystem
  2012-08-04 22:49                     ` Junio C Hamano
@ 2012-08-06 20:31                       ` Jeff King
  0 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-06 20:31 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Nguyen Thai Ngoc Duy, git

On Sat, Aug 04, 2012 at 03:49:12PM -0700, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > There are some calculations that git makes repeatedly, even
> > though the results are invariant for a certain input (e.g.,
> > the patch-id of a certain commit). We can make a space/time
> > tradeoff by caching these on disk between runs.
> >
> > Even though these may be immutable for a certain commit, we
> > don't want to directly store the results in the commit
> > objects themselves, for a few reasons:
> >
> >   1. They are not necessarily used by all algorithms, so
> >      bloating the commit object might slow down other
> >      algorithms.
> >
> >   2. Because they can be calculated from the existing
> >      commits, they are redundant with the existing
> >      information. Thus they are an implementation detail of
> >      our current algorithms, and should not be cast in stone
> >      by including them in the commit sha1.
> >
> >   3. They may only be immutable under a certain set of
> >      conditions (e.g., which grafts or replace refs we are
> >      using). Keeping the storage external means we can
> >      invalidate and regenerate the cache whenever those
> >      conditions change.
> 
> 4. The algorithm used to compute such values could improve over
> time.  The same advantage argument as 3 applies to this case.

Yeah, agreed. That commit message is a year old, and was written for an
earlier iteration of the patch which was used for caching commit
generations. There's not really a better algorithm there, but your
comment certainly applies to rename similarities.

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/8] implement generic key/value map
  2012-08-04 22:58                     ` Junio C Hamano
@ 2012-08-06 20:35                       ` Jeff King
  0 siblings, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-06 20:35 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Nguyen Thai Ngoc Duy, git

On Sat, Aug 04, 2012 at 03:58:10PM -0700, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > It is frequently useful to have a fast, generic data
> > structure mapping keys to values. We already have something
> > like this in the "decorate" API, but it has two downsides:
> >
> >   1. The key type must always be a "struct object *".
> >
> >   2. The value type is a void pointer, which means it is
> >      inefficient and cumbersome for storing small values.
> >      One must either encode their value inside the void
> >      pointer, or allocate additional storage for the pointer
> >      to point to.
> >
> > This patch introduces a generic map data structure, mapping
> > keys of arbitrary type to values of arbitrary type.
> 
> Does the type of keys in a map have to be of the same size, or can a
> key of a type with variable size (e.g. struct with a flex member at
> the end)?  Same question for the type of values.

Both have to be fixed size, since we represent the hash table using an
array. But there is nothing stopping you from storing a fixed-size
pointer to a variable-sized item (that is what is happening with "struct
object *", anyway; the actual storage is inside a "struct commit",
"struct tree", etc). But then you get the accompanying memory-management
issues (which are easy for "struct object", as our policy is to keep a
global valid-until-the-program-exits store of all objects we ever see).

> Is the type of keys in a map required to have a total order over it,
> or is it suffice only to have equality defined?

No, you only have to define equality. However, for the later patch which
adds a persistent backing store, you need to be able to serialize keys
to a byte representation, which provides an implicit total order (by
sorting the bytes).

> The latter might matter once we start talking about a huge map that
> we may not want to hold in-core.

Right. See patch 5/8.

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 6/8] implement metadata cache subsystem
  2012-08-04 17:11                   ` [PATCH 6/8] implement metadata cache subsystem Jeff King
  2012-08-04 22:49                     ` Junio C Hamano
@ 2012-08-06 20:38                     ` Jeff King
  1 sibling, 0 replies; 35+ messages in thread
From: Jeff King @ 2012-08-06 20:38 UTC (permalink / raw
  To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git

On Sat, Aug 04, 2012 at 01:11:46PM -0400, Jeff King wrote:

> +#define IMPLEMENT_CACHE(name, maptype, ktype, vtype) \
> +static struct map_persist_sha1pair_uint32 name##_map; \

A minor fixup, but this should obviously be "map_persist_##maptype". It
doesn't matter for this series (since we only instantiate one cache, and
it uses a sha1pair_uint32 map), but obviously this was meant to be
generic...

-Peff

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2012-08-06 20:38 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-07-31 14:15 [WIP PATCH] Manual rename correction Nguyen Thai Ngoc Duy
2012-07-31 16:32 ` Junio C Hamano
2012-07-31 19:23   ` Jeff King
2012-07-31 20:20     ` Junio C Hamano
2012-08-01  0:42       ` Jeff King
2012-08-01  6:01         ` Junio C Hamano
2012-08-01 21:54           ` Jeff King
2012-08-01 22:10             ` Junio C Hamano
2012-08-02 22:37               ` Jeff King
2012-08-02 22:51                 ` Junio C Hamano
2012-08-02 22:58                   ` Jeff King
2012-08-02  5:33             ` Junio C Hamano
2012-08-01  1:10     ` Nguyen Thai Ngoc Duy
2012-08-01  2:01       ` Jeff King
2012-08-01  4:36         ` Nguyen Thai Ngoc Duy
2012-08-01  6:09           ` Junio C Hamano
2012-08-01  6:34             ` Nguyen Thai Ngoc Duy
2012-08-01 21:32               ` Jeff King
2012-08-01 21:27           ` Jeff King
2012-08-02 12:08             ` Nguyen Thai Ngoc Duy
2012-08-02 22:41               ` Jeff King
2012-08-04 17:09                 ` [PATCH 0/8] caching rename results Jeff King
2012-08-04 17:10                   ` [PATCH 1/8] implement generic key/value map Jeff King
2012-08-04 22:58                     ` Junio C Hamano
2012-08-06 20:35                       ` Jeff King
2012-08-04 17:10                   ` [PATCH 2/8] map: add helper functions for objects as keys Jeff King
2012-08-04 17:11                   ` [PATCH 3/8] fast-export: use object to uint32 map instead of "decorate" Jeff King
2012-08-04 17:11                   ` [PATCH 4/8] decorate: use "map" for the underlying implementation Jeff King
2012-08-04 17:11                   ` [PATCH 5/8] map: implement persistent maps Jeff King
2012-08-04 17:11                   ` [PATCH 6/8] implement metadata cache subsystem Jeff King
2012-08-04 22:49                     ` Junio C Hamano
2012-08-06 20:31                       ` Jeff King
2012-08-06 20:38                     ` Jeff King
2012-08-04 17:12                   ` [PATCH 7/8] implement rename cache Jeff King
2012-08-04 17:14                   ` [PATCH 8/8] diff: optionally use " Jeff King

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).