[PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry

git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed

* [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
@ 2008-06-02  3:54 Geoffrey Irving
  2008-06-02  6:13 ` Johannes Schindelin
  0 siblings, 1 reply; 16+ messages in thread
From: Geoffrey Irving @ 2008-06-02  3:54 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git@vger.kernel.org

The dominant cost of git-cherry is the computation of patch-ids for each
relevant commit.  Once computed, these pairs are now stored in a hash table
in $GIT_DIR/patch-id-cache to speed up repeated invocations.

The basic structure of patch-id-cache.c was cannibalized from Johannes
Schindelin's notes-index structure, though most of the code was rewritten.
The hash table is still kept strictly sorted by commit, but the entire table
is now read into memory.  See commit d56dbe8cb56ce9b4697d1f1c2425e2e133a932a5
for the original code.
---
 Makefile         |    1 +
 patch-id-cache.c |  180 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 patch-ids.c      |   18 +++++-
 3 files changed, 198 insertions(+), 1 deletions(-)
 create mode 100644 patch-id-cache.c

diff --git a/Makefile b/Makefile
index cce5a6e..3a5396d 100644
--- a/Makefile
+++ b/Makefile
@@ -435,6 +435,7 @@ LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
+LIB_OBJS += patch-id-cache.o
 LIB_OBJS += path-list.o
 LIB_OBJS += path.o
 LIB_OBJS += pkt-line.o
diff --git a/patch-id-cache.c b/patch-id-cache.c
new file mode 100644
index 0000000..a2b32c8
--- /dev/null
+++ b/patch-id-cache.c
@@ -0,0 +1,180 @@
+#include "cache.h"
+
+/*
+ * The patch-id-cache is a file consisting of a header, and a hash map
+ * containing commit hash -> patch-id hash pairs.
+ *
+ * Currently the entire hash table is read into memory.  Entries are
+ * kept strictly sorted rather than wrapping around, so the actual
+ * size of the table is slightly larger than the size of the key space.
+ *
+ * The disadvantages of a hash is its loose packing.  In order to operate
+ * reasonably well, it needs a size roughly double the number of entries.
+ * It also has a worst runtime linear in the number of entries.
+ *
+ * The advantage is an expected constant lookup time.
+ *
+ * The performance of a hash map depends highly on a good hashing
+ * algorithm, to avoid collisions.  Lucky us!  SHA-1 is a pretty good
+ * hashing algorithm.
+ *
+ * There is another advantage to hash maps: with not much effort, an
+ * incremental update can be performed in place, relying on O_TRUNC to
+ * detect interruptions.  This operation has an expected constant runtime.
+ * Incremental update is not yet implemented.
+ */
+
+struct patch_id_entry {
+	unsigned char commit_sha1[20];
+	unsigned char patch_id_sha1[20];
+};
+
+struct patch_id_cache {
+	char signature[4]; /* HASH */
+
+	off_t count, key_size, actual_size; /* for hash */
+	struct patch_id_entry entries[0];
+};
+
+static struct patch_id_cache *cache = 0;
+static size_t written_count = 0;
+
+const char *signature = "HASH";
+
+static size_t get_hash_size(size_t actual_size)
+{
+	return sizeof(struct patch_id_cache) + actual_size * sizeof(struct
patch_id_entry);
+}
+
+static void read_patch_id_cache()
+{
+	int fd = open(git_path("patch-id-cache"), O_RDONLY);
+
+	if (fd < 0) { // allocate a new cache
+		size_t size = 64;
+		cache = xcalloc(get_hash_size(size), 1);
+		cache->key_size = cache->actual_size = size;
+		return;
+	}
+
+	struct patch_id_cache header;
+	size_t len = read_in_full(fd, &header, sizeof(header));
+	if (len != sizeof(header))
+		die("cannot read patch-id-cache header");
+	if (header.key_size & (header.key_size-1))
+		die("patch-id-cache key size %lld is not a power of two", (long
long)header.key_size);
+
+	free(cache);
+	cache = xmalloc(get_hash_size(header.actual_size));
+	*cache = header;
+	size_t entry_size = header.actual_size * sizeof(struct patch_id_entry);
+	len = read_in_full(fd, cache->entries, entry_size);
+	if (len != entry_size)
+		die("cannot read patch-id-cache entries");
+	written_count = cache->count;
+
+	// It's impossible to check whether the cache is actually accurate,
+	// so assume it's correct if we get to this point.
+}
+
+void write_patch_id_cache()
+{
+	if (!cache || cache->count == written_count)
+		return;
+
+	struct lock_file update_lock;
+	int fd = hold_lock_file_for_update(&update_lock,
+			git_path("patch-id-cache"), 0);
+
+	if (fd < 0) {
+		error("could not construct patch-id-cache");
+		return;
+	}
+
+	memcpy(cache->signature, signature, 4);
+	if (write_in_full(fd, cache, get_hash_size(cache->actual_size)) < 0) {
+		error("could not write patch-id-cache");
+		return;
+	}
+
+	if (commit_lock_file(&update_lock))
+		error("could not finalize patch-id-cache");
+	else {
+		written_count = cache->count;
+		printf("patch-id-cache count %lld\n", (long long)written_count);
+	}
+}
+
+static unsigned long get_hash_index(const unsigned char *sha1)
+{
+	return (ntohl(*(unsigned long *)sha1) & (cache->key_size-1));
+}
+
+int get_cached_commit_patch_id(const unsigned char *commit_sha1,
+		unsigned char *patch_id_sha1)
+{
+	if (!cache)
+		read_patch_id_cache();
+
+	size_t i = get_hash_index(commit_sha1);
+	for (; i < cache->actual_size; i++) {
+		int cmp = hashcmp(commit_sha1, cache->entries[i].commit_sha1);
+		if (!cmp) {
+			hashcpy(patch_id_sha1, cache->entries[i].patch_id_sha1);
+			return 0;
+		} else if (cmp > 0 || is_null_sha1(cache->entries[i].commit_sha1))
+			break;
+	}
+	return -1;
+}
+
+void cache_commit_patch_id(const unsigned char *commit_sha1,
+		const unsigned char *patch_id_sha1)
+{
+	if (4*cache->count > 3*cache->key_size) { // resize
+		// allocate cache with twice the size
+		struct patch_id_cache *old_cache = cache;
+		size_t new_key_size = 2*cache->key_size,
+			new_actual_size = cache->key_size + cache->actual_size;
+		cache = xcalloc(get_hash_size(new_actual_size), 1);
+		cache->key_size = new_key_size;
+		cache->actual_size = new_actual_size;
+
+		// reinsert all entries
+		size_t i;
+	 	for (i = 0; i < old_cache->actual_size; i++)
+			cache_commit_patch_id(old_cache->entries[i].commit_sha1,
+				old_cache->entries[i].patch_id_sha1);
+	}
+
+	struct patch_id_entry entry;
+	hashcpy(entry.commit_sha1, commit_sha1);
+	hashcpy(entry.patch_id_sha1, patch_id_sha1);
+
+	size_t i = get_hash_index(commit_sha1);
+	for (;;) {
+		if (i == cache->actual_size) {
+			cache->actual_size++; // adding one suffices for expected O(1)
+			cache = xrealloc(cache, get_hash_size(cache->actual_size));
+			cache->entries[cache->actual_size-1] = entry;
+			return;
+		}
+		int cmp = hashcmp(entry.commit_sha1, cache->entries[i].commit_sha1);
+		if (!cmp) {
+			if (hashcmp(entry.patch_id_sha1, cache->entries[i].patch_id_sha1))
+				die("found mismatched entry in patch-id-cache");
+			return;
+		} else if (cmp < 0) {
+			i++;
+		} else if (is_null_sha1(cache->entries[i].commit_sha1)) {
+			cache->entries[i] = entry;
+			cache->count++;
+			return;
+		} else {
+			struct patch_id_entry tmp = cache->entries[i];
+			cache->entries[i] = entry;
+			entry = tmp;
+			i++;
+		}
+	}
+}
diff --git a/patch-ids.c b/patch-ids.c
index 3be5d31..91fabeb 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -3,16 +3,30 @@
 #include "commit.h"
 #include "patch-ids.h"

+extern void write_patch_id_cache();
+extern int get_cached_commit_patch_id(const unsigned char
*commit_sha1, unsigned char *patch_id_sha1);
+extern void cache_commit_patch_id(const unsigned char *commit_sha1,
const unsigned char *patch_id_sha1);
+
 static int commit_patch_id(struct commit *commit, struct diff_options *options,
 		    unsigned char *sha1)
 {
+	// pull patch-id out of the cache if possible
+	if (!get_cached_commit_patch_id(commit->object.sha1, sha1))
+		return 0;
+
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
 		               commit->object.sha1, "", options);
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	int ret = diff_flush_patch_id(options, sha1);
+	if (ret)
+		return ret;
+
+	// record commit, patch-id pair in cache
+	cache_commit_patch_id(commit->object.sha1, sha1);
+	return 0;
 }

 static uint32_t take2(const unsigned char *id)
@@ -136,6 +150,8 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	write_patch_id_cache();
 	return 0;
 }

-- 
1.5.4.5

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02  3:54 [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry Geoffrey Irving
@ 2008-06-02  6:13 ` Johannes Schindelin
  2008-06-02  6:42   ` Jeff King
  2008-06-02 14:50   ` Geoffrey Irving
  0 siblings, 2 replies; 16+ messages in thread
From: Johannes Schindelin @ 2008-06-02  6:13 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: git@vger.kernel.org

Hi,

On Sun, 1 Jun 2008, Geoffrey Irving wrote:

> The dominant cost of git-cherry is the computation of patch-ids for each 
> relevant commit.  Once computed, these pairs are now stored in a hash 
> table in $GIT_DIR/patch-id-cache to speed up repeated invocations.
> 
> The basic structure of patch-id-cache.c was cannibalized from Johannes 
> Schindelin's notes-index structure, though most of the code was 
> rewritten. The hash table is still kept strictly sorted by commit, but 
> the entire table is now read into memory.

I do not think that this "read-the-entire-table-into-memory" paradigm is a 
wise choice. mmap()ing, I would have understood, but reading a potentially 
pretty large table into memory?

> See commit d56dbe8cb56ce9b4697d1f1c2425e2e133a932a5 for the original 
> code.

This is not in any official git.git repository.  And my branches are prone 
to be rebased.  So this is not a good reference.  The mailing list, 
however, would be one.

> diff --git a/Makefile b/Makefile
> index cce5a6e..3a5396d 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -435,6 +435,7 @@ LIB_OBJS += pager.o
>  LIB_OBJS += parse-options.o
>  LIB_OBJS += patch-delta.o
>  LIB_OBJS += patch-ids.o
> +LIB_OBJS += patch-id-cache.o

If all you do is a hashmap from SHA-1 to SHA-1, then I think 
"patch-id-cache" is a misnomer for that file and structure.

Let's not repeat the same naming mistakes as we have for path_list and 
decoration.

> +static void read_patch_id_cache()
> +{
> +	int fd = open(git_path("patch-id-cache"), O_RDONLY);
> +
> +	if (fd < 0) { // allocate a new cache
> +		size_t size = 64;
> +		cache = xcalloc(get_hash_size(size), 1);
> +		cache->key_size = cache->actual_size = size;
> +		return;
> +	}
> +
> +	struct patch_id_cache header;
> +	size_t len = read_in_full(fd, &header, sizeof(header));
> +	if (len != sizeof(header))
> +		die("cannot read patch-id-cache header");
> +	if (header.key_size & (header.key_size-1))
> +		die("patch-id-cache key size %lld is not a power of two", (long
> long)header.key_size);

Very long line.

Why does the key_size have to be a power of two?

> +	free(cache);
> +	cache = xmalloc(get_hash_size(header.actual_size));
> +	*cache = header;

This free() kind of defies the xmalloc in the if() clause above.

Besides, those three lines are a pretty awkward way to avoid writing 
xrealloc(), don't you think?

> +	size_t entry_size = header.actual_size * sizeof(struct patch_id_entry);

Declaration after statement.

> +	len = read_in_full(fd, cache->entries, entry_size);
> +	if (len != entry_size)
> +		die("cannot read patch-id-cache entries");
> +	written_count = cache->count;

Huh?  It is not written_count.  Besides, why do you use a global here?  
(Yes, I know, you probably just copied it from my code, but you could 
clean it up in the process ;-)

> +	// It's impossible to check whether the cache is actually accurate,

C++ style comments.

> +void write_patch_id_cache()
> +{
> +	if (!cache || cache->count == written_count)
> +		return;

Does that mean that the patch_id_cache is not updated when the number 
of commits stays the same after committing, branch -d and gc?

 > +
> +	struct lock_file update_lock;
> +	int fd = hold_lock_file_for_update(&update_lock,
> +			git_path("patch-id-cache"), 0);
> +
> +	if (fd < 0) {
> +		error("could not construct patch-id-cache");
> +		return;
> +	}

Not good.  Printing an error, but not returning any indicator of it.

> +int get_cached_commit_patch_id(const unsigned char *commit_sha1,
> +		unsigned char *patch_id_sha1)

Again, I do not think it is wise to name this function so focused on this 
particular purpose, when its use is clearly not limited to patch ids.

> +void cache_commit_patch_id(const unsigned char *commit_sha1,
> +		const unsigned char *patch_id_sha1)
> +{
> +	if (4*cache->count > 3*cache->key_size) { // resize

C++ comment.

> +		struct patch_id_cache *old_cache = cache;
> +		size_t new_key_size = 2*cache->key_size,
> +			new_actual_size = cache->key_size + cache->actual_size;
> +		cache = xcalloc(get_hash_size(new_actual_size), 1);
> +		cache->key_size = new_key_size;
> +		cache->actual_size = new_actual_size;
> +
> +		// reinsert all entries
> +		size_t i;
> +	 	for (i = 0; i < old_cache->actual_size; i++)
> +			cache_commit_patch_id(old_cache->entries[i].commit_sha1,
> +				old_cache->entries[i].patch_id_sha1);
> +	}

Does this not just _beg_ to live in an own function ("grow_hash")?

> +	struct patch_id_entry entry;

Declaration after statement.

> +	hashcpy(entry.commit_sha1, commit_sha1);
> +	hashcpy(entry.patch_id_sha1, patch_id_sha1);

It would be more elegant to copy the SHA-1s _after_ finding where to write 
them.

> +	size_t i = get_hash_index(commit_sha1);

Declaration after statement.

> +	for (;;) {
> +		if (i == cache->actual_size) {
> +			cache->actual_size++; // adding one suffices for expected O(1)

Long line.  C++ comment.  "one suffices"?

> +			cache = xrealloc(cache, get_hash_size(cache->actual_size));
> +			cache->entries[cache->actual_size-1] = entry;
> +			return;
> +		}
> +		int cmp = hashcmp(entry.commit_sha1, cache->entries[i].commit_sha1);

Declar... ah, well, suffice to say that you should read the 
CodingGuidelines, and try to fix up all the offending sites in your code.

> +		if (!cmp) {
> +			if (hashcmp(entry.patch_id_sha1, cache->entries[i].patch_id_sha1))
> +				die("found mismatched entry in patch-id-cache");

I wonder if that potentially expensive operation should not rather be 
wrapped in an assert() call (since I recently learnt that Git's source 
code has more than one instance of assert()).

> +			return;
> +		} else if (cmp < 0) {
> +			i++;
> +		} else if (is_null_sha1(cache->entries[i].commit_sha1)) {
> +			cache->entries[i] = entry;
> +			cache->count++;
> +			return;
> +		} else {
> +			struct patch_id_entry tmp = cache->entries[i];
> +			cache->entries[i] = entry;
> +			entry = tmp;
> +			i++;
> +		}

You seem to be wanting to implement a "sorted hashmap"... this is 
non-standard, and AFAICT affects performance negatively.

> diff --git a/patch-ids.c b/patch-ids.c
> index 3be5d31..91fabeb 100644
> --- a/patch-ids.c
> +++ b/patch-ids.c
> @@ -3,16 +3,30 @@
>  #include "commit.h"
>  #include "patch-ids.h"
> 
> +extern void write_patch_id_cache();
> +extern int get_cached_commit_patch_id(const unsigned char
> *commit_sha1, unsigned char *patch_id_sha1);
> +extern void cache_commit_patch_id(const unsigned char *commit_sha1,
> const unsigned char *patch_id_sha1);
> +

Uh oh.  This is not good.  If you do not have declarations like these in 
header files (that are included from the implementing .c file, of course), 
it is all too easy to have the signatures of the implementation and the 
declaration diverge.

So move them into an own header file, where they belong.  IIRC I did this 
in the notes code, so why did you choose not to imitate that?

>  static uint32_t take2(const unsigned char *id)
> @@ -136,6 +150,8 @@ int free_patch_ids(struct patch_ids *ids)
>  		next = patches->next;
>  		free(patches);
>  	}
> +
> +	write_patch_id_cache();
>  	return 0;

That's cute.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02  6:13 ` Johannes Schindelin
@ 2008-06-02  6:42   ` Jeff King
  2008-06-02 14:35     ` Geoffrey Irving
  2008-06-02 14:50   ` Geoffrey Irving
  1 sibling, 1 reply; 16+ messages in thread
From: Jeff King @ 2008-06-02  6:42 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Geoffrey Irving, git@vger.kernel.org

On Mon, Jun 02, 2008 at 07:13:14AM +0100, Johannes Schindelin wrote:

> I do not think that this "read-the-entire-table-into-memory" paradigm is a 
> wise choice. mmap()ing, I would have understood, but reading a potentially 
> pretty large table into memory?

When I was just a git-youth, I wrote a fast mmap-based cache for storing
SHA1 pairs. It might give some direction. You should be able to find it
here:

  http://mid.gmane.org/20060629035849.GA30749@coredump.intra.peff.net

It mmaps and binary searches a sorted list. New entries are added to an
in-memory list, and then at the end of a run, the two sorted lists are
merged to create the new on-disk version.

-Peff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02  6:42   ` Jeff King
@ 2008-06-02 14:35     ` Geoffrey Irving
  2008-06-02 15:37       ` Johannes Schindelin
  0 siblings, 1 reply; 16+ messages in thread
From: Geoffrey Irving @ 2008-06-02 14:35 UTC (permalink / raw)
  To: Jeff King; +Cc: Johannes Schindelin, git@vger.kernel.org

On Sun, Jun 1, 2008 at 11:42 PM, Jeff King <peff@peff.net> wrote:
> On Mon, Jun 02, 2008 at 07:13:14AM +0100, Johannes Schindelin wrote:
>
>> I do not think that this "read-the-entire-table-into-memory" paradigm is a
>> wise choice. mmap()ing, I would have understood, but reading a potentially
>> pretty large table into memory?
>
> When I was just a git-youth, I wrote a fast mmap-based cache for storing
> SHA1 pairs. It might give some direction. You should be able to find it
> here:
>
>  http://mid.gmane.org/20060629035849.GA30749@coredump.intra.peff.net
>
> It mmaps and binary searches a sorted list. New entries are added to an
> in-memory list, and then at the end of a run, the two sorted lists are
> merged to create the new on-disk version.

I don't need sorting (and neither did you), so I think a hash table is
better (O(1) instead of O(log n), and we don't even need to compute
hash keys.  I'll leave it up to you and Dscho (or anyone else who
cares to chime in) which one you think I should do.

Geoffrey

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02  6:13 ` Johannes Schindelin
  2008-06-02  6:42   ` Jeff King
@ 2008-06-02 14:50   ` Geoffrey Irving
  2008-06-02 15:52     ` Johannes Schindelin
  2008-06-02 17:23     ` Geoffrey Irving
  1 sibling, 2 replies; 16+ messages in thread
From: Geoffrey Irving @ 2008-06-02 14:50 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git@vger.kernel.org

On Sun, Jun 1, 2008 at 11:13 PM, Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
> Hi,
>
> On Sun, 1 Jun 2008, Geoffrey Irving wrote:
>
>> The dominant cost of git-cherry is the computation of patch-ids for each
>> relevant commit.  Once computed, these pairs are now stored in a hash
>> table in $GIT_DIR/patch-id-cache to speed up repeated invocations.
>>
>> The basic structure of patch-id-cache.c was cannibalized from Johannes
>> Schindelin's notes-index structure, though most of the code was
>> rewritten. The hash table is still kept strictly sorted by commit, but
>> the entire table is now read into memory.
>
> I do not think that this "read-the-entire-table-into-memory" paradigm is a
> wise choice. mmap()ing, I would have understood, but reading a potentially
> pretty large table into memory?

I'll switch to mmapping.

>> See commit d56dbe8cb56ce9b4697d1f1c2425e2e133a932a5 for the original
>> code.
>
> This is not in any official git.git repository.  And my branches are prone
> to be rebased.  So this is not a good reference.  The mailing list,
> however, would be one.

That was added only because I knew it wouldn't be the final patch, and
that it'd be useful to you.

>> diff --git a/Makefile b/Makefile
>> index cce5a6e..3a5396d 100644
>> --- a/Makefile
>> +++ b/Makefile
>> @@ -435,6 +435,7 @@ LIB_OBJS += pager.o
>>  LIB_OBJS += parse-options.o
>>  LIB_OBJS += patch-delta.o
>>  LIB_OBJS += patch-ids.o
>> +LIB_OBJS += patch-id-cache.o
>
> If all you do is a hashmap from SHA-1 to SHA-1, then I think
> "patch-id-cache" is a misnomer for that file and structure.
>
> Let's not repeat the same naming mistakes as we have for path_list and
> decoration.

Is persistent_sha1_map a good name?

>> +static void read_patch_id_cache()
>> +{
>> +     int fd = open(git_path("patch-id-cache"), O_RDONLY);
>> +
>> +     if (fd < 0) { // allocate a new cache
>> +             size_t size = 64;
>> +             cache = xcalloc(get_hash_size(size), 1);
>> +             cache->key_size = cache->actual_size = size;
>> +             return;
>> +     }
>> +
>> +     struct patch_id_cache header;
>> +     size_t len = read_in_full(fd, &header, sizeof(header));
>> +     if (len != sizeof(header))
>> +             die("cannot read patch-id-cache header");
>> +     if (header.key_size & (header.key_size-1))
>> +             die("patch-id-cache key size %lld is not a power of two", (long
>> long)header.key_size);
>
> Very long line.
>
> Why does the key_size have to be a power of two?

It doesn't, but the current code creates it that way, so that was a
sanity check.  Well, actually the current code uses & instead of %, so
it does require it unless I switch back to %.

>> +     free(cache);
>> +     cache = xmalloc(get_hash_size(header.actual_size));
>> +     *cache = header;
>
> This free() kind of defies the xmalloc in the if() clause above.
>
> Besides, those three lines are a pretty awkward way to avoid writing
> xrealloc(), don't you think?
>
>> +     size_t entry_size = header.actual_size * sizeof(struct patch_id_entry);
>
> Declaration after statement.
>
>> +     len = read_in_full(fd, cache->entries, entry_size);
>> +     if (len != entry_size)
>> +             die("cannot read patch-id-cache entries");
>> +     written_count = cache->count;
>
> Huh?  It is not written_count.  Besides, why do you use a global here?
> (Yes, I know, you probably just copied it from my code, but you could
> clean it up in the process ;-)
>
>> +     // It's impossible to check whether the cache is actually accurate,
>
> C++ style comments.
>
>> +void write_patch_id_cache()
>> +{
>> +     if (!cache || cache->count == written_count)
>> +             return;
>
> Does that mean that the patch_id_cache is not updated when the number
> of commits stays the same after committing, branch -d and gc?

The patch_id_cache is only updated when it changes.  Since entries are
immutable and are never deleted, all changes increase the count.

>> +     struct lock_file update_lock;
>> +     int fd = hold_lock_file_for_update(&update_lock,
>> +                     git_path("patch-id-cache"), 0);
>> +
>> +     if (fd < 0) {
>> +             error("could not construct patch-id-cache");
>> +             return;
>> +     }
>
> Not good.  Printing an error, but not returning any indicator of it.
>
>> +int get_cached_commit_patch_id(const unsigned char *commit_sha1,
>> +             unsigned char *patch_id_sha1)
>
> Again, I do not think it is wise to name this function so focused on this
> particular purpose, when its use is clearly not limited to patch ids.
>
>> +void cache_commit_patch_id(const unsigned char *commit_sha1,
>> +             const unsigned char *patch_id_sha1)
>> +{
>> +     if (4*cache->count > 3*cache->key_size) { // resize
>
> C++ comment.
>
>> +             struct patch_id_cache *old_cache = cache;
>> +             size_t new_key_size = 2*cache->key_size,
>> +                     new_actual_size = cache->key_size + cache->actual_size;
>> +             cache = xcalloc(get_hash_size(new_actual_size), 1);
>> +             cache->key_size = new_key_size;
>> +             cache->actual_size = new_actual_size;
>> +
>> +             // reinsert all entries
>> +             size_t i;
>> +             for (i = 0; i < old_cache->actual_size; i++)
>> +                     cache_commit_patch_id(old_cache->entries[i].commit_sha1,
>> +                             old_cache->entries[i].patch_id_sha1);
>> +     }
>
> Does this not just _beg_ to live in an own function ("grow_hash")?
>
>> +     struct patch_id_entry entry;
>
> Declaration after statement.
>
>> +     hashcpy(entry.commit_sha1, commit_sha1);
>> +     hashcpy(entry.patch_id_sha1, patch_id_sha1);
>
> It would be more elegant to copy the SHA-1s _after_ finding where to write
> them.

Alas, that would break the elegance of my loop, since the loop swaps
in new entries to keep the table sorted.  I can remove the sorting if
you want: I only left it in there to be more similar to your code.

>> +     size_t i = get_hash_index(commit_sha1);
>
> Declaration after statement.
>
>> +     for (;;) {
>> +             if (i == cache->actual_size) {
>> +                     cache->actual_size++; // adding one suffices for expected O(1)
>
> Long line.  C++ comment.  "one suffices"?
>
>> +                     cache = xrealloc(cache, get_hash_size(cache->actual_size));
>> +                     cache->entries[cache->actual_size-1] = entry;
>> +                     return;
>> +             }
>> +             int cmp = hashcmp(entry.commit_sha1, cache->entries[i].commit_sha1);
>
> Declar... ah, well, suffice to say that you should read the
> CodingGuidelines, and try to fix up all the offending sites in your code.

I'd be happy to do that, but I don't see mention of either C++
comments or declarations after statements in the CodingGuidelines.

>> +             if (!cmp) {
>> +                     if (hashcmp(entry.patch_id_sha1, cache->entries[i].patch_id_sha1))
>> +                             die("found mismatched entry in patch-id-cache");
>
> I wonder if that potentially expensive operation should not rather be
> wrapped in an assert() call (since I recently learnt that Git's source
> code has more than one instance of assert()).

A 20 byte comparison doesn't seem potentially expensive to me.

>> +                     return;
>> +             } else if (cmp < 0) {
>> +                     i++;
>> +             } else if (is_null_sha1(cache->entries[i].commit_sha1)) {
>> +                     cache->entries[i] = entry;
>> +                     cache->count++;
>> +                     return;
>> +             } else {
>> +                     struct patch_id_entry tmp = cache->entries[i];
>> +                     cache->entries[i] = entry;
>> +                     entry = tmp;
>> +                     i++;
>> +             }
>
> You seem to be wanting to implement a "sorted hashmap"... this is
> non-standard, and AFAICT affects performance negatively.

I'll strip it out.  I actually stripped it out initially, and then
added it back to mirror your code.

>> diff --git a/patch-ids.c b/patch-ids.c
>> index 3be5d31..91fabeb 100644
>> --- a/patch-ids.c
>> +++ b/patch-ids.c
>> @@ -3,16 +3,30 @@
>>  #include "commit.h"
>>  #include "patch-ids.h"
>>
>> +extern void write_patch_id_cache();
>> +extern int get_cached_commit_patch_id(const unsigned char
>> *commit_sha1, unsigned char *patch_id_sha1);
>> +extern void cache_commit_patch_id(const unsigned char *commit_sha1,
>> const unsigned char *patch_id_sha1);
>> +
>
> Uh oh.  This is not good.  If you do not have declarations like these in
> header files (that are included from the implementing .c file, of course),
> it is all too easy to have the signatures of the implementation and the
> declaration diverge.
>
> So move them into an own header file, where they belong.  IIRC I did this
> in the notes code, so why did you choose not to imitate that?

No good reason.

>>  static uint32_t take2(const unsigned char *id)
>> @@ -136,6 +150,8 @@ int free_patch_ids(struct patch_ids *ids)
>>               next = patches->next;
>>               free(patches);
>>       }
>> +
>> +     write_patch_id_cache();
>>       return 0;
>
> That's cute.

Thanks (assuming cute means good).

I'll wait to actually make any of these changes until you and Jeff
decide whether I should stick to hashes or use binary search.  It
seems sad not to use hashes for a map when we get the best hash keys
in the world for free, so I'd prefer not switching.

Geoffrey

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 14:35     ` Geoffrey Irving
@ 2008-06-02 15:37       ` Johannes Schindelin
  2008-06-02 15:49         ` Geoffrey Irving
  0 siblings, 1 reply; 16+ messages in thread
From: Johannes Schindelin @ 2008-06-02 15:37 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Jeff King, git@vger.kernel.org

Hi,

On Mon, 2 Jun 2008, Geoffrey Irving wrote:

> On Sun, Jun 1, 2008 at 11:42 PM, Jeff King <peff@peff.net> wrote:
> > On Mon, Jun 02, 2008 at 07:13:14AM +0100, Johannes Schindelin wrote:
> >
> >> I do not think that this "read-the-entire-table-into-memory" paradigm 
> >> is a wise choice. mmap()ing, I would have understood, but reading a 
> >> potentially pretty large table into memory?
> >
> > When I was just a git-youth, I wrote a fast mmap-based cache for 
> > storing SHA1 pairs. It might give some direction. You should be able 
> > to find it here:
> >
> >  http://mid.gmane.org/20060629035849.GA30749@coredump.intra.peff.net
> >
> > It mmaps and binary searches a sorted list. New entries are added to 
> > an in-memory list, and then at the end of a run, the two sorted lists 
> > are merged to create the new on-disk version.
> 
> I don't need sorting (and neither did you), so I think a hash table is 
> better (O(1) instead of O(log n), and we don't even need to compute hash 
> keys.  I'll leave it up to you and Dscho (or anyone else who cares to 
> chime in) which one you think I should do.

My tests suggested that the lookup time advantage of hashes makes them a 
more appropriate choice than sorted lists, if you look up often, but add 
rarely.

Another issue that just hit me: this cache is append-only, so if it grows 
too large, you have no other option than to scratch and recreate it.  
Maybe this needs porcelain support, too?  (git gc?)

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 15:37       ` Johannes Schindelin
@ 2008-06-02 15:49         ` Geoffrey Irving
  2008-06-02 15:56           ` Shawn O. Pearce
  2008-06-02 16:18           ` Johannes Schindelin
  0 siblings, 2 replies; 16+ messages in thread
From: Geoffrey Irving @ 2008-06-02 15:49 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Jeff King, git@vger.kernel.org

On Mon, Jun 2, 2008 at 8:37 AM, Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
> Hi,
>
> On Mon, 2 Jun 2008, Geoffrey Irving wrote:
>
>> On Sun, Jun 1, 2008 at 11:42 PM, Jeff King <peff@peff.net> wrote:
>> > On Mon, Jun 02, 2008 at 07:13:14AM +0100, Johannes Schindelin wrote:
>> >
>> >> I do not think that this "read-the-entire-table-into-memory" paradigm
>> >> is a wise choice. mmap()ing, I would have understood, but reading a
>> >> potentially pretty large table into memory?
>> >
>> > When I was just a git-youth, I wrote a fast mmap-based cache for
>> > storing SHA1 pairs. It might give some direction. You should be able
>> > to find it here:
>> >
>> >  http://mid.gmane.org/20060629035849.GA30749@coredump.intra.peff.net
>> >
>> > It mmaps and binary searches a sorted list. New entries are added to
>> > an in-memory list, and then at the end of a run, the two sorted lists
>> > are merged to create the new on-disk version.
>>
>> I don't need sorting (and neither did you), so I think a hash table is
>> better (O(1) instead of O(log n), and we don't even need to compute hash
>> keys.  I'll leave it up to you and Dscho (or anyone else who cares to
>> chime in) which one you think I should do.
>
> My tests suggested that the lookup time advantage of hashes makes them a
> more appropriate choice than sorted lists, if you look up often, but add
> rarely.
>
> Another issue that just hit me: this cache is append-only, so if it grows
> too large, you have no other option than to scratch and recreate it.
> Maybe this needs porcelain support, too?  (git gc?)

If so, the correct operation is to go through the hash and remove
entries that refer to commits that no longer exist.  I can add this if
you want.  Hopefully somewhere along the way git-gc constructs an easy
to traverse list of extant commits, and this will be straightforward.

Geoffrey

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 14:50   ` Geoffrey Irving
@ 2008-06-02 15:52     ` Johannes Schindelin
  2008-06-02 17:23     ` Geoffrey Irving
  1 sibling, 0 replies; 16+ messages in thread
From: Johannes Schindelin @ 2008-06-02 15:52 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: git@vger.kernel.org

Hi,

On Mon, 2 Jun 2008, Geoffrey Irving wrote:

> On Sun, Jun 1, 2008 at 11:13 PM, Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
>
> > On Sun, 1 Jun 2008, Geoffrey Irving wrote:
> >
> >> See commit d56dbe8cb56ce9b4697d1f1c2425e2e133a932a5 for the original 
> >> code.
> >
> > This is not in any official git.git repository.  And my branches are 
> > prone to be rebased.  So this is not a good reference.  The mailing 
> > list, however, would be one.
>
> That was added only because I knew it wouldn't be the final patch, and 
> that it'd be useful to you.

Ah, then you might want to say "[PATCH/RFC]" in the subject.  I thought 
you meant this for inclusion.

> >> diff --git a/Makefile b/Makefile
> >> index cce5a6e..3a5396d 100644
> >> --- a/Makefile
> >> +++ b/Makefile
> >> @@ -435,6 +435,7 @@ LIB_OBJS += pager.o
> >>  LIB_OBJS += parse-options.o
> >>  LIB_OBJS += patch-delta.o
> >>  LIB_OBJS += patch-ids.o
> >> +LIB_OBJS += patch-id-cache.o
> >
> > If all you do is a hashmap from SHA-1 to SHA-1, then I think
> > "patch-id-cache" is a misnomer for that file and structure.
> >
> > Let's not repeat the same naming mistakes as we have for path_list and
> > decoration.
> 
> Is persistent_sha1_map a good name?

I'd prefer cached_sha1_map, but I do not really care.

> >> +void write_patch_id_cache()
> >> +{
> >> +     if (!cache || cache->count == written_count)
> >> +             return;
> >
> > Does that mean that the patch_id_cache is not updated when the number 
> > of commits stays the same after committing, branch -d and gc?
> 
> The patch_id_cache is only updated when it changes.  Since entries are 
> immutable and are never deleted, all changes increase the count.

Right, I only realized that with my previous post to this thread.  
However, in that case I would prefer a flag "int dirty:1" to the cache 
structure, which is set whenever it needs writing.  Certainly not a global 
variable (which the cache already is).
 
> >> +     hashcpy(entry.commit_sha1, commit_sha1);
> >> +     hashcpy(entry.patch_id_sha1, patch_id_sha1);
> >
> > It would be more elegant to copy the SHA-1s _after_ finding where to 
> > write them.
> 
> Alas, that would break the elegance of my loop, since the loop swaps in 
> new entries to keep the table sorted.  I can remove the sorting if you 
> want: I only left it in there to be more similar to your code.

I think that my code tried to do too much here; it was before I benched 
hashmap vs binary-search.

So I think that can just go and simplify the elegance of the loop even 
more ;-)

> > Declar... ah, well, suffice to say that you should read the 
> > CodingGuidelines, and try to fix up all the offending sites in your 
> > code.
> 
> I'd be happy to do that, but I don't see mention of either C++
> comments or declarations after statements in the CodingGuidelines.

>From the Coding Guidelines:

	As for more concrete guidelines, just imitate the existing code
	(this is a good guideline, no matter which project you are
	contributing to).

> 
> >> +             if (!cmp) {
> >> +                     if (hashcmp(entry.patch_id_sha1, cache->entries[i].patch_id_sha1))
> >> +                             die("found mismatched entry in patch-id-cache");
> >
> > I wonder if that potentially expensive operation should not rather be 
> > wrapped in an assert() call (since I recently learnt that Git's source 
> > code has more than one instance of assert()).
> 
> A 20 byte comparison doesn't seem potentially expensive to me.

It's all a question of repetition, isn't it?

In any case, I am not a fan of wasting cycles unnecessarily.  The check 
would indicate a programming error, not a user error, and should therefore 
punish the programmer, not the user.

> >>  static uint32_t take2(const unsigned char *id)
> >> @@ -136,6 +150,8 @@ int free_patch_ids(struct patch_ids *ids)
> >>               next = patches->next;
> >>               free(patches);
> >>       }
> >> +
> >> +     write_patch_id_cache();
> >>       return 0;
> >
> > That's cute.
> 
> Thanks (assuming cute means good).

Yes, it does!

> I'll wait to actually make any of these changes until you and Jeff 
> decide whether I should stick to hashes or use binary search.  It seems 
> sad not to use hashes for a map when we get the best hash keys in the 
> world for free, so I'd prefer not switching.

Me, too.

BTW I think that you did some awesome work here, and I am glad that you 
could use my code as starting point.

Thanks,
Dscho

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 15:49         ` Geoffrey Irving
@ 2008-06-02 15:56           ` Shawn O. Pearce
  2008-06-02 16:18           ` Johannes Schindelin
  1 sibling, 0 replies; 16+ messages in thread
From: Shawn O. Pearce @ 2008-06-02 15:56 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Johannes Schindelin, Jeff King, git@vger.kernel.org

Geoffrey Irving <irving@naml.us> wrote:
> On Mon, Jun 2, 2008 at 8:37 AM, Johannes Schindelin
> > Another issue that just hit me: this cache is append-only, so if it grows
> > too large, you have no other option than to scratch and recreate it.
> > Maybe this needs porcelain support, too?  (git gc?)
> 
> If so, the correct operation is to go through the hash and remove
> entries that refer to commits that no longer exist.  I can add this if
> you want.  Hopefully somewhere along the way git-gc constructs an easy
> to traverse list of extant commits, and this will be straightforward.

git-gc doesn't make such a list.  Down deep with git-pack-objects
(which is called by git-repack, which is called by git-gc) yes,
we do make the list of commits that we can find as reachable, and
thus should stay in the repository.  But that is really low-level
plumbing.  Wedging a SHA1->SHA1 hashmap gc task down into that is
not a good idea.

Instead you'll need to implement something that does `git rev-list
--all -g` (or the internal equivilant) and then remove any entries
in your hashmap that aren't in that result set.  That's not going
to be very cheap.

Given how small entries are (what, 40 bytes?) I'd only want to bother
with that collection process if the estimated potential wasted space
was over 1M (26,000 entries) or some reasonable threshold like that.

E.g. we could just set the GC for this to be once every 26,000
additions, and only during git-gc.  Yea, you might waste about 1M
worth of space before we clean up.  Big deal, I'll bet you have
more than that in loose unreachable objects laying around from
git-rebase -i usage.  ;-)

-- 
Shawn.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 15:49         ` Geoffrey Irving
  2008-06-02 15:56           ` Shawn O. Pearce
@ 2008-06-02 16:18           ` Johannes Schindelin
  2008-06-02 16:26             ` Geoffrey Irving
  1 sibling, 1 reply; 16+ messages in thread
From: Johannes Schindelin @ 2008-06-02 16:18 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Jeff King, git@vger.kernel.org

Hi,

On Mon, 2 Jun 2008, Geoffrey Irving wrote:

> On Mon, Jun 2, 2008 at 8:37 AM, Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
>
> > Another issue that just hit me: this cache is append-only, so if it 
> > grows too large, you have no other option than to scratch and recreate 
> > it. Maybe this needs porcelain support, too?  (git gc?)
> 
> If so, the correct operation is to go through the hash and remove 
> entries that refer to commits that no longer exist.  I can add this if 
> you want.  Hopefully somewhere along the way git-gc constructs an easy 
> to traverse list of extant commits, and this will be straightforward.

I don't know... if you have created a cached patch-id for every commit (by 
mistake, for example) and do not need it anymore, it might make git-cherry 
substantially faster to just scrap the cache.

Hrm,
Dscho

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 16:18           ` Johannes Schindelin
@ 2008-06-02 16:26             ` Geoffrey Irving
  2008-06-02 18:15               ` Johannes Schindelin
  0 siblings, 1 reply; 16+ messages in thread
From: Geoffrey Irving @ 2008-06-02 16:26 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Jeff King, git@vger.kernel.org

On Mon, Jun 2, 2008 at 9:18 AM, Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
> Hi,
>
> On Mon, 2 Jun 2008, Geoffrey Irving wrote:
>
>> On Mon, Jun 2, 2008 at 8:37 AM, Johannes Schindelin
>> <Johannes.Schindelin@gmx.de> wrote:
>>
>> > Another issue that just hit me: this cache is append-only, so if it
>> > grows too large, you have no other option than to scratch and recreate
>> > it. Maybe this needs porcelain support, too?  (git gc?)
>>
>> If so, the correct operation is to go through the hash and remove
>> entries that refer to commits that no longer exist.  I can add this if
>> you want.  Hopefully somewhere along the way git-gc constructs an easy
>> to traverse list of extant commits, and this will be straightforward.
>
> I don't know... if you have created a cached patch-id for every commit (by
> mistake, for example) and do not need it anymore, it might make git-cherry
> substantially faster to just scrap the cache.

Well, ideally hash maps are O(1), but it could be a difference between
a "compare 40 bytes" constant and a "read a 4k block into memory"
constant, so in practice yes.  Scrapping it entirely will also make
the implementation much simpler.

It seems a little sad to wipe all that effort each time, but
regenerating the cache is likely to be less expensive than a git-gc,
so it shouldn't change any amortized complexities.

Geoffrey

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 14:50   ` Geoffrey Irving
  2008-06-02 15:52     ` Johannes Schindelin
@ 2008-06-02 17:23     ` Geoffrey Irving
  2008-06-02 18:22       ` Johannes Schindelin
  1 sibling, 1 reply; 16+ messages in thread
From: Geoffrey Irving @ 2008-06-02 17:23 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git@vger.kernel.org

On Mon, Jun 2, 2008 at 7:50 AM, Geoffrey Irving <irving@naml.us> wrote:
> On Sun, Jun 1, 2008 at 11:13 PM, Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
>> Hi,
>>
>> On Sun, 1 Jun 2008, Geoffrey Irving wrote:
>>
>>> The dominant cost of git-cherry is the computation of patch-ids for each
>>> relevant commit.  Once computed, these pairs are now stored in a hash
>>> table in $GIT_DIR/patch-id-cache to speed up repeated invocations.
>>>
>>> The basic structure of patch-id-cache.c was cannibalized from Johannes
>>> Schindelin's notes-index structure, though most of the code was
>>> rewritten. The hash table is still kept strictly sorted by commit, but
>>> the entire table is now read into memory.
>>
>> I do not think that this "read-the-entire-table-into-memory" paradigm is a
>> wise choice. mmap()ing, I would have understood, but reading a potentially
>> pretty large table into memory?
>
> I'll switch to mmapping.

The git_mmap function in compat/mmap.c dies if NO_MMAP is defined and
the map isn't MAP_PRIVATE.  If I want to write an entry, will the
memory be automatically updated if I write directly to the file
descriptor (I haven't used mmap before)?

Also, do you think it's okay to write directly into the mmap'ed memory
for every insertion, or should I try to be fancier?  Immediate writing
would simplify the code a lot, and I don't think there's a significant
performance issue since computing an entry is expensive.

Thanks,
Geoffrey

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 16:26             ` Geoffrey Irving
@ 2008-06-02 18:15               ` Johannes Schindelin
  2008-06-07 23:50                 ` Geoffrey Irving
  0 siblings, 1 reply; 16+ messages in thread
From: Johannes Schindelin @ 2008-06-02 18:15 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Jeff King, git@vger.kernel.org

Hi,

On Mon, 2 Jun 2008, Geoffrey Irving wrote:

> On Mon, Jun 2, 2008 at 9:18 AM, Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
>
> > On Mon, 2 Jun 2008, Geoffrey Irving wrote:
> >
> >> On Mon, Jun 2, 2008 at 8:37 AM, Johannes Schindelin
> >> <Johannes.Schindelin@gmx.de> wrote:
> >>
> >> > Another issue that just hit me: this cache is append-only, so if it 
> >> > grows too large, you have no other option than to scratch and 
> >> > recreate it. Maybe this needs porcelain support, too?  (git gc?)
> >>
> >> If so, the correct operation is to go through the hash and remove 
> >> entries that refer to commits that no longer exist.  I can add this 
> >> if you want.  Hopefully somewhere along the way git-gc constructs an 
> >> easy to traverse list of extant commits, and this will be 
> >> straightforward.
> >
> > I don't know... if you have created a cached patch-id for every commit 
> > (by mistake, for example) and do not need it anymore, it might make 
> > git-cherry substantially faster to just scrap the cache.
> 
> Well, ideally hash maps are O(1), but it could be a difference between a 
> "compare 40 bytes" constant and a "read a 4k block into memory" 
> constant, so in practice yes.  Scrapping it entirely will also make the 
> implementation much simpler.
> 
> It seems a little sad to wipe all that effort each time, but 
> regenerating the cache is likely to be less expensive than a git-gc, so 
> it shouldn't change any amortized complexities.

Well, how about only scrapping the cache if it is older than, say, 2 
weeks, and is larger than, say, 200kB?  That should help.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 17:23     ` Geoffrey Irving
@ 2008-06-02 18:22       ` Johannes Schindelin
  0 siblings, 0 replies; 16+ messages in thread
From: Johannes Schindelin @ 2008-06-02 18:22 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: git@vger.kernel.org

Hi,

On Mon, 2 Jun 2008, Geoffrey Irving wrote:

> On Mon, Jun 2, 2008 at 7:50 AM, Geoffrey Irving <irving@naml.us> wrote:
>
> I'll switch to mmapping.
> 
> The git_mmap function in compat/mmap.c dies if NO_MMAP is defined and 
> the map isn't MAP_PRIVATE.  If I want to write an entry, will the memory 
> be automatically updated if I write directly to the file descriptor (I 
> haven't used mmap before)?

I think that you should only mmap() with MAP_PRIVATE, because you can 
corrupt the data easily when a write failure is not handled properly.

So I'd suggest mmap() for reading, and writing to a locked file (I think 
you did that already).

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-02 18:15               ` Johannes Schindelin
@ 2008-06-07 23:50                 ` Geoffrey Irving
  2008-06-08 16:10                   ` Johannes Schindelin
  0 siblings, 1 reply; 16+ messages in thread
From: Geoffrey Irving @ 2008-06-07 23:50 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Jeff King, git@vger.kernel.org

On Mon, Jun 2, 2008 at 11:15 AM, Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
> Hi,
>
> On Mon, 2 Jun 2008, Geoffrey Irving wrote:
>
>> On Mon, Jun 2, 2008 at 9:18 AM, Johannes Schindelin
>> <Johannes.Schindelin@gmx.de> wrote:
>>
>> > On Mon, 2 Jun 2008, Geoffrey Irving wrote:
>> >
>> >> On Mon, Jun 2, 2008 at 8:37 AM, Johannes Schindelin
>> >> <Johannes.Schindelin@gmx.de> wrote:
>> >>
>> >> > Another issue that just hit me: this cache is append-only, so if it
>> >> > grows too large, you have no other option than to scratch and
>> >> > recreate it. Maybe this needs porcelain support, too?  (git gc?)
>> >>
>> >> If so, the correct operation is to go through the hash and remove
>> >> entries that refer to commits that no longer exist.  I can add this
>> >> if you want.  Hopefully somewhere along the way git-gc constructs an
>> >> easy to traverse list of extant commits, and this will be
>> >> straightforward.
>> >
>> > I don't know... if you have created a cached patch-id for every commit
>> > (by mistake, for example) and do not need it anymore, it might make
>> > git-cherry substantially faster to just scrap the cache.
>>
>> Well, ideally hash maps are O(1), but it could be a difference between a
>> "compare 40 bytes" constant and a "read a 4k block into memory"
>> constant, so in practice yes.  Scrapping it entirely will also make the
>> implementation much simpler.
>>
>> It seems a little sad to wipe all that effort each time, but
>> regenerating the cache is likely to be less expensive than a git-gc, so
>> it shouldn't change any amortized complexities.
>
> Well, how about only scrapping the cache if it is older than, say, 2
> weeks, and is larger than, say, 200kB?  That should help.

That heuristic is insufficient, since it doesn't do anything in the
normal case where a new entry appears every few days (e.g., when
syncing between two branches with cherry-pick).

I don't know what the best alternative is, so I left garbage
collection out of the patch I just submitted.  We can add it once we
decide what to do.  I'm not sure it's a serious problem: if you
"accidentally" added entries for all commits in the git tree, the file
is still under 1M.

Geoffrey

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry
  2008-06-07 23:50                 ` Geoffrey Irving
@ 2008-06-08 16:10                   ` Johannes Schindelin
  0 siblings, 0 replies; 16+ messages in thread
From: Johannes Schindelin @ 2008-06-08 16:10 UTC (permalink / raw)
  To: Geoffrey Irving; +Cc: Jeff King, git@vger.kernel.org

Hi,

On Sat, 7 Jun 2008, Geoffrey Irving wrote:

> On Mon, Jun 2, 2008 at 11:15 AM, Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
>
> > On Mon, 2 Jun 2008, Geoffrey Irving wrote:
> >
> >> On Mon, Jun 2, 2008 at 9:18 AM, Johannes Schindelin
> >> <Johannes.Schindelin@gmx.de> wrote:
> >>
> >> > On Mon, 2 Jun 2008, Geoffrey Irving wrote:
> >> >
> >> >> On Mon, Jun 2, 2008 at 8:37 AM, Johannes Schindelin
> >> >> <Johannes.Schindelin@gmx.de> wrote:
> >> >>
> >> >> > Another issue that just hit me: this cache is append-only, so if 
> >> >> > it grows too large, you have no other option than to scratch and 
> >> >> > recreate it. Maybe this needs porcelain support, too?  (git gc?)
> >> >>
> >> >> If so, the correct operation is to go through the hash and remove 
> >> >> entries that refer to commits that no longer exist.  I can add 
> >> >> this if you want.  Hopefully somewhere along the way git-gc 
> >> >> constructs an easy to traverse list of extant commits, and this 
> >> >> will be straightforward.
> >> >
> >> > I don't know... if you have created a cached patch-id for every 
> >> > commit (by mistake, for example) and do not need it anymore, it 
> >> > might make git-cherry substantially faster to just scrap the cache.
> >>
> >> Well, ideally hash maps are O(1), but it could be a difference 
> >> between a "compare 40 bytes" constant and a "read a 4k block into 
> >> memory" constant, so in practice yes.  Scrapping it entirely will 
> >> also make the implementation much simpler.
> >>
> >> It seems a little sad to wipe all that effort each time, but 
> >> regenerating the cache is likely to be less expensive than a git-gc, 
> >> so it shouldn't change any amortized complexities.
> >
> > Well, how about only scrapping the cache if it is older than, say, 2 
> > weeks, and is larger than, say, 200kB?  That should help.
> 
> That heuristic is insufficient, since it doesn't do anything in the 
> normal case where a new entry appears every few days (e.g., when syncing 
> between two branches with cherry-pick).

Right, it is insufficient in such a case, but then, it does not really 
matter, methinks.  The cache is small enough anyway, and I think that many 
people will not really use it as much as you do.

However, I realized one very real issue with your patch: you do not 
provide a way to _disable_ the caching.  I think at least a config 
variable is needed, and while at it, a fallback when you cannot write to 
the repository.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2008-06-08 16:12 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-06-02  3:54 [PATCH] Adding a cache of commit to patch-id pairs to speed up git-cherry Geoffrey Irving
2008-06-02  6:13 ` Johannes Schindelin
2008-06-02  6:42   ` Jeff King
2008-06-02 14:35     ` Geoffrey Irving
2008-06-02 15:37       ` Johannes Schindelin
2008-06-02 15:49         ` Geoffrey Irving
2008-06-02 15:56           ` Shawn O. Pearce
2008-06-02 16:18           ` Johannes Schindelin
2008-06-02 16:26             ` Geoffrey Irving
2008-06-02 18:15               ` Johannes Schindelin
2008-06-07 23:50                 ` Geoffrey Irving
2008-06-08 16:10                   ` Johannes Schindelin
2008-06-02 14:50   ` Geoffrey Irving
2008-06-02 15:52     ` Johannes Schindelin
2008-06-02 17:23     ` Geoffrey Irving
2008-06-02 18:22       ` Johannes Schindelin

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).