[[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content

git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed

* [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content
@ 2016-07-29 16:19 Kevin Willford
  2016-07-29 16:19 ` [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation Kevin Willford
                   ` (4 more replies)
  0 siblings, 5 replies; 25+ messages in thread
From: Kevin Willford @ 2016-07-29 16:19 UTC (permalink / raw)
  To: git; +Cc: Kevin Willford

This patch series is to remove the hand rolled hashmap in the patch_ids
and use the hashmap.h implementation.  It also introduces the idea of having
a header only patch id so that the contents of the files do not have to be
loaded in order to determine if two commits are different.


Kevin Willford (4):
  patch-ids: stop using a hand-rolled hashmap implementation
  patch-ids: replace the seen indicator with a commit pointer
  patch-ids: add flag to create the diff patch id using header only data
  rebase: avoid computing unnecessary patch IDs

 builtin/log.c                        |  2 +-
 diff.c                               | 16 +++---
 diff.h                               |  2 +-
 patch-ids.c                          | 99 +++++++++++++++---------------------
 patch-ids.h                          | 11 ++--
 revision.c                           | 18 ++-----
 t/perf/p3400-rebase.sh               | 36 +++++++++++++
 t/t6007-rev-list-cherry-pick-file.sh | 17 +++++++
 8 files changed, 114 insertions(+), 87 deletions(-)
 create mode 100644 t/perf/p3400-rebase.sh

-- 
2.9.2.windows.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-07-29 16:19 [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Kevin Willford
@ 2016-07-29 16:19 ` Kevin Willford
  2016-07-29 20:47   ` Junio C Hamano
  2016-07-29 21:29   ` Junio C Hamano
  2016-07-29 16:19 ` [[PATCH v2] 2/4] patch-ids: replace the seen indicator with a commit pointer Kevin Willford
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 25+ messages in thread
From: Kevin Willford @ 2016-07-29 16:19 UTC (permalink / raw)
  To: git; +Cc: Kevin Willford, Kevin Willford

From: Kevin Willford <kewillf@microsoft.com>

This change will use the hashmap from the hashmap.h to keep track of the
patch_ids that have been encountered instead of using an internal
implementation.  This simplifies the implementation of the patch ids.

Signed-off-by: Kevin Willford <kcwillford@gmail.com>
---
 patch-ids.c | 86 +++++++++++++++++++++----------------------------------------
 patch-ids.h |  7 +++--
 2 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/patch-ids.c b/patch-ids.c
index a4d0016..db31fa6 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -16,90 +16,62 @@ int commit_patch_id(struct commit *commit, struct diff_options *options,
 	return diff_flush_patch_id(options, sha1);
 }
 
-static const unsigned char *patch_id_access(size_t index, void *table)
+static int patch_id_cmp(struct patch_id *a,
+			struct patch_id *b,
+			void *keydata)
 {
-	struct patch_id **id_table = table;
-	return id_table[index]->patch_id;
+	return hashcmp(a->patch_id, b->patch_id);
 }
 
-static int patch_pos(struct patch_id **table, int nr, const unsigned char *id)
-{
-	return sha1_pos(id, table, nr, patch_id_access);
-}
-
-#define BUCKET_SIZE 190 /* 190 * 21 = 3990, with slop close enough to 4K */
-struct patch_id_bucket {
-	struct patch_id_bucket *next;
-	int nr;
-	struct patch_id bucket[BUCKET_SIZE];
-};
-
 int init_patch_ids(struct patch_ids *ids)
 {
 	memset(ids, 0, sizeof(*ids));
 	diff_setup(&ids->diffopts);
 	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
 	diff_setup_done(&ids->diffopts);
+	hashmap_init(&ids->patches, (hashmap_cmp_fn)patch_id_cmp, 256);
 	return 0;
 }
 
 int free_patch_ids(struct patch_ids *ids)
 {
-	struct patch_id_bucket *next, *patches;
-
-	free(ids->table);
-	for (patches = ids->patches; patches; patches = next) {
-		next = patches->next;
-		free(patches);
-	}
+	hashmap_free(&ids->patches, 1);
 	return 0;
 }
 
-static struct patch_id *add_commit(struct commit *commit,
-				   struct patch_ids *ids,
-				   int no_add)
+static int init_patch_id_entry(struct patch_id *patch,
+			       struct commit *commit,
+			       struct patch_ids *ids)
 {
-	struct patch_id_bucket *bucket;
-	struct patch_id *ent;
-	unsigned char sha1[20];
-	int pos;
-
-	if (commit_patch_id(commit, &ids->diffopts, sha1))
-		return NULL;
-	pos = patch_pos(ids->table, ids->nr, sha1);
-	if (0 <= pos)
-		return ids->table[pos];
-	if (no_add)
-		return NULL;
-
-	pos = -1 - pos;
+	if (commit_patch_id(commit, &ids->diffopts, patch->patch_id))
+		return -1;
 
-	bucket = ids->patches;
-	if (!bucket || (BUCKET_SIZE <= bucket->nr)) {
-		bucket = xcalloc(1, sizeof(*bucket));
-		bucket->next = ids->patches;
-		ids->patches = bucket;
-	}
-	ent = &bucket->bucket[bucket->nr++];
-	hashcpy(ent->patch_id, sha1);
-
-	ALLOC_GROW(ids->table, ids->nr + 1, ids->alloc);
-	if (pos < ids->nr)
-		memmove(ids->table + pos + 1, ids->table + pos,
-			sizeof(ent) * (ids->nr - pos));
-	ids->nr++;
-	ids->table[pos] = ent;
-	return ids->table[pos];
+	hashmap_entry_init(patch, sha1hash(patch->patch_id));
+	return 0;
 }
 
 struct patch_id *has_commit_patch_id(struct commit *commit,
 				     struct patch_ids *ids)
 {
-	return add_commit(commit, ids, 1);
+	struct patch_id patch;
+
+	memset(&patch, 0, sizeof(patch));
+	if (init_patch_id_entry(&patch, commit, ids))
+		return NULL;
+
+	return hashmap_get(&ids->patches, &patch, NULL);
 }
 
 struct patch_id *add_commit_patch_id(struct commit *commit,
 				     struct patch_ids *ids)
 {
-	return add_commit(commit, ids, 0);
+	struct patch_id *key = xcalloc(1, sizeof(*key));
+
+	if (init_patch_id_entry(key, commit, ids)) {
+		free(key);
+		return NULL;
+	}
+
+	hashmap_add(&ids->patches, key);
+	return key;
 }
diff --git a/patch-ids.h b/patch-ids.h
index eeb56b3..9569ee0 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -2,15 +2,14 @@
 #define PATCH_IDS_H
 
 struct patch_id {
-	unsigned char patch_id[20];
+	struct hashmap_entry ent;
+	unsigned char patch_id[GIT_SHA1_RAWSZ];
 	char seen;
 };
 
 struct patch_ids {
+	struct hashmap patches;
 	struct diff_options diffopts;
-	int nr, alloc;
-	struct patch_id **table;
-	struct patch_id_bucket *patches;
 };
 
 int commit_patch_id(struct commit *commit, struct diff_options *options,
-- 
2.9.2.gvfs.2.42.gb7633a3


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [[PATCH v2] 2/4] patch-ids: replace the seen indicator with a commit pointer
  2016-07-29 16:19 [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Kevin Willford
  2016-07-29 16:19 ` [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation Kevin Willford
@ 2016-07-29 16:19 ` Kevin Willford
  2016-07-29 21:03   ` Junio C Hamano
  2016-07-29 16:19 ` [[PATCH v2] 3/4] patch-ids: add flag to create the diff patch id using header only data Kevin Willford
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 25+ messages in thread
From: Kevin Willford @ 2016-07-29 16:19 UTC (permalink / raw)
  To: git; +Cc: Kevin Willford, Kevin Willford

From: Kevin Willford <kewillf@microsoft.com>

The cherry_pick_list was looping through the original side checking the
seen indicator and setting the cherry_flag on the commit.  If we save
off the commit in the patch_id we can set the cherry_flag on the correct
commit when running through the other side when a patch_id match is found.

Signed-off-by: Kevin Willford <kcwillford@gmail.com>
---
 patch-ids.c |  1 +
 patch-ids.h |  2 +-
 revision.c  | 18 +++---------------
 3 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/patch-ids.c b/patch-ids.c
index db31fa6..bafaae2 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -43,6 +43,7 @@ static int init_patch_id_entry(struct patch_id *patch,
 			       struct commit *commit,
 			       struct patch_ids *ids)
 {
+	patch->commit = commit;
 	if (commit_patch_id(commit, &ids->diffopts, patch->patch_id))
 		return -1;
 
diff --git a/patch-ids.h b/patch-ids.h
index 9569ee0..dea1ecd 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -4,7 +4,7 @@
 struct patch_id {
 	struct hashmap_entry ent;
 	unsigned char patch_id[GIT_SHA1_RAWSZ];
-	char seen;
+	struct commit *commit;
 };
 
 struct patch_ids {
diff --git a/revision.c b/revision.c
index edba5b7..19cc067 100644
--- a/revision.c
+++ b/revision.c
@@ -846,7 +846,7 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
 		 */
 		if (left_first != !!(flags & SYMMETRIC_LEFT))
 			continue;
-		commit->util = add_commit_patch_id(commit, &ids);
+		add_commit_patch_id(commit, &ids);
 	}
 
 	/* either cherry_mark or cherry_pick are true */
@@ -873,21 +873,9 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
 		id = has_commit_patch_id(commit, &ids);
 		if (!id)
 			continue;
-		id->seen = 1;
-		commit->object.flags |= cherry_flag;
-	}
 
-	/* Now check the original side for seen ones */
-	for (p = list; p; p = p->next) {
-		struct commit *commit = p->item;
-		struct patch_id *ent;
-
-		ent = commit->util;
-		if (!ent)
-			continue;
-		if (ent->seen)
-			commit->object.flags |= cherry_flag;
-		commit->util = NULL;
+		commit->object.flags |= cherry_flag;
+		id->commit->object.flags |= cherry_flag;
 	}
 
 	free_patch_ids(&ids);
-- 
2.9.2.gvfs.2.42.gb7633a3


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [[PATCH v2] 3/4] patch-ids: add flag to create the diff patch id using header only data
  2016-07-29 16:19 [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Kevin Willford
  2016-07-29 16:19 ` [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation Kevin Willford
  2016-07-29 16:19 ` [[PATCH v2] 2/4] patch-ids: replace the seen indicator with a commit pointer Kevin Willford
@ 2016-07-29 16:19 ` Kevin Willford
  2016-07-29 16:19 ` [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs Kevin Willford
  2016-07-29 20:22 ` [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Junio C Hamano
  4 siblings, 0 replies; 25+ messages in thread
From: Kevin Willford @ 2016-07-29 16:19 UTC (permalink / raw)
  To: git; +Cc: Kevin Willford, Kevin Willford

From: Kevin Willford <kewillf@microsoft.com>

This will allow a diff patch id to be created using only the header data
so that the contents of the file will not have to be loaded.

Signed-off-by: Kevin Willford <kcwillford@gmail.com>
---
 diff.c      | 16 ++++++++++------
 diff.h      |  2 +-
 patch-ids.c |  2 +-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/diff.c b/diff.c
index 7d03419..28a4190 100644
--- a/diff.c
+++ b/diff.c
@@ -4455,7 +4455,7 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
 }
 
 /* returns 0 upon success, and writes result into sha1 */
-static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
+static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1, int diff_header_only)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
@@ -4490,9 +4490,6 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 
 		diff_fill_sha1_info(p->one);
 		diff_fill_sha1_info(p->two);
-		if (fill_mmfile(&mf1, p->one) < 0 ||
-				fill_mmfile(&mf2, p->two) < 0)
-			return error("unable to read files to diff");
 
 		len1 = remove_space(p->one->path, strlen(p->one->path));
 		len2 = remove_space(p->two->path, strlen(p->two->path));
@@ -4527,6 +4524,13 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 					len2, p->two->path);
 		git_SHA1_Update(&ctx, buffer, len1);
 
+		if (diff_header_only)
+			continue;
+
+		if (fill_mmfile(&mf1, p->one) < 0 ||
+			fill_mmfile(&mf2, p->two) < 0)
+			return error("unable to read files to diff");
+
 		if (diff_filespec_is_binary(p->one) ||
 		    diff_filespec_is_binary(p->two)) {
 			git_SHA1_Update(&ctx, oid_to_hex(&p->one->oid),
@@ -4549,11 +4553,11 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 	return 0;
 }
 
-int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1)
+int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1, int diff_header_only)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
-	int result = diff_get_patch_id(options, sha1);
+	int result = diff_get_patch_id(options, sha1, diff_header_only);
 
 	for (i = 0; i < q->nr; i++)
 		diff_free_filepair(q->queue[i]);
diff --git a/diff.h b/diff.h
index 125447b..7883729 100644
--- a/diff.h
+++ b/diff.h
@@ -342,7 +342,7 @@ extern int run_diff_files(struct rev_info *revs, unsigned int option);
 extern int run_diff_index(struct rev_info *revs, int cached);
 
 extern int do_diff_cache(const unsigned char *, struct diff_options *);
-extern int diff_flush_patch_id(struct diff_options *, unsigned char *);
+extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int);
 
 extern int diff_result_code(struct diff_options *, int);
 
diff --git a/patch-ids.c b/patch-ids.c
index bafaae2..69a14a3 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -13,7 +13,7 @@ int commit_patch_id(struct commit *commit, struct diff_options *options,
 	else
 		diff_root_tree_sha1(commit->object.oid.hash, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	return diff_flush_patch_id(options, sha1, 0);
 }
 
 static int patch_id_cmp(struct patch_id *a,
-- 
2.9.2.gvfs.2.42.gb7633a3


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-07-29 16:19 [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Kevin Willford
                   ` (2 preceding siblings ...)
  2016-07-29 16:19 ` [[PATCH v2] 3/4] patch-ids: add flag to create the diff patch id using header only data Kevin Willford
@ 2016-07-29 16:19 ` Kevin Willford
  2016-07-29 21:46   ` Junio C Hamano
  2016-07-29 20:22 ` [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Junio C Hamano
  4 siblings, 1 reply; 25+ messages in thread
From: Kevin Willford @ 2016-07-29 16:19 UTC (permalink / raw)
  To: git; +Cc: Kevin Willford, Kevin Willford

From: Kevin Willford <kewillf@microsoft.com>

The `rebase` family of Git commands avoid applying patches that were
already integrated upstream. They do that by using the revision walking
option that computes the patch IDs of the two sides of the rebase
(local-only patches vs upstream-only ones) and skipping those local
patches whose patch ID matches one of the upstream ones.

In many cases, this causes unnecessary churn, as already the set of
paths touched by a given commit would suffice to determine that an
upstream patch has no local equivalent.

This hurts performance in particular when there are a lot of upstream
patches, and/or large ones.

Therefore, let's introduce the concept of a "diff-header-only" patch ID,
compare those first, and only evaluate the "full" patch ID lazily.

Please note that in contrast to the "full" patch IDs, those
"diff-header-only" patch IDs are prone to collide with one another, as
adjacent commits frequently touch the very same files. Hence we now
have to be careful to allow multiple hash entries with the same hash.
We accomplish that by using the hashmap_add() function that does not even
test for hash collisions.  This also allows us to evaluate the full patch ID
lazily, i.e. only when we found commits with matching diff-header-only
patch IDs.

We add a performance test that demonstrates ~1-6% improvement.  In
practice this will depend on various factors such as how many upstream
changes and how big those changes are along with whether file system
caches are cold or warm.  As Git's test suite has no way of catching
performance regressions, we also add a regression test that verifies
that the full patch ID computation is skipped when the diff-header-only
computation suffices.

Signed-off-by: Kevin Willford <kcwillford@gmail.com>
---
 builtin/log.c                        |  2 +-
 patch-ids.c                          | 22 ++++++++++++++++------
 patch-ids.h                          |  2 +-
 t/perf/p3400-rebase.sh               | 36 ++++++++++++++++++++++++++++++++++++
 t/t6007-rev-list-cherry-pick-file.sh | 17 +++++++++++++++++
 5 files changed, 71 insertions(+), 8 deletions(-)
 create mode 100644 t/perf/p3400-rebase.sh

diff --git a/builtin/log.c b/builtin/log.c
index fd1652f..b076993 100644
--- a/builtin/log.c
+++ b/builtin/log.c
@@ -1331,7 +1331,7 @@ static void prepare_bases(struct base_tree_info *bases,
 		struct object_id *patch_id;
 		if (commit->util)
 			continue;
-		if (commit_patch_id(commit, &diffopt, sha1))
+		if (commit_patch_id(commit, &diffopt, sha1, 0))
 			die(_("cannot get patch id"));
 		ALLOC_GROW(bases->patch_id, bases->nr_patch_id + 1, bases->alloc_patch_id);
 		patch_id = bases->patch_id + bases->nr_patch_id;
diff --git a/patch-ids.c b/patch-ids.c
index 69a14a3..0a4828a 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -5,7 +5,7 @@
 #include "patch-ids.h"
 
 int commit_patch_id(struct commit *commit, struct diff_options *options,
-		    unsigned char *sha1)
+		    unsigned char *sha1, int diff_header_only)
 {
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.oid.hash,
@@ -13,13 +13,21 @@ int commit_patch_id(struct commit *commit, struct diff_options *options,
 	else
 		diff_root_tree_sha1(commit->object.oid.hash, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1, 0);
+	return diff_flush_patch_id(options, sha1, diff_header_only);
 }
 
 static int patch_id_cmp(struct patch_id *a,
 			struct patch_id *b,
-			void *keydata)
+			struct diff_options *opt)
 {
+	if (is_null_sha1(a->patch_id) &&
+	    commit_patch_id(a->commit, opt, a->patch_id, 0))
+		return error("Could not get patch ID for %s",
+			oid_to_hex(&a->commit->object.oid));
+	if (is_null_sha1(b->patch_id) &&
+	    commit_patch_id(b->commit, opt, b->patch_id, 0))
+		return error("Could not get patch ID for %s",
+			oid_to_hex(&b->commit->object.oid));
 	return hashcmp(a->patch_id, b->patch_id);
 }
 
@@ -43,11 +51,13 @@ static int init_patch_id_entry(struct patch_id *patch,
 			       struct commit *commit,
 			       struct patch_ids *ids)
 {
+	unsigned char header_only_patch_id[GIT_SHA1_RAWSZ];
+
 	patch->commit = commit;
-	if (commit_patch_id(commit, &ids->diffopts, patch->patch_id))
+	if (commit_patch_id(commit, &ids->diffopts, header_only_patch_id, 1))
 		return -1;
 
-	hashmap_entry_init(patch, sha1hash(patch->patch_id));
+	hashmap_entry_init(patch, sha1hash(header_only_patch_id));
 	return 0;
 }
 
@@ -60,7 +70,7 @@ struct patch_id *has_commit_patch_id(struct commit *commit,
 	if (init_patch_id_entry(&patch, commit, ids))
 		return NULL;
 
-	return hashmap_get(&ids->patches, &patch, NULL);
+	return hashmap_get(&ids->patches, &patch, &ids->diffopts);
 }
 
 struct patch_id *add_commit_patch_id(struct commit *commit,
diff --git a/patch-ids.h b/patch-ids.h
index dea1ecd..0f34ea1 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -13,7 +13,7 @@ struct patch_ids {
 };
 
 int commit_patch_id(struct commit *commit, struct diff_options *options,
-		    unsigned char *sha1);
+		    unsigned char *sha1, int);
 int init_patch_ids(struct patch_ids *);
 int free_patch_ids(struct patch_ids *);
 struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);
diff --git a/t/perf/p3400-rebase.sh b/t/perf/p3400-rebase.sh
new file mode 100644
index 0000000..b3e7d52
--- /dev/null
+++ b/t/perf/p3400-rebase.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+test_description='Tests rebase performance'
+. ./perf-lib.sh
+
+test_perf_default_repo
+
+test_expect_success 'setup' '
+	git checkout -f -b base &&
+	git checkout -b to-rebase &&
+	git checkout -b upstream &&
+	for i in $(seq 100)
+	do
+		# simulate huge diffs
+		echo change$i >unrelated-file$i &&
+		seq 1000 >>unrelated-file$i &&
+		git add unrelated-file$i &&
+		test_tick &&
+		git commit -m commit$i unrelated-file$i &&
+		echo change$i >unrelated-file$i &&
+		seq 1000 | tac >>unrelated-file$i &&
+		git add unrelated-file$i &&
+		test_tick &&
+		git commit -m commit$i-reverse unrelated-file$i ||
+		break
+	done &&
+	git checkout to-rebase &&
+	test_commit our-patch interesting-file
+'
+
+test_perf 'rebase on top of a lot of unrelated changes' '
+	git rebase --onto upstream HEAD^ &&
+	git rebase --onto base HEAD^
+'
+
+test_done
diff --git a/t/t6007-rev-list-cherry-pick-file.sh b/t/t6007-rev-list-cherry-pick-file.sh
index 28d4f6b..fff3322 100755
--- a/t/t6007-rev-list-cherry-pick-file.sh
+++ b/t/t6007-rev-list-cherry-pick-file.sh
@@ -206,5 +206,22 @@ test_expect_success '--count --left-right' '
 	git rev-list --count --left-right C...D > actual &&
 	test_cmp expect actual
 '
+remove_loose_object () {
+	sha1="$(git rev-parse "$1")" &&
+	remainder=${sha1#??} &&
+	firsttwo=${sha1%$remainder} &&
+	rm .git/objects/$firsttwo/$remainder
+}
+test_expect_success '--cherry-pick avoids looking at full diffs' '
+	git checkout -b shy-diff &&
+	test_commit dont-look-at-me &&
+	echo Hello >dont-look-at-me.t &&
+	test_tick &&
+	git commit -m tip dont-look-at-me.t &&
+	git checkout -b mainline HEAD^ &&
+	test_commit to-cherry-pick &&
+	remove_loose_object shy-diff^:dont-look-at-me.t &&
+	git rev-list --cherry-pick ...shy-diff
+'
 
 test_done
-- 
2.9.2.gvfs.2.42.gb7633a3


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content
  2016-07-29 16:19 [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Kevin Willford
                   ` (3 preceding siblings ...)
  2016-07-29 16:19 ` [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs Kevin Willford
@ 2016-07-29 20:22 ` Junio C Hamano
  2016-08-01  9:01   ` Johannes Schindelin
  4 siblings, 1 reply; 25+ messages in thread
From: Junio C Hamano @ 2016-07-29 20:22 UTC (permalink / raw)
  To: Kevin Willford; +Cc: git

Kevin Willford <kcwillford@gmail.com> writes:

> This patch series is to remove the hand rolled hashmap in the patch_ids
> and use the hashmap.h implementation.  It also introduces the idea of having
> a header only patch id so that the contents of the files do not have to be
> loaded in order to determine if two commits are different.

> Subject: Re: [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content

Did you do "format-patch --subject-prefix='[PATCH v2]'" or something
like that?  When applied, that would result in a commit title like
this:

	1/4] patch-ids: stop using a ...

because we stop at the first ']', and we do not bother to count up
to "matching bracket".

    git format-patch --subject-prefix='PATCH v2'

or even better

    git format-patch -v2

would have been more appropriate.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-07-29 16:19 ` [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation Kevin Willford
@ 2016-07-29 20:47   ` Junio C Hamano
  2016-08-01  8:54     ` Johannes Schindelin
  2016-07-29 21:29   ` Junio C Hamano
  1 sibling, 1 reply; 25+ messages in thread
From: Junio C Hamano @ 2016-07-29 20:47 UTC (permalink / raw)
  To: Kevin Willford; +Cc: git, Kevin Willford

Kevin Willford <kcwillford@gmail.com> writes:

> From: Kevin Willford <kewillf@microsoft.com>
>
> This change will use the hashmap from the hashmap.h to keep track of the
> patch_ids that have been encountered instead of using an internal
> implementation.  This simplifies the implementation of the patch ids.
>
> Signed-off-by: Kevin Willford <kcwillford@gmail.com>
> ---
>  patch-ids.c | 86 +++++++++++++++++++++----------------------------------------
>  patch-ids.h |  7 +++--
>  2 files changed, 32 insertions(+), 61 deletions(-)

The patch text itself is almost unreadble because of a lot of
verbose code it had to carry before this change, and the removal of
that unreadable code of course is the point of this very welcome
clean-up ;-).  The resulting code is very readable.

>  struct patch_id *has_commit_patch_id(struct commit *commit,
>  				     struct patch_ids *ids)
>  {
> -	return add_commit(commit, ids, 1);
> +	struct patch_id patch;
> +
> +	memset(&patch, 0, sizeof(patch));
> +	if (init_patch_id_entry(&patch, commit, ids))
> +		return NULL;
> +	return hashmap_get(&ids->patches, &patch, NULL);
>  }
>  
>  struct patch_id *add_commit_patch_id(struct commit *commit,
>  				     struct patch_ids *ids)
>  {
> -	return add_commit(commit, ids, 0);
> +	struct patch_id *key = xcalloc(1, sizeof(*key));
> +
> +	if (init_patch_id_entry(key, commit, ids)) {
> +		free(key);
> +		return NULL;
> +	}

This is a tangent, but this made me wonder if it is safe to simply
free(3) the result of calling hashmap_entry_init() which is called
in init_patch_id_entry().  It would obviously become a resource
leak, if a hashmap_entry (which the api documentation says is "an
opaque structure") holds any allocated resource.

The fact that hashmap_entry_init() is there but there is no
corresponding hashmap_entry_clear() hints that there is nothing to
be worried about and I can see from the implementation of
hashmap_entry_init() that no extra resource is held inside, but an
API user should not have to guess.  We may want to do one of the two
things:

 * document that an embedded hashmap_entry does not hold any
   resource that need to be released and it is safe to free the user
   structure that embeds one; or

 * implement hashmap_entry_clear() that currently is a no-op.

If we anticipate that the hashmap implementation may gain more
fields in this "opaque" structure, the latter might be a more
future-proof approach, as all the callers of hashmap_entry_init()
would already be calling hashmap_entry_clear() to clean it up when
such a change to the hashmap implementation happens.  On the other
hand, a caller that does not call hashmap_entry_clear() would not be
noticed by anybody as leaking resources until such a change happens,
so the future-proofing may not have much practical value (iow, the
existing callers of _init() would need to be audited anyway to make
sure they also call _clear()).

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 2/4] patch-ids: replace the seen indicator with a commit pointer
  2016-07-29 16:19 ` [[PATCH v2] 2/4] patch-ids: replace the seen indicator with a commit pointer Kevin Willford
@ 2016-07-29 21:03   ` Junio C Hamano
  0 siblings, 0 replies; 25+ messages in thread
From: Junio C Hamano @ 2016-07-29 21:03 UTC (permalink / raw)
  To: Kevin Willford; +Cc: git, Kevin Willford

Kevin Willford <kcwillford@gmail.com> writes:

> From: Kevin Willford <kewillf@microsoft.com>
>
> The cherry_pick_list was looping through the original side checking the
> seen indicator and setting the cherry_flag on the commit.  If we save
> off the commit in the patch_id we can set the cherry_flag on the correct
> commit when running through the other side when a patch_id match is found.
>
> Signed-off-by: Kevin Willford <kcwillford@gmail.com>
> ---
>  patch-ids.c |  1 +
>  patch-ids.h |  2 +-
>  revision.c  | 18 +++---------------
>  3 files changed, 5 insertions(+), 16 deletions(-)

We lose the final loop to fix up the shorter side, because the
second loop now marks both sides.

And as a side effect, we do not use commit->util which is a scarce
shared resource (aka historical API wart).

Makes sense.  Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-07-29 16:19 ` [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation Kevin Willford
  2016-07-29 20:47   ` Junio C Hamano
@ 2016-07-29 21:29   ` Junio C Hamano
  1 sibling, 0 replies; 25+ messages in thread
From: Junio C Hamano @ 2016-07-29 21:29 UTC (permalink / raw)
  To: Kevin Willford; +Cc: git, Kevin Willford

Kevin Willford <kcwillford@gmail.com> writes:

> +static int patch_id_cmp(struct patch_id *a,
> +			struct patch_id *b,
> +			void *keydata)
>  {
> +	return hashcmp(a->patch_id, b->patch_id);
>  }
>  
>  int init_patch_ids(struct patch_ids *ids)
>  {
>  	memset(ids, 0, sizeof(*ids));
>  	diff_setup(&ids->diffopts);
>  	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
>  	diff_setup_done(&ids->diffopts);
> +	hashmap_init(&ids->patches, (hashmap_cmp_fn)patch_id_cmp, 256);
>  	return 0;
>  }

This is a tangent, and I do not suggest to change patch 1/4 to flip
the style, but I am not sure if this is a good style, or casting it
the other way around is better from the type-checking point of view,
i.e.

    static int cmp_fn(const void *a_, const void *b_, const void *keydata)
    {
	struct patch_id *a = a_;
        struct patch_id *b = b_;
	return hashcmp(a->patch_id, b->patch_id);
    }

    ...
    	hashmap_init(..., cmp_fn, ...);
    ...

I see many existing calls to hashmap_init() follow this pattern, so
as I said, patch 1/4 is fine as-is.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-07-29 16:19 ` [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs Kevin Willford
@ 2016-07-29 21:46   ` Junio C Hamano
  2016-08-01  8:58     ` Johannes Schindelin
  0 siblings, 1 reply; 25+ messages in thread
From: Junio C Hamano @ 2016-07-29 21:46 UTC (permalink / raw)
  To: Kevin Willford; +Cc: git, Kevin Willford

Kevin Willford <kcwillford@gmail.com> writes:

>  static int patch_id_cmp(struct patch_id *a,
>  			struct patch_id *b,
> -			void *keydata)
> +			struct diff_options *opt)
>  {
> +	if (is_null_sha1(a->patch_id) &&
> +	    commit_patch_id(a->commit, opt, a->patch_id, 0))
> +		return error("Could not get patch ID for %s",
> +			oid_to_hex(&a->commit->object.oid));
> +	if (is_null_sha1(b->patch_id) &&
> +	    commit_patch_id(b->commit, opt, b->patch_id, 0))
> +		return error("Could not get patch ID for %s",
> +			oid_to_hex(&b->commit->object.oid));
>  	return hashcmp(a->patch_id, b->patch_id);
>  }

These error returns initially looks slightly iffy in that in general
the caller of any_cmp_fn() wants to know how a/b compares, but by
returning error(), it always says "a is smaller than b".  This
however may be OK because the callers in hashmap_get* implementation
only wants to know "are they equal?", and we are saying "no they
cannot possibly be equal" here.  The original that ran a full
commit_patch_id() in now-removed add_commit() helper function didn't
even diagnose this error and silently omitted the commit from the
candidate list, so this may be even seen as an improvement.

The idea of using the two level hash, computing the more expensive
one only when the hashmap hashes of the result of the cheaper hash
function collide, is excellent.  Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-07-29 20:47   ` Junio C Hamano
@ 2016-08-01  8:54     ` Johannes Schindelin
  2016-08-01 20:04       ` Junio C Hamano
  0 siblings, 1 reply; 25+ messages in thread
From: Johannes Schindelin @ 2016-08-01  8:54 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Kevin Willford, git, Kevin Willford

Hi Junio,

first of all: Kevin & I are colleagues and I helped prepare this patch
series. I had the idea to have a two-level patch ID to help e.g. when an
alternate object store is hosted on a (slow) network drive.

On Fri, 29 Jul 2016, Junio C Hamano wrote:

> Kevin Willford <kcwillford@gmail.com> writes:
> 
> >  struct patch_id *add_commit_patch_id(struct commit *commit,
> >  				     struct patch_ids *ids)
> >  {
> > -	return add_commit(commit, ids, 0);
> > +	struct patch_id *key = xcalloc(1, sizeof(*key));
> > +
> > +	if (init_patch_id_entry(key, commit, ids)) {
> > +		free(key);
> > +		return NULL;
> > +	}
> 
> This is a tangent, but this made me wonder if it is safe to simply
> free(3) the result of calling hashmap_entry_init() which is called
> in init_patch_id_entry().  It would obviously become a resource
> leak, if a hashmap_entry (which the api documentation says is "an
> opaque structure") holds any allocated resource.

It would be a serious bug if hashmap_entry_init() played games with
references, given its signature (that this function does not have any
access to the hashmap structure, only to the entry itself):

	void hashmap_entry_init(void *entry, unsigned int hash)

Please note that the `void *entry` really needs to point to a struct whose
first field is of type `struct hashmap_entry`. This is not type-safe, of
course, but C does not allow a strong sub-typing of the kind we want to
use here.

> The fact that hashmap_entry_init() is there but there is no
> corresponding hashmap_entry_clear() hints that there is nothing to
> be worried about and I can see from the implementation of
> hashmap_entry_init() that no extra resource is held inside, but an
> API user should not have to guess.  We may want to do one of the two
> things:
> 
>  * document that an embedded hashmap_entry does not hold any
>    resource that need to be released and it is safe to free the user
>    structure that embeds one; or
> 
>  * implement hashmap_entry_clear() that currently is a no-op.

Urgh. The only reason we have hashmap_entry_init() is that we *may* want
to extend `struct hashmap_entry` at some point. That is *already*
over-engineered because that point in time seems quite unlikely to arrive,
like, ever.

In that light, as you said, why would we overengineer things even further
by introducing a hashmap_entry_clear(), especially given that we won't
catch any forgotten _clear() calls, given that it is a no-op anyway?

Let's just not.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-07-29 21:46   ` Junio C Hamano
@ 2016-08-01  8:58     ` Johannes Schindelin
  2016-08-01 20:11       ` Junio C Hamano
  0 siblings, 1 reply; 25+ messages in thread
From: Johannes Schindelin @ 2016-08-01  8:58 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Kevin Willford, git, Kevin Willford

Hi Junio,

On Fri, 29 Jul 2016, Junio C Hamano wrote:

> Kevin Willford <kcwillford@gmail.com> writes:
> 
> >  static int patch_id_cmp(struct patch_id *a,
> >  			struct patch_id *b,
> > -			void *keydata)
> > +			struct diff_options *opt)
> >  {
> > +	if (is_null_sha1(a->patch_id) &&
> > +	    commit_patch_id(a->commit, opt, a->patch_id, 0))
> > +		return error("Could not get patch ID for %s",
> > +			oid_to_hex(&a->commit->object.oid));
> > +	if (is_null_sha1(b->patch_id) &&
> > +	    commit_patch_id(b->commit, opt, b->patch_id, 0))
> > +		return error("Could not get patch ID for %s",
> > +			oid_to_hex(&b->commit->object.oid));
> >  	return hashcmp(a->patch_id, b->patch_id);
> >  }
> 
> These error returns initially looks slightly iffy in that in general
> the caller of any_cmp_fn() wants to know how a/b compares, but by
> returning error(), it always says "a is smaller than b".

I am to blame, as this is my design.

And yes, it is kind of funny that we require a cmpfn that returns <0, ==0
and >0 for comparisons, when hashmaps try to avoid any order.

Do you want a note in the commit message about this "abuse" of a negative
return value, or a code comment?

> The idea of using the two level hash, computing the more expensive
> one only when the hashmap hashes of the result of the cheaper hash
> function collide, is excellent.

Thanks :-)

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content
  2016-07-29 20:22 ` [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Junio C Hamano
@ 2016-08-01  9:01   ` Johannes Schindelin
  0 siblings, 0 replies; 25+ messages in thread
From: Johannes Schindelin @ 2016-08-01  9:01 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Kevin Willford, git

Hi Junio,

On Fri, 29 Jul 2016, Junio C Hamano wrote:

> Kevin Willford <kcwillford@gmail.com> writes:
> 
> > This patch series is to remove the hand rolled hashmap in the patch_ids
> > and use the hashmap.h implementation.  It also introduces the idea of having
> > a header only patch id so that the contents of the files do not have to be
> > loaded in order to determine if two commits are different.
> 
> > Subject: Re: [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content
> 
> Did you do "format-patch --subject-prefix='[PATCH v2]'" or something
> like that?

This is my fault, sorry, I suggested this usage (misremembering the
semantics because I only use my mail-patch-series.sh helper and never use
format-patch manually anymore).

>     git format-patch --subject-prefix='PATCH v2'
> 
> or even better
> 
>     git format-patch -v2
> 
> would have been more appropriate.

Oooh, I did not know about the latter form. Nice.

Thanks,
Dscho

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-08-01  8:54     ` Johannes Schindelin
@ 2016-08-01 20:04       ` Junio C Hamano
  2016-08-01 22:34         ` Eric Wong
  2016-08-02 10:30         ` Johannes Schindelin
  0 siblings, 2 replies; 25+ messages in thread
From: Junio C Hamano @ 2016-08-01 20:04 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Kevin Willford, git, Kevin Willford

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> It would be a serious bug if hashmap_entry_init() played games with
> references, given its signature (that this function does not have any
> access to the hashmap structure, only to the entry itself):
>
> 	void hashmap_entry_init(void *entry, unsigned int hash)

I do not think we are on the same page.  The "reference to other
resource" I wondered was inside the hashmap_entry structure, IOW,
"the entry itself".

Which is declared to be opaque to the API users, so whoever defined
that API cannot blame me for not checking its definition to see that
it only has "unsigned int hash" and no allocated memory or open file
descriptor in it that needs freeing.

By the way, the first parameter of the function being "void *" is
merely to help lazy API users, who have their own structure that
embeds the hashmap_entry as its first element, as API documentation
tells them to do, e.g.

	struct foo {
        	struct hashmap_entry e;
                ... other "foo" specific fields come here ...
	} foo;

and because of the lazy "void *", they do not have to do this:

	hashmap_entry_init(&foo->e, ...);

which would be required if the first parameter were "struct
hashmap_entry *", but they can just do this:

	hashmap_entry_init(&foo, ...);

I have a slight preference to avoid the lazy "void *", but that is
an unrelated tangent.

>> The fact that hashmap_entry_init() is there but there is no
>> corresponding hashmap_entry_clear() hints that there is nothing to
>> be worried about and I can see from the implementation of
>> hashmap_entry_init() that no extra resource is held inside, but an
>> API user should not have to guess.  We may want to do one of the two
>> things:
>> 
>>  * document that an embedded hashmap_entry does not hold any
>>    resource that need to be released and it is safe to free the user
>>    structure that embeds one; or
>> 
>>  * implement hashmap_entry_clear() that currently is a no-op.
>
> Urgh. The only reason we have hashmap_entry_init() is that we *may* want
> to extend `struct hashmap_entry` at some point. That is *already*
> over-engineered because that point in time seems quite unlikely to arrive,
> like, ever.

I am saying that an uneven over-enginnering is bad.

To be consistent with having _init() and declaring the entry
structure to be opaque, you would either need _clear() or document
you would never need to worry about the lack of _clear().

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-08-01  8:58     ` Johannes Schindelin
@ 2016-08-01 20:11       ` Junio C Hamano
  2016-08-02  9:50         ` Jakub Narębski
  2016-08-02 10:45         ` Johannes Schindelin
  0 siblings, 2 replies; 25+ messages in thread
From: Junio C Hamano @ 2016-08-01 20:11 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Kevin Willford, git, Kevin Willford

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Hi Junio,
>
> On Fri, 29 Jul 2016, Junio C Hamano wrote:
>
>> Kevin Willford <kcwillford@gmail.com> writes:
>> 
>> >  static int patch_id_cmp(struct patch_id *a,
>> >  			struct patch_id *b,
>> > -			void *keydata)
>> > +			struct diff_options *opt)
>> >  {
>> > +	if (is_null_sha1(a->patch_id) &&
>> > +	    commit_patch_id(a->commit, opt, a->patch_id, 0))
>> > +		return error("Could not get patch ID for %s",
>> > +			oid_to_hex(&a->commit->object.oid));
>> > +	if (is_null_sha1(b->patch_id) &&
>> > +	    commit_patch_id(b->commit, opt, b->patch_id, 0))
>> > +		return error("Could not get patch ID for %s",
>> > +			oid_to_hex(&b->commit->object.oid));
>> >  	return hashcmp(a->patch_id, b->patch_id);
>> >  }
>> 
>> These error returns initially looks slightly iffy in that in general
>> the caller of any_cmp_fn() wants to know how a/b compares, but by
>> returning error(), it always says "a is smaller than b".
>
> I am to blame, as this is my design.
>
> And yes, it is kind of funny that we require a cmpfn that returns <0, ==0
> and >0 for comparisons, when hashmaps try to avoid any order.

Perhaps hashmap API needs fixing in the longer term not to call this
type hashmap_cmp_fn; instead it should lose cmp and say something
like hashmap_eq_fn or something.

> Do you want a note in the commit message about this "abuse" of a negative
> return value, or a code comment?

I do not think negative (or non-zero) return is an "abuse" at all.
It is misleading in the context of the function whose name has "cmp"
in it, but that is not the fault of this function, rather, the
breakage is more in the API that calls a function that wants to know
only equality a "cmp".  A in-code comment before the function name
may be appropriate:

        /*
         * hashmap API calls hashmap_cmp_fn, but it only wants
         * "does the key match the entry?" with 0 (matches) and
         * non-zero (does not match).
         */
        static int patch_id_match(const struct patch_id *ent,
                                  const struct patch_id *key,
                                  const void *keydata)
        {
                ...


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-08-01 20:04       ` Junio C Hamano
@ 2016-08-01 22:34         ` Eric Wong
  2016-08-02 10:30         ` Johannes Schindelin
  1 sibling, 0 replies; 25+ messages in thread
From: Eric Wong @ 2016-08-01 22:34 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Schindelin, Kevin Willford, git, Kevin Willford

Junio C Hamano <gitster@pobox.com> wrote:
> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> 
> > It would be a serious bug if hashmap_entry_init() played games with
> > references, given its signature (that this function does not have any
> > access to the hashmap structure, only to the entry itself):
> >
> > 	void hashmap_entry_init(void *entry, unsigned int hash)

<snip>

> I have a slight preference to avoid the lazy "void *", but that is
> an unrelated tangent.

Me too.  I noticed this while working on the http-walker
speedups and my self-rejected last patch:

  https://public-inbox.org/git/20160711210243.GA1604%40whir/

Extracting list_entry from list.h (currently in next[1]) and
exposing that generically as "container_of" for use with
hashmap_* would make it safer-to-use and allow structs to belong
to multiple hashmaps.

list_entry is also an alias for container_of in the Linux
kernel, but we don't have enough code using list_* to
warrant two names for the same macro.

[1] https://public-inbox.org/git/20160711205131.1291-4-e%4080x24.org/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-08-01 20:11       ` Junio C Hamano
@ 2016-08-02  9:50         ` Jakub Narębski
  2016-08-02 17:06           ` Junio C Hamano
  2016-08-02 10:45         ` Johannes Schindelin
  1 sibling, 1 reply; 25+ messages in thread
From: Jakub Narębski @ 2016-08-02  9:50 UTC (permalink / raw)
  To: Junio C Hamano, Johannes Schindelin; +Cc: Kevin Willford, git, Kevin Willford

W dniu 01.08.2016 o 22:11, Junio C Hamano pisze:
> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>> On Fri, 29 Jul 2016, Junio C Hamano wrote:

>>> These error returns initially looks slightly iffy in that in general
>>> the caller of any_cmp_fn() wants to know how a/b compares, but by
>>> returning error(), it always says "a is smaller than b".
>>
>> I am to blame, as this is my design.
>>
>> And yes, it is kind of funny that we require a cmpfn that returns <0, ==0
>> and >0 for comparisons, when hashmaps try to avoid any order.
> 
> Perhaps hashmap API needs fixing in the longer term not to call this
> type hashmap_cmp_fn; instead it should lose cmp and say something
> like hashmap_eq_fn or something.

The problem is that one expects hashmap_cmp_fn() to return ==0 on equality,
while one would expect for hashmap_eq_fn() to return true (==1) on equality.
So we would have to rewrite all calling sites.

It would be nice to have a comment about how hashmap uses cmpfn in
hashmap.h.  

Though... currently our hashmap implementation uses linked
list (separate chaining) for handling hash collisions (for collision
resolution). More sophisticated data structures, such as balanced search
trees, or splay trees, are worth considering only if the load factor is
large, or if the hash distribution is likely to be very non-uniform,
or if one must guarantee good performance even in a worst-case scenario.
So almost certainly comparing function (which is needed for trees)
won't be needed.

Best,
-- 
Jakub Narębski

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-08-01 20:04       ` Junio C Hamano
  2016-08-01 22:34         ` Eric Wong
@ 2016-08-02 10:30         ` Johannes Schindelin
  2016-08-02 17:01           ` Junio C Hamano
  1 sibling, 1 reply; 25+ messages in thread
From: Johannes Schindelin @ 2016-08-02 10:30 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Kevin Willford, git, Kevin Willford

Hi Junio,

On Mon, 1 Aug 2016, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> 
> > It would be a serious bug if hashmap_entry_init() played games with
> > references, given its signature (that this function does not have any
> > access to the hashmap structure, only to the entry itself):
> >
> > 	void hashmap_entry_init(void *entry, unsigned int hash)
> 
> I do not think we are on the same page.  The "reference to other
> resource" I wondered was inside the hashmap_entry structure, IOW,
> "the entry itself".

Oh, I see now.

> Which is declared to be opaque to the API users,

Actually, not really. We cannot do that in C: we need to define the struct
in hashmap.h so that its size is known to the users.

> so whoever defined that API cannot blame me for not checking its
> definition to see that it only has "unsigned int hash" and no allocated
> memory or open file descriptor in it that needs freeing.

That is the reason, I guess, why we have the documentation in
Documentation/technical/api-hashmap.txt: it would have to talk about your
hypothetical hashmap_entry_clear() (which would better be named
*_release() BTW, unless I misunderstood what you want a hypothetical
future version of that function to do).

And quite frankly, unless we *have* to, I would rather try to avoid
introducing that function as much as possible, as it would make using the
hashmap API even more finicky than it already is.

> By the way, the first parameter of the function being "void *" is
> merely to help lazy API users, who have their own structure that
> embeds the hashmap_entry as its first element, as API documentation
> tells them to do, e.g.
> 
> 	struct foo {
>         	struct hashmap_entry e;
>                 ... other "foo" specific fields come here ...
> 	} foo;
> 
> and because of the lazy "void *", they do not have to do this:
> 
> 	hashmap_entry_init(&foo->e, ...);
> 
> which would be required if the first parameter were "struct
> hashmap_entry *", but they can just do this:
> 
> 	hashmap_entry_init(&foo, ...);

Yes, I know that. It is the common way to simulate subclassing in C, for
lack of a more compile-safe construct.

> I have a slight preference to avoid the lazy "void *", but that is
> an unrelated tangent.

Oh, we are already safely in Unrelated Tangent Land for a while, I would
think. Nothing of what we are discussing in this thread has anything to do
with Kevin's patch series, which is about trying to use resources more
sensibly when using the revision machinery's --cherry-pick option.

And since we are already there, I'll offer an opinion in favor of `void
*`: doing the &foo->e dance could quite possibly suggest that `e` is a
field just like any other field (and does not necessarily *need* to be the
first).

But again, this has nothing to do with the patch series we are discussing
here.

> >> The fact that hashmap_entry_init() is there but there is no
> >> corresponding hashmap_entry_clear() hints that there is nothing to be
> >> worried about and I can see from the implementation of
> >> hashmap_entry_init() that no extra resource is held inside, but an
> >> API user should not have to guess.  We may want to do one of the two
> >> things:
> >> 
> >>  * document that an embedded hashmap_entry does not hold any
> >>    resource that need to be released and it is safe to free the user
> >>    structure that embeds one; or
> >> 
> >>  * implement hashmap_entry_clear() that currently is a no-op.
> >
> > Urgh. The only reason we have hashmap_entry_init() is that we *may* want
> > to extend `struct hashmap_entry` at some point. That is *already*
> > over-engineered because that point in time seems quite unlikely to arrive,
> > like, ever.
> 
> I am saying that an uneven over-enginnering is bad.

Hmm. I guess that the _init() function could be replaced by an _INIT macro
a la STRBUF_INIT. Not sure it is really worth the effort, though.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-08-01 20:11       ` Junio C Hamano
  2016-08-02  9:50         ` Jakub Narębski
@ 2016-08-02 10:45         ` Johannes Schindelin
  2016-08-02 17:08           ` Junio C Hamano
  1 sibling, 1 reply; 25+ messages in thread
From: Johannes Schindelin @ 2016-08-02 10:45 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Kevin Willford, git, Kevin Willford

Hi Junio,

On Mon, 1 Aug 2016, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> 
> > On Fri, 29 Jul 2016, Junio C Hamano wrote:
> >
> >> Kevin Willford <kcwillford@gmail.com> writes:
> >> 
> >> >  static int patch_id_cmp(struct patch_id *a,
> >> >  			struct patch_id *b,
> >> > -			void *keydata)
> >> > +			struct diff_options *opt)
> >> >  {
> >> > +	if (is_null_sha1(a->patch_id) &&
> >> > +	    commit_patch_id(a->commit, opt, a->patch_id, 0))
> >> > +		return error("Could not get patch ID for %s",
> >> > +			oid_to_hex(&a->commit->object.oid));
> >> > +	if (is_null_sha1(b->patch_id) &&
> >> > +	    commit_patch_id(b->commit, opt, b->patch_id, 0))
> >> > +		return error("Could not get patch ID for %s",
> >> > +			oid_to_hex(&b->commit->object.oid));
> >> >  	return hashcmp(a->patch_id, b->patch_id);
> >> >  }
> >> 
> >> These error returns initially looks slightly iffy in that in general
> >> the caller of any_cmp_fn() wants to know how a/b compares, but by
> >> returning error(), it always says "a is smaller than b".
> >
> > I am to blame, as this is my design.
> >
> > And yes, it is kind of funny that we require a cmpfn that returns <0, ==0
> > and >0 for comparisons, when hashmaps try to avoid any order.
> 
> Perhaps hashmap API needs fixing in the longer term not to call this
> type hashmap_cmp_fn; instead it should lose cmp and say something
> like hashmap_eq_fn or something.

Maybe.

But to make sure: you do not expect Kevin to do that in the context of
this here patch series, right?

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-08-02 10:30         ` Johannes Schindelin
@ 2016-08-02 17:01           ` Junio C Hamano
  2016-08-02 18:04             ` Junio C Hamano
  0 siblings, 1 reply; 25+ messages in thread
From: Junio C Hamano @ 2016-08-02 17:01 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Kevin Willford, git, Kevin Willford

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Hi Junio,
>
> On Mon, 1 Aug 2016, Junio C Hamano wrote:
>
>> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>> 
>> > It would be a serious bug if hashmap_entry_init() played games with
>> > references, given its signature (that this function does not have any
>> > access to the hashmap structure, only to the entry itself):
>> >
>> > 	void hashmap_entry_init(void *entry, unsigned int hash)
>> 
>> I do not think we are on the same page.  The "reference to other
>> resource" I wondered was inside the hashmap_entry structure, IOW,
>> "the entry itself".
>
> Oh, I see now.
>
>> Which is declared to be opaque to the API users,
>
> Actually, not really. We cannot do that in C: we need to define the struct
> in hashmap.h so that its size is known to the users.

What I meant was what Documentation/technical/api-hashmap.txt said.
I know that we cannot embed a true "opaque" structure in C just as
you do.

> That is the reason, I guess, why we have the documentation in
> Documentation/technical/api-hashmap.txt: it would have to talk about your
> hypothetical hashmap_entry_clear() (which would better be named
> *_release() BTW, unless I misunderstood what you want a hypothetical
> future version of that function to do).

I think there is no misunderstanding on your part.  I am following
the conclusion (as I recall) of a discussion on the list not in so
distant past about X_free(x) that frees the resource an instance of
"struct X" holds and also the instance itself, and X_clear(x) that
only frees the resources without freeing the instance (the latter of
which allows x to be on stack, you do X_init(&x) and conclude with
X_clear(&x)).  The name X_clear() is more in line with existing API
functions like string_list_clear(), argv_array_clear().  An oddball
is strbuf_release(), which I think made you make the above comment.

>> I have a slight preference to avoid the lazy "void *", but that is
>> an unrelated tangent.
>
> Oh, we are already safely in Unrelated Tangent Land for a while, I would
> think. Nothing of what we are discussing in this thread has anything to do
> with Kevin's patch series,...

Oh, no question about that.  Go back to my review, to which your
message I am responding to is a reply to.  What you wrote are all
about things after "This is a tangent, but this made me wonder if it
is safe to simply free(3) the result...", which pointed out rough
points in the hashmap API from the API user's point of view and
suggested a few possible improvement opportunities.

>> I am saying that an uneven over-enginnering is bad.
>
> Hmm. I guess that the _init() function could be replaced by an _INIT macro
> a la STRBUF_INIT. Not sure it is really worth the effort, though.

I do not think so, as X_init() and X_INIT() from the point of view
of the API user would not make much difference; as long as the
documentation does not say it is safe to make it go out of scope
without "deinitialize" it, the reader will be left wondering.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-08-02  9:50         ` Jakub Narębski
@ 2016-08-02 17:06           ` Junio C Hamano
  0 siblings, 0 replies; 25+ messages in thread
From: Junio C Hamano @ 2016-08-02 17:06 UTC (permalink / raw)
  To: Jakub Narębski
  Cc: Johannes Schindelin, Kevin Willford, git, Kevin Willford

Jakub Narębski <jnareb@gmail.com> writes:

> The problem is that one expects hashmap_cmp_fn() to return ==0 on equality,
> while one would expect for hashmap_eq_fn() to return true (==1) on equality.
> So we would have to rewrite all calling sites.

Yes, and I do not think it is a "problem".  There only are about a
dozen callsites of hashmap_init().

> It would be nice to have a comment about how hashmap uses cmpfn in
> hashmap.h.  

That is the absolute minimum but I think that is already done in the
Documentation/technical/api-hashmap.txt.

> Though... currently our hashmap implementation uses linked
> list (separate chaining) for handling hash collisions (for collision
> resolution). More sophisticated data structures, such as balanced search
> trees,...

But that requires total ordering of the elements registered in a
hashmap.  So far, because cmp_fn that was misnamed was only about
equality, you can safely use a hashmap to store things that do not
have inherent order among them.  Besides, if your hashmap has to
optimize the hash collision resolution part with complex strucure,
your hash function is bad or your hash bucket growing strategy is
suboptimal, or both, which is the first thing you should look at,
not the "now how would we find it in the bucket with too many
things?"

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-08-02 10:45         ` Johannes Schindelin
@ 2016-08-02 17:08           ` Junio C Hamano
  2016-08-04  3:00             ` Junio C Hamano
  0 siblings, 1 reply; 25+ messages in thread
From: Junio C Hamano @ 2016-08-02 17:08 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Kevin Willford, git, Kevin Willford

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

>> Perhaps hashmap API needs fixing in the longer term not to call this
>> type hashmap_cmp_fn; instead it should lose cmp and say something
>> like hashmap_eq_fn or something.
>
> Maybe.
>
> But to make sure: you do not expect Kevin to do that in the context of
> this here patch series, right?

I think I already answered this in the message you are responding
to.

> Do you want a note in the commit message about this "abuse" of a negative
> return value, or a code comment?

I do not think negative (or non-zero) return is an "abuse" at all.
It is misleading in the context of the function whose name has "cmp"
in it, but that is not the fault of this function, rather, the
breakage is more in the API that calls a function that wants to know
only equality a "cmp".  A in-code comment before the function name
may be appropriate:

        /*
         * hashmap API calls hashmap_cmp_fn, but it only wants
         * "does the key match the entry?" with 0 (matches) and
         * non-zero (does not match).
         */
        static int patch_id_match(const struct patch_id *ent,
                                  const struct patch_id *key,
                                  const void *keydata)
        {
                ...


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation
  2016-08-02 17:01           ` Junio C Hamano
@ 2016-08-02 18:04             ` Junio C Hamano
  0 siblings, 0 replies; 25+ messages in thread
From: Junio C Hamano @ 2016-08-02 18:04 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Kevin Willford, git, Kevin Willford

Junio C Hamano <gitster@pobox.com> writes:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>
>> Oh, we are already safely in Unrelated Tangent Land for a while, I would
>> think. Nothing of what we are discussing in this thread has anything to do
>> with Kevin's patch series,...
>
> Oh, no question about that.  Go back to my review, to which your
> message I am responding to is a reply to.  What you wrote are all
> about things after "This is a tangent, but this made me wonder if it
> is safe to simply free(3) the result...", which pointed out rough
> points in the hashmap API from the API user's point of view and
> suggested a few possible improvement opportunities.

In other words, I'd be happy with a patch like this, outside the
scope of and independent from this series.

When the hashmap_entry structure does acquire references to external
resources (which I wouldn't judge the likelihood of), this paragraph
will become stale, but that is exactly the point at which _clear()
function needs to be added to the API and described here, replacing
this paragraph.

I do not mind having an empty _clear() before that happens, but I do
not think it adds much safety.  A disciplined user of the API may
call that empty _clear() to make her code future-proof, but we know
there are undisciplined developers and reviewers and there will be
codepaths that call _init() without calling the empty _clear(), and
we won't notice it.  Whoever is adding the real need for _clear()
must audit the codebase at that point _anyway_, and that is why I
think having an empty _clear() before would not buy us much.

 Documentation/technical/api-hashmap.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Documentation/technical/api-hashmap.txt b/Documentation/technical/api-hashmap.txt
index ad7a5bd..1dcec3d 100644
--- a/Documentation/technical/api-hashmap.txt
+++ b/Documentation/technical/api-hashmap.txt
@@ -104,6 +104,11 @@ If `free_entries` is true, each hashmap_entry in the map is freed as well
 `entry` points to the entry to initialize.
 +
 `hash` is the hash code of the entry.
++
+The hashmap_entry structure does not hold references to external resources,
+and it is safe to just discard it once you are done with it (i.e. if
+your structure was allocated with xmalloc(), you can just free(3) it,
+and if it is on stack, you can just let it go out of scope).

 `void *hashmap_get(const struct hashmap *map, const void *key, const void *keydata)`::

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-08-02 17:08           ` Junio C Hamano
@ 2016-08-04  3:00             ` Junio C Hamano
  2016-08-04 14:21               ` Johannes Schindelin
  0 siblings, 1 reply; 25+ messages in thread
From: Junio C Hamano @ 2016-08-04  3:00 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Kevin Willford, git, Kevin Willford

Junio C Hamano <gitster@pobox.com> writes:

> I do not think negative (or non-zero) return is an "abuse" at all.
> It is misleading in the context of the function whose name has "cmp"
> in it, but that is not the fault of this function, rather, the
> breakage is more in the API that calls a function that wants to know
> only equality a "cmp".  A in-code comment before the function name
> may be appropriate:
>
>         /*
>          * hashmap API calls hashmap_cmp_fn, but it only wants
>          * "does the key match the entry?" with 0 (matches) and
>          * non-zero (does not match).
>          */
>         static int patch_id_match(const struct patch_id *ent,
>                                   const struct patch_id *key,
>                                   const void *keydata)
>         {
>                 ...

How about this one instead (to be squashed into 4/4)?

The updated wording directly addresses the puzzlement I initially
felt "This returns error() which is always negative, so comparing
(A, B) would say A < B, while comparing (B, A) would say B < A.
Would it cause a problem in the caller?" while reading the function
by being explicit that the sign does not matter.

 patch-ids.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/patch-ids.c b/patch-ids.c
index 0a4828a..082412a 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -16,6 +16,16 @@ int commit_patch_id(struct commit *commit, struct diff_options *options,
 	return diff_flush_patch_id(options, sha1, diff_header_only);
 }
 
+/*
+ * When we cannot load the full patch-id for both commits for whatever
+ * reason, the function returns -1 (i.e. return error(...)). Despite
+ * the "cmp" in the name of this function, the caller only cares about
+ * the return value being zero (a and b are equivalent) or non-zero (a
+ * and b are different), and returning non-zero would keep both in the
+ * result, even if they actually were equivalent, in order to err on
+ * the side of safety.  The actual value being negative does not have
+ * any significance; only that it is non-zero matters.
+ */
 static int patch_id_cmp(struct patch_id *a,
 			struct patch_id *b,
 			struct diff_options *opt)

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs
  2016-08-04  3:00             ` Junio C Hamano
@ 2016-08-04 14:21               ` Johannes Schindelin
  0 siblings, 0 replies; 25+ messages in thread
From: Johannes Schindelin @ 2016-08-04 14:21 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Kevin Willford, git, Kevin Willford

Hi Junio,

On Wed, 3 Aug 2016, Junio C Hamano wrote:

> Junio C Hamano <gitster@pobox.com> writes:
> 
> > I do not think negative (or non-zero) return is an "abuse" at all.
> > It is misleading in the context of the function whose name has "cmp"
> > in it, but that is not the fault of this function, rather, the
> > breakage is more in the API that calls a function that wants to know
> > only equality a "cmp".  A in-code comment before the function name
> > may be appropriate:
> >
> >         /*
> >          * hashmap API calls hashmap_cmp_fn, but it only wants
> >          * "does the key match the entry?" with 0 (matches) and
> >          * non-zero (does not match).
> >          */
> >         static int patch_id_match(const struct patch_id *ent,
> >                                   const struct patch_id *key,
> >                                   const void *keydata)
> >         {
> >                 ...
> 
> How about this one instead (to be squashed into 4/4)?
> 
> The updated wording directly addresses the puzzlement I initially
> felt "This returns error() which is always negative, so comparing
> (A, B) would say A < B, while comparing (B, A) would say B < A.
> Would it cause a problem in the caller?" while reading the function
> by being explicit that the sign does not matter.

Please squash it in. Kevin is on vacation and I am sure he is fine with
this change.

Thanks,
Dscho

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2016-08-04 14:21 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-29 16:19 [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Kevin Willford
2016-07-29 16:19 ` [[PATCH v2] 1/4] patch-ids: stop using a hand-rolled hashmap implementation Kevin Willford
2016-07-29 20:47   ` Junio C Hamano
2016-08-01  8:54     ` Johannes Schindelin
2016-08-01 20:04       ` Junio C Hamano
2016-08-01 22:34         ` Eric Wong
2016-08-02 10:30         ` Johannes Schindelin
2016-08-02 17:01           ` Junio C Hamano
2016-08-02 18:04             ` Junio C Hamano
2016-07-29 21:29   ` Junio C Hamano
2016-07-29 16:19 ` [[PATCH v2] 2/4] patch-ids: replace the seen indicator with a commit pointer Kevin Willford
2016-07-29 21:03   ` Junio C Hamano
2016-07-29 16:19 ` [[PATCH v2] 3/4] patch-ids: add flag to create the diff patch id using header only data Kevin Willford
2016-07-29 16:19 ` [[PATCH v2] 4/4] rebase: avoid computing unnecessary patch IDs Kevin Willford
2016-07-29 21:46   ` Junio C Hamano
2016-08-01  8:58     ` Johannes Schindelin
2016-08-01 20:11       ` Junio C Hamano
2016-08-02  9:50         ` Jakub Narębski
2016-08-02 17:06           ` Junio C Hamano
2016-08-02 10:45         ` Johannes Schindelin
2016-08-02 17:08           ` Junio C Hamano
2016-08-04  3:00             ` Junio C Hamano
2016-08-04 14:21               ` Johannes Schindelin
2016-07-29 20:22 ` [[PATCH v2] 0/4] Use header data patch ids for rebase to avoid loading file content Junio C Hamano
2016-08-01  9:01   ` Johannes Schindelin

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).