[PATCH] Use path comparison for patch ids before the file content

git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed

* [PATCH] Use path comparison for patch ids before the file content
@ 2016-07-14 20:17 Kevin Willford
  2016-07-14 22:13 ` Junio C Hamano
  2016-07-15 11:59 ` Johannes Schindelin
  0 siblings, 2 replies; 3+ messages in thread
From: Kevin Willford @ 2016-07-14 20:17 UTC (permalink / raw)
  To: git; +Cc: Kevin Willford

When limiting the list in a revision walk using cherry pick, patch ids are
calculated by producing the diff of the content of the files.  This would
be more efficent by using a patch id looking at the paths that were
changed in the commits and only if all the file changed are the same fall
back to getting the content of the files in the commits to determine if
the commits are the same.

This change uses a hashmap to store entries with a hash of the patch id
calculated by just using the file paths.  The entries constist of the
commit and the patch id calculated using file contents which is initially
empty.  If there are commits that are found in the hashmap it means that
the same files were changed in the commits and the file contents need to
be checked in order to determine if the commits are truly the same.  The
patch id that is calcuated based on the file contents is then stored in the
hashmap entry if needed in later comparisons.  If the commits are determined to be
the same the cherry_flag is set on the commit being checked as well as the
commit in the hashmap entry saving running though the original list of
commits checking a seen flag.  This will speed up a rebase where the
upstream has many changes but none of them have been pulled into the
current branch.
---
 diff.c      |  16 +++++----
 diff.h      |   2 +-
 patch-ids.c | 114 +++++++++++++++++++++++++++++-------------------------------
 patch-ids.h |   7 ++--
 revision.c  |  19 ++--------
 5 files changed, 73 insertions(+), 85 deletions(-)

diff --git a/diff.c b/diff.c
index fa78fc1..f38b663 100644
--- a/diff.c
+++ b/diff.c
@@ -4449,7 +4449,7 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
 }
 
 /* returns 0 upon success, and writes result into sha1 */
-static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
+static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1, int use_path_only)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
@@ -4484,9 +4484,6 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 
 		diff_fill_sha1_info(p->one);
 		diff_fill_sha1_info(p->two);
-		if (fill_mmfile(&mf1, p->one) < 0 ||
-				fill_mmfile(&mf2, p->two) < 0)
-			return error("unable to read files to diff");
 
 		len1 = remove_space(p->one->path, strlen(p->one->path));
 		len2 = remove_space(p->two->path, strlen(p->two->path));
@@ -4521,6 +4518,13 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 					len2, p->two->path);
 		git_SHA1_Update(&ctx, buffer, len1);
 
+		if (use_path_only)
+			continue;
+
+		if (fill_mmfile(&mf1, p->one) < 0 ||
+			fill_mmfile(&mf2, p->two) < 0)
+			return error("unable to read files to diff");
+
 		if (diff_filespec_is_binary(p->one) ||
 		    diff_filespec_is_binary(p->two)) {
 			git_SHA1_Update(&ctx, sha1_to_hex(p->one->sha1), 40);
@@ -4541,11 +4545,11 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 	return 0;
 }
 
-int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1)
+int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1, int use_path_only)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
-	int result = diff_get_patch_id(options, sha1);
+	int result = diff_get_patch_id(options, sha1, use_path_only);
 
 	for (i = 0; i < q->nr; i++)
 		diff_free_filepair(q->queue[i]);
diff --git a/diff.h b/diff.h
index 125447b..7883729 100644
--- a/diff.h
+++ b/diff.h
@@ -342,7 +342,7 @@ extern int run_diff_files(struct rev_info *revs, unsigned int option);
 extern int run_diff_index(struct rev_info *revs, int cached);
 
 extern int do_diff_cache(const unsigned char *, struct diff_options *);
-extern int diff_flush_patch_id(struct diff_options *, unsigned char *);
+extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int);
 
 extern int diff_result_code(struct diff_options *, int);
 
diff --git a/patch-ids.c b/patch-ids.c
index a4d0016..f0262ce 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -4,8 +4,9 @@
 #include "sha1-lookup.h"
 #include "patch-ids.h"
 
-int commit_patch_id(struct commit *commit, struct diff_options *options,
-		    unsigned char *sha1)
+
+static int ll_commit_patch_id(struct commit *commit, struct diff_options *options,
+	unsigned char *sha1, int use_path_only)
 {
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.oid.hash,
@@ -13,93 +14,90 @@ int commit_patch_id(struct commit *commit, struct diff_options *options,
 	else
 		diff_root_tree_sha1(commit->object.oid.hash, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
-}
 
-static const unsigned char *patch_id_access(size_t index, void *table)
-{
-	struct patch_id **id_table = table;
-	return id_table[index]->patch_id;
+
+	return diff_flush_patch_id(options, sha1, use_path_only);
 }
 
-static int patch_pos(struct patch_id **table, int nr, const unsigned char *id)
+int commit_patch_id(struct commit *commit, struct diff_options *options,
+		    unsigned char *sha1)
 {
-	return sha1_pos(id, table, nr, patch_id_access);
+	return ll_commit_patch_id(commit, options, sha1, 0);
 }
 
-#define BUCKET_SIZE 190 /* 190 * 21 = 3990, with slop close enough to 4K */
-struct patch_id_bucket {
-	struct patch_id_bucket *next;
-	int nr;
-	struct patch_id bucket[BUCKET_SIZE];
-};
-
 int init_patch_ids(struct patch_ids *ids)
 {
 	memset(ids, 0, sizeof(*ids));
 	diff_setup(&ids->diffopts);
 	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
 	diff_setup_done(&ids->diffopts);
+	hashmap_init(&ids->patches, NULL, 0);
 	return 0;
 }
 
 int free_patch_ids(struct patch_ids *ids)
 {
-	struct patch_id_bucket *next, *patches;
-
-	free(ids->table);
-	for (patches = ids->patches; patches; patches = next) {
-		next = patches->next;
-		free(patches);
-	}
+	hashmap_free(&ids->patches, 1);
 	return 0;
 }
 
-static struct patch_id *add_commit(struct commit *commit,
-				   struct patch_ids *ids,
-				   int no_add)
+struct patch_id *has_commit_patch_id(struct commit *commit,
+				     struct patch_ids *ids)
 {
-	struct patch_id_bucket *bucket;
-	struct patch_id *ent;
+	struct patch_id *entry;
 	unsigned char sha1[20];
-	int pos;
+	unsigned char sha1_2[20];
+	struct patch_id key;
 
-	if (commit_patch_id(commit, &ids->diffopts, sha1))
+	if (ll_commit_patch_id(commit, &ids->diffopts, sha1, 1))
 		return NULL;
-	pos = patch_pos(ids->table, ids->nr, sha1);
-	if (0 <= pos)
-		return ids->table[pos];
-	if (no_add)
+
+	memset(&key, 0, sizeof(key));
+	hashmap_entry_init(&key, sha1hash(sha1));
+	key.commit = commit;
+
+	entry = hashmap_get(&ids->patches, &key, NULL);
+	if (!entry)
+		return NULL;
+
+	/*
+	 * Since we found a match in the hashmap that means the
+	 * same files where changed in this commit so we need to
+	 * get the patch id using the contents of the files to
+	 * compare with each of the commits in the hash bucket
+	 * to determine if the commits are truely the same
+	 */
+	if (ll_commit_patch_id(commit, &ids->diffopts, sha1, 0))
 		return NULL;
 
-	pos = -1 - pos;
+	for (; entry; entry = hashmap_get_next(&ids->patches, entry)) {
+		if (is_null_sha1(entry->patch_id))
+		{
+			if (ll_commit_patch_id(entry->commit, &ids->diffopts, sha1_2, 0))
+				return NULL;
+
+			hashcpy(entry->patch_id, sha1_2);
+		}
 
-	bucket = ids->patches;
-	if (!bucket || (BUCKET_SIZE <= bucket->nr)) {
-		bucket = xcalloc(1, sizeof(*bucket));
-		bucket->next = ids->patches;
-		ids->patches = bucket;
+		if (!hashcmp(entry->patch_id, sha1))
+			return entry;
 	}
-	ent = &bucket->bucket[bucket->nr++];
-	hashcpy(ent->patch_id, sha1);
-
-	ALLOC_GROW(ids->table, ids->nr + 1, ids->alloc);
-	if (pos < ids->nr)
-		memmove(ids->table + pos + 1, ids->table + pos,
-			sizeof(ent) * (ids->nr - pos));
-	ids->nr++;
-	ids->table[pos] = ent;
-	return ids->table[pos];
-}
 
-struct patch_id *has_commit_patch_id(struct commit *commit,
-				     struct patch_ids *ids)
-{
-	return add_commit(commit, ids, 1);
+	return NULL;
 }
 
 struct patch_id *add_commit_patch_id(struct commit *commit,
 				     struct patch_ids *ids)
 {
-	return add_commit(commit, ids, 0);
-}
+	unsigned char sha1[20];
+	if (ll_commit_patch_id(commit, &ids->diffopts, sha1, 1))
+		return NULL;
+
+	struct patch_id *key = xmalloc(sizeof(*key));
+	memset(key, 0, sizeof(*key));
+	hashmap_entry_init(key, sha1hash(sha1));
+	key->commit = commit;
+	hashmap_add(&ids->patches, key);
+
+	return key;
+}
\ No newline at end of file
diff --git a/patch-ids.h b/patch-ids.h
index eeb56b3..58e55ae 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -2,15 +2,14 @@
 #define PATCH_IDS_H
 
 struct patch_id {
+	struct hashmap_entry ent;
 	unsigned char patch_id[20];
-	char seen;
+	struct commit *commit;
 };
 
 struct patch_ids {
+	struct hashmap patches;
 	struct diff_options diffopts;
-	int nr, alloc;
-	struct patch_id **table;
-	struct patch_id_bucket *patches;
 };
 
 int commit_patch_id(struct commit *commit, struct diff_options *options,
diff --git a/revision.c b/revision.c
index edba5b7..f715345 100644
--- a/revision.c
+++ b/revision.c
@@ -846,7 +846,7 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
 		 */
 		if (left_first != !!(flags & SYMMETRIC_LEFT))
 			continue;
-		commit->util = add_commit_patch_id(commit, &ids);
+		add_commit_patch_id(commit, &ids);
 	}
 
 	/* either cherry_mark or cherry_pick are true */
@@ -873,23 +873,10 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
 		id = has_commit_patch_id(commit, &ids);
 		if (!id)
 			continue;
-		id->seen = 1;
-		commit->object.flags |= cherry_flag;
-	}
-
-	/* Now check the original side for seen ones */
-	for (p = list; p; p = p->next) {
-		struct commit *commit = p->item;
-		struct patch_id *ent;
 
-		ent = commit->util;
-		if (!ent)
-			continue;
-		if (ent->seen)
-			commit->object.flags |= cherry_flag;
-		commit->util = NULL;
+		commit->object.flags |= cherry_flag;
+		id->commit->object.flags |= cherry_flag;
 	}
-
 	free_patch_ids(&ids);
 }
 
-- 
2.9.0.windows.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] Use path comparison for patch ids before the file content
  2016-07-14 20:17 [PATCH] Use path comparison for patch ids before the file content Kevin Willford
@ 2016-07-14 22:13 ` Junio C Hamano
  2016-07-15 11:59 ` Johannes Schindelin
  1 sibling, 0 replies; 3+ messages in thread
From: Junio C Hamano @ 2016-07-14 22:13 UTC (permalink / raw)
  To: Kevin Willford; +Cc: git

Kevin Willford <kcwillford@gmail.com> writes:

> When limiting the list in a revision walk using cherry pick, patch ids are
> calculated by producing the diff of the content of the files.  This would
> be more efficent by using a patch id looking at the paths that were
> changed in the commits and only if all the file changed are the same fall
> back to getting the content of the files in the commits to determine if
> the commits are the same.

The basic idea of this change makes sense.  When we have many
commits, but if we can tell that no other commit changes the same
set of paths as this commit does, we can immediately know that this
commit cannot have an equivalent other commit among the rest.  By
first computing a lot cheaper "hash of touched paths" for commits,
and throwing them into separate bins keyed by the "hash of touched
paths", you can narrow the commits whose patch IDs must be compared,
and if a bin happens to be a singleton, you do not even need to
produce any patch ID by running a textual diff.  I like it.

Explaining this as "hash of touched paths" is somewhat misleading.
Your "use_path_only" mode actually hashes a lot more than just
paths.  Because the "use_path_only" mode actually hashes the entire
basic diff header and not just paths, it can differentiate a commit
that adds a file and another commit that modifies the same file, for
example.

> ...  This will speed up a rebase where the
> upstream has many changes but none of them have been pulled into the
> current branch.
> ---

Missing sign-off.

>  diff.c      |  16 +++++----
>  diff.h      |   2 +-

The changes in the above two files looked OK to me.
I didn't read the changes to the other three files carefully.

>  patch-ids.c | 114 +++++++++++++++++++++++++++++-------------------------------
>  patch-ids.h |   7 ++--
>  revision.c  |  19 ++--------
>  5 files changed, 73 insertions(+), 85 deletions(-)

>
> diff --git a/patch-ids.c b/patch-ids.c
> index a4d0016..f0262ce 100644
> --- a/patch-ids.c
> +++ b/patch-ids.c
> @@ -4,8 +4,9 @@
> ...
> +}
> \ No newline at end of file

No newline at end of file.

Thanks.

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] Use path comparison for patch ids before the file content
  2016-07-14 20:17 [PATCH] Use path comparison for patch ids before the file content Kevin Willford
  2016-07-14 22:13 ` Junio C Hamano
@ 2016-07-15 11:59 ` Johannes Schindelin
  1 sibling, 0 replies; 3+ messages in thread
From: Johannes Schindelin @ 2016-07-15 11:59 UTC (permalink / raw)
  To: Kevin Willford; +Cc: git

Hi Kevin,

congratulations for your first patch on the Git mailing list!

On Thu, 14 Jul 2016, Kevin Willford wrote:

> When limiting the list in a revision walk using cherry pick, patch ids are
> calculated by producing the diff of the content of the files.  This would
> be more efficent by using a patch id looking at the paths that were
> changed in the commits and only if all the file changed are the same fall
> back to getting the content of the files in the commits to determine if
> the commits are the same.
> 
> This change uses a hashmap to store entries with a hash of the patch id
> calculated by just using the file paths.  The entries constist of the
> commit and the patch id calculated using file contents which is initially
> empty.  If there are commits that are found in the hashmap it means that
> the same files were changed in the commits and the file contents need to
> be checked in order to determine if the commits are truly the same.  The
> patch id that is calcuated based on the file contents is then stored in the
> hashmap entry if needed in later comparisons.  If the commits are determined to be
> the same the cherry_flag is set on the commit being checked as well as the
> commit in the hashmap entry saving running though the original list of
> commits checking a seen flag.  This will speed up a rebase where the
> upstream has many changes but none of them have been pulled into the
> current branch.
> ---

How about pulling out the change that replaces patch-id's ad-hoc hashmap
with the struct hashmap solution?

It would become a two-part patch series, then.

BTW I tried my hand at a different commit message, maybe you want to
cherry-pick parts of it?

-- snip --
rebase: avoid computing unnecessary patch IDs

The `rebase` family of Git commands avoid applying patches that were
already integrated upstream. They do that by using the revision walking
option that computes the patch IDs of the two sides of the rebase
(local-only patches vs upstream-only ones) and skipping those local
patches whose patch ID matches one of the upstream ones.

In many cases, this causes unnecessary churn, as already the set of
paths touched by a given commit would suffice to determine that an
upstream patch has no local equivalent.

This hurts performance in particular when there are a lot of upstream
patches, and/or large ones.

Therefore, let's introduce the concept of a "header-only" patch ID,
compare those first, and only evaluate the "full" patch ID lazily.

Please note that in contrast to the "full" patch IDs, those "header-only"
patch IDs are prone to collide with one another, as adjacent commits
frequently touch the very same files. Hence we now have to be careful to
allow multiple hash entries with the same hash.
-- snap --

And as Junio pointed out, please add your Signed-off-by: lines (see
https://github.com/git/git/blob/v2.9.1/Documentation/SubmittingPatches#L239-L291
for an explanation).

And here are a couple of comments on the code (please read all the way to
the end, I cut the parts that I do not address):

> diff --git a/diff.c b/diff.c
> index fa78fc1..f38b663 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -4449,7 +4449,7 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
>  }
>  
>  /* returns 0 upon success, and writes result into sha1 */
> -static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
> +static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1, int use_path_only)

If we used `diff_header_only`, we could address Junio's concern about the
naming of this parameter.

> diff --git a/patch-ids.c b/patch-ids.c
> index a4d0016..f0262ce 100644
> --- a/patch-ids.c
> +++ b/patch-ids.c
> @@ -4,8 +4,9 @@
>  #include "sha1-lookup.h"
>  #include "patch-ids.h"
>  
> -int commit_patch_id(struct commit *commit, struct diff_options *options,
> -		    unsigned char *sha1)
> +
> +static int ll_commit_patch_id(struct commit *commit, struct diff_options *options,

How about simply changing the signature of the commit_patch_id() function?
It's not like Git guarantees any kind of stable API of its libgit.a or
something.

> +int commit_patch_id(struct commit *commit, struct diff_options *options,
> +		    unsigned char *sha1)
>  {
> -	return sha1_pos(id, table, nr, patch_id_access);
> +	return ll_commit_patch_id(commit, options, sha1, 0);
>  }
>  
> -#define BUCKET_SIZE 190 /* 190 * 21 = 3990, with slop close enough to 4K */
> -struct patch_id_bucket {
> -	struct patch_id_bucket *next;
> -	int nr;
> -	struct patch_id bucket[BUCKET_SIZE];
> -};

The idea of the original code was to get as close to 4kB for the initial
(and probably final) hashmap. I do not think we can, or have to, achieve
the same with struct hashmap. But we should use a larger initial size than
64 (maybe 128? I would actually go for 256, even if that roughly doubles
the initial hashmap size) by passing an explicit value to hashmap_init():

>  int init_patch_ids(struct patch_ids *ids)
>  {
>  	memset(ids, 0, sizeof(*ids));
>  	diff_setup(&ids->diffopts);
>  	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
>  	diff_setup_done(&ids->diffopts);
> +	hashmap_init(&ids->patches, NULL, 0);

Like here (just replace the 0 by 128 or 256).

Also, since the "header-only" patch ID makes hash collisions much more
likely (I, for one, frequently have patches in the same patch series that
touch the very same set of files), we need to provide a hashmap_cmp_fn.

For the hashmap population, I would suggest a function like this:

static int patch_id_cmp(const struct patch_id *a, constr struct patch_id *b,
	const void *keydata)
{
	return hashcmp(a->commit, b->commit);
}

This will guarantee that all the commits will be added to the hash map,
even if some "header-only" patch IDs collide.

(If you go with my suggestion to split out the conversion to the struct
hashmap in a preparatory patch, then you probably want to use
hashcmp(a->patch_id, b->patch_id) in the first patch, and then replace it
with the version above in the second patch.)

Of course, we will need to change the cmpfn after populating the hashmap.
See below for my thoughts on that:

>  	return 0;
>  }
>  
>  int free_patch_ids(struct patch_ids *ids)
>  {
> -	struct patch_id_bucket *next, *patches;
> -
> -	free(ids->table);
> -	for (patches = ids->patches; patches; patches = next) {
> -		next = patches->next;
> -		free(patches);
> -	}
> +	hashmap_free(&ids->patches, 1);
>  	return 0;
>  }
>  
> -static struct patch_id *add_commit(struct commit *commit,
> -				   struct patch_ids *ids,
> -				   int no_add)
> +struct patch_id *has_commit_patch_id(struct commit *commit,
> +				     struct patch_ids *ids)
>  {
> -	struct patch_id_bucket *bucket;
> -	struct patch_id *ent;
> +	struct patch_id *entry;
>  	unsigned char sha1[20];
> -	int pos;
> +	unsigned char sha1_2[20];
> +	struct patch_id key;
>  
> -	if (commit_patch_id(commit, &ids->diffopts, sha1))
> +	if (ll_commit_patch_id(commit, &ids->diffopts, sha1, 1))
>  		return NULL;
> -	pos = patch_pos(ids->table, ids->nr, sha1);
> -	if (0 <= pos)
> -		return ids->table[pos];
> -	if (no_add)
> +
> +	memset(&key, 0, sizeof(key));
> +	hashmap_entry_init(&key, sha1hash(sha1));
> +	key.commit = commit;
> +
> +	entry = hashmap_get(&ids->patches, &key, NULL);
> +	if (!entry)
> +		return NULL;
> +
> +	/*
> +	 * Since we found a match in the hashmap that means the
> +	 * same files where changed in this commit so we need to
> +	 * get the patch id using the contents of the files to
> +	 * compare with each of the commits in the hash bucket
> +	 * to determine if the commits are truely the same
> +	 */
> +	if (ll_commit_patch_id(commit, &ids->diffopts, sha1, 0))
>  		return NULL;
>  
> -	pos = -1 - pos;
> +	for (; entry; entry = hashmap_get_next(&ids->patches, entry)) {
> +		if (is_null_sha1(entry->patch_id))
> +		{
> +			if (ll_commit_patch_id(entry->commit, &ids->diffopts, sha1_2, 0))
> +				return NULL;
> +
> +			hashcpy(entry->patch_id, sha1_2);
> +		}
>  
> -	bucket = ids->patches;
> -	if (!bucket || (BUCKET_SIZE <= bucket->nr)) {
> -		bucket = xcalloc(1, sizeof(*bucket));
> -		bucket->next = ids->patches;
> -		ids->patches = bucket;
> +		if (!hashcmp(entry->patch_id, sha1))
> +			return entry;
>  	}
> -	ent = &bucket->bucket[bucket->nr++];
> -	hashcpy(ent->patch_id, sha1);
> -
> -	ALLOC_GROW(ids->table, ids->nr + 1, ids->alloc);
> -	if (pos < ids->nr)
> -		memmove(ids->table + pos + 1, ids->table + pos,
> -			sizeof(ent) * (ids->nr - pos));
> -	ids->nr++;
> -	ids->table[pos] = ent;
> -	return ids->table[pos];
> -}
>  
> -struct patch_id *has_commit_patch_id(struct commit *commit,
> -				     struct patch_ids *ids)
> -{
> -	return add_commit(commit, ids, 1);
> +	return NULL;
>  }

If you go with my suggestion of patch_id_cmp() above, you will have to
change the cmpfn after the hashmap-populating phase.

Happily, we know that this phase is over as soon as has_commit_patch_id()
is called, and it does not hurt to re-set the cmpfn multiple times.

Even better: if you re-set the cmpfn, you can now use something like this:

static int full_patch_id_cmp(const struct patch_id *a,
    const struct patch_id *b, const diff_options *opt)
{
    if (is_null_sha1(a->patch_id) &&
      commit_patch_id(a->commit, opt, a->patch_id, 0))
        return error("Could not get patch ID for %s",
            oid_to_hex(&a->commit->object.oid));
    if (is_null_sha1(b->patch_id) &&
      commit_patch_id(b->commit, opt, b->patch_id, 0))
        return error("Could not get patch ID for %s",
            oid_to_hex(&b->commit->object.oid));
    return hashcmp(a->patch_id, b->patch_id);
}

With this cmpfn, your has_commit_patch_id() would become something like
this:

struct patch_id *has_commit_patch_id(struct commit *commit,
				     struct patch_ids *ids)
{
	unsigned char sha1[20];
	struct patch_id key;

	if (commit_patch_id(commit, &ids->diffopt, sha1, 1))
		return NULL;

	ids->patches.cmpfn = full_patch_id_cmp;
	memset(&key, 0, sizeof(key));
	key.hash = sha1hash(sha1);
	key.commit = commit;
	return hashmap_get(&ids->patches, &key, &ids->diffopt);
}

>  struct patch_id *add_commit_patch_id(struct commit *commit,
>  				     struct patch_ids *ids)
>  {
> -	return add_commit(commit, ids, 0);
> -}
> +	unsigned char sha1[20];
> +	if (ll_commit_patch_id(commit, &ids->diffopts, sha1, 1))
> +		return NULL;
> +
> +	struct patch_id *key = xmalloc(sizeof(*key));

Git's source code does not allow declarations after statements, therefore
`struct patch_id *key;` needs to come before the if().

> +	memset(key, 0, sizeof(*key));

Since you want to initialize it to zeroes anyway, how about

	key = xcalloc(1, sizeof(*key));

> @@ -873,23 +873,10 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
>  		id = has_commit_patch_id(commit, &ids);
>  		if (!id)
>  			continue;
> -		id->seen = 1;
> -		commit->object.flags |= cherry_flag;
> -	}
> -
> -	/* Now check the original side for seen ones */
> -	for (p = list; p; p = p->next) {
> -		struct commit *commit = p->item;
> -		struct patch_id *ent;
>  
> -		ent = commit->util;
> -		if (!ent)
> -			continue;
> -		if (ent->seen)
> -			commit->object.flags |= cherry_flag;
> -		commit->util = NULL;
> +		commit->object.flags |= cherry_flag;
> +		id->commit->object.flags |= cherry_flag;

I really like this simplification.

>  	}
> -
>  	free_patch_ids(&ids);

Let's keep this empty line here; There is no need to change this existing
code.

BTW I could imagine splitting out even just the feature to compute
header-only patch IDs, and keep the changes to make use of it for a third
patch in the series.

I also think that this patch series could use a test that verifies that we
no longer generate unneeded diffs. Maybe by rebasing a commit on top of an
commit touching a different file, after corrupting the blob of the latter
one? Let me give it a try.

Ciao,
Johannes

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2016-07-15 11:59 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-14 20:17 [PATCH] Use path comparison for patch ids before the file content Kevin Willford
2016-07-14 22:13 ` Junio C Hamano
2016-07-15 11:59 ` Johannes Schindelin

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).