git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH] promisor-remote.c: use oidset for deduplication
@ 2022-01-13 20:32 John Cai via GitGitGadget
  2022-01-13 23:45 ` Junio C Hamano
  2022-01-14 12:11 ` Ævar Arnfjörð Bjarmason
  0 siblings, 2 replies; 7+ messages in thread
From: John Cai via GitGitGadget @ 2022-01-13 20:32 UTC (permalink / raw)
  To: git; +Cc: John Cai, John Cai

From: John Cai <johncai86@gmail.com>

swap out oid_array for oidset in promisor-remote.c since we get
a deduplicated set of oids to operate on.

swap our calls for oid_array for oidset in callers of
promisor_remote_get_direct().

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: John Cai <johncai86@gmail.com>
---
    promisor-remote.c: use oidset for deduplication
    
    
    swap out oid_array for oidset in promisor-remote.c since we get a
    deduplicated set of oids to operate on. Any direct fetch we can save
    will be well worth it.
    
    oidset does use a slightly larger memory footprint than oid_array, so
    this change is a tradeoff between memory footprint and network calls.
    
    What I'm not sure about is if it's worth swapping out all the calls of
    oid_array for oidset in callers of promisor_remote_get_direct().

Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-git-1187%2Fjohn-cai%2Fjc-dedupe-promisor-fetch-v1
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-git-1187/john-cai/jc-dedupe-promisor-fetch-v1
Pull-Request: https://github.com/git/git/pull/1187

 builtin/index-pack.c   |  9 +++---
 builtin/pack-objects.c |  9 +++---
 diff.c                 | 13 +++-----
 diffcore-rename.c      | 12 +++----
 diffcore.h             |  3 +-
 merge-ort.c            |  8 ++---
 object-file.c          |  4 ++-
 promisor-remote.c      | 72 ++++++++++++++----------------------------
 promisor-remote.h      |  6 ++--
 read-cache.c           |  9 +++---
 10 files changed, 57 insertions(+), 88 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 3c2e6aee3cc..7de2eb42794 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1387,18 +1387,17 @@ static void fix_unresolved_deltas(struct hashfile *f)
 		/*
 		 * Prefetch the delta bases.
 		 */
-		struct oid_array to_fetch = OID_ARRAY_INIT;
+		struct oidset to_fetch = OIDSET_INIT;
 		for (i = 0; i < nr_ref_deltas; i++) {
 			struct ref_delta_entry *d = sorted_by_pos[i];
 			if (!oid_object_info_extended(the_repository, &d->oid,
 						      NULL,
 						      OBJECT_INFO_FOR_PREFETCH))
 				continue;
-			oid_array_append(&to_fetch, &d->oid);
+			oidset_insert(&to_fetch, &d->oid);
 		}
-		promisor_remote_get_direct(the_repository,
-					   to_fetch.oid, to_fetch.nr);
-		oid_array_clear(&to_fetch);
+		promisor_remote_get_direct(the_repository, &to_fetch);
+		oidset_clear(&to_fetch);
 	}
 
 	for (i = 0; i < nr_ref_deltas; i++) {
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ba2006f2212..4cef3bd78b2 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1894,7 +1894,7 @@ static int can_reuse_delta(const struct object_id *base_oid,
 }
 
 static void prefetch_to_pack(uint32_t object_index_start) {
-	struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct oidset to_fetch = OIDSET_INIT;
 	uint32_t i;
 
 	for (i = object_index_start; i < to_pack.nr_objects; i++) {
@@ -1905,11 +1905,10 @@ static void prefetch_to_pack(uint32_t object_index_start) {
 					      NULL,
 					      OBJECT_INFO_FOR_PREFETCH))
 			continue;
-		oid_array_append(&to_fetch, &entry->idx.oid);
+		oidset_insert(&to_fetch, &entry->idx.oid);
 	}
-	promisor_remote_get_direct(the_repository,
-				   to_fetch.oid, to_fetch.nr);
-	oid_array_clear(&to_fetch);
+	promisor_remote_get_direct(the_repository, &to_fetch);
+	oidset_clear(&to_fetch);
 }
 
 static void check_object(struct object_entry *entry, uint32_t object_index)
diff --git a/diff.c b/diff.c
index c862771a589..e48a5c7bfcc 100644
--- a/diff.c
+++ b/diff.c
@@ -6609,14 +6609,14 @@ void diffcore_fix_diff_index(void)
 }
 
 void diff_add_if_missing(struct repository *r,
-			 struct oid_array *to_fetch,
+			 struct oidset *to_fetch,
 			 const struct diff_filespec *filespec)
 {
 	if (filespec && filespec->oid_valid &&
 	    !S_ISGITLINK(filespec->mode) &&
 	    oid_object_info_extended(r, &filespec->oid, NULL,
 				     OBJECT_INFO_FOR_PREFETCH))
-		oid_array_append(to_fetch, &filespec->oid);
+		oidset_insert(to_fetch, &filespec->oid);
 }
 
 void diff_queued_diff_prefetch(void *repository)
@@ -6624,7 +6624,7 @@ void diff_queued_diff_prefetch(void *repository)
 	struct repository *repo = repository;
 	int i;
 	struct diff_queue_struct *q = &diff_queued_diff;
-	struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct oidset to_fetch = OIDSET_INIT;
 
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
@@ -6632,12 +6632,9 @@ void diff_queued_diff_prefetch(void *repository)
 		diff_add_if_missing(repo, &to_fetch, p->two);
 	}
 
-	/*
-	 * NEEDSWORK: Consider deduplicating the OIDs sent.
-	 */
-	promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);
+	promisor_remote_get_direct(repo, &to_fetch);
 
-	oid_array_clear(&to_fetch);
+	oidset_clear(&to_fetch);
 }
 
 void diffcore_std(struct diff_options *options)
diff --git a/diffcore-rename.c b/diffcore-rename.c
index bebd4ed6a42..2fe89ef01b8 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -95,7 +95,7 @@ static void inexact_prefetch(void *prefetch_options)
 {
 	struct inexact_prefetch_options *options = prefetch_options;
 	int i;
-	struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct oidset to_fetch = OIDSET_INIT;
 
 	for (i = 0; i < rename_dst_nr; i++) {
 		if (rename_dst[i].p->renamed_pair)
@@ -118,8 +118,8 @@ static void inexact_prefetch(void *prefetch_options)
 		diff_add_if_missing(options->repo, &to_fetch,
 				    rename_src[i].p->one);
 	}
-	promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr);
-	oid_array_clear(&to_fetch);
+	promisor_remote_get_direct(options->repo, &to_fetch);
+	oidset_clear(&to_fetch);
 }
 
 static int estimate_similarity(struct repository *r,
@@ -837,7 +837,7 @@ static void basename_prefetch(void *prefetch_options)
 	struct strintmap *dests = options->dests;
 	struct dir_rename_info *info = options->info;
 	int i;
-	struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct oidset to_fetch = OIDSET_INIT;
 
 	/*
 	 * TODO: The following loops mirror the code/logic from
@@ -890,8 +890,8 @@ static void basename_prefetch(void *prefetch_options)
 		}
 	}
 
-	promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr);
-	oid_array_clear(&to_fetch);
+	promisor_remote_get_direct(options->repo, &to_fetch);
+	oidset_clear(&to_fetch);
 }
 
 static int find_basename_matches(struct diff_options *options,
diff --git a/diffcore.h b/diffcore.h
index badc2261c20..9f593e01165 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -11,6 +11,7 @@ struct repository;
 struct strintmap;
 struct strmap;
 struct userdiff_driver;
+struct oidset;
 
 /* This header file is internal between diff.c and its diff transformers
  * (e.g. diffcore-rename, diffcore-pickaxe).  Never include this header
@@ -229,7 +230,7 @@ int diffcore_count_changes(struct repository *r,
  * repository, add that OID to to_fetch.
  */
 void diff_add_if_missing(struct repository *r,
-			 struct oid_array *to_fetch,
+			 struct oidset *to_fetch,
 			 const struct diff_filespec *filespec);
 
 #endif
diff --git a/merge-ort.c b/merge-ort.c
index c3197970219..107c086bb6a 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -3903,7 +3903,7 @@ static void prefetch_for_content_merges(struct merge_options *opt,
 					struct string_list *plist)
 {
 	struct string_list_item *e;
-	struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct oidset to_fetch = OIDSET_INIT;
 
 	if (opt->repo != the_repository || !has_promisor_remote())
 		return;
@@ -3939,12 +3939,12 @@ static void prefetch_for_content_merges(struct merge_options *opt,
 			    S_ISREG(vi->mode) &&
 			    oid_object_info_extended(opt->repo, &vi->oid, NULL,
 						     OBJECT_INFO_FOR_PREFETCH))
-				oid_array_append(&to_fetch, &vi->oid);
+				oidset_insert(&to_fetch, &vi->oid);
 		}
 	}
 
-	promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);
-	oid_array_clear(&to_fetch);
+	promisor_remote_get_direct(opt->repo, &to_fetch);
+	oidset_clear(&to_fetch);
 }
 
 static void process_entries(struct merge_options *opt,
diff --git a/object-file.c b/object-file.c
index 8be57f48de7..baec8f099de 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1519,6 +1519,7 @@ static int do_oid_object_info_extended(struct repository *r,
 	struct cached_object *co;
 	struct pack_entry e;
 	int rtype;
+	struct oidset to_fetch = OIDSET_INIT;
 	const struct object_id *real = oid;
 	int already_retried = 0;
 
@@ -1587,7 +1588,8 @@ static int do_oid_object_info_extended(struct repository *r,
 			 * TODO Investigate checking promisor_remote_get_direct()
 			 * TODO return value and stopping on error here.
 			 */
-			promisor_remote_get_direct(r, real, 1);
+			oidset_insert(&to_fetch, real);
+			promisor_remote_get_direct(r, &to_fetch);
 			already_retried = 1;
 			continue;
 		}
diff --git a/promisor-remote.c b/promisor-remote.c
index db2ebdc66ef..00166269f60 100644
--- a/promisor-remote.c
+++ b/promisor-remote.c
@@ -12,12 +12,13 @@ struct promisor_remote_config {
 
 static int fetch_objects(struct repository *repo,
 			 const char *remote_name,
-			 const struct object_id *oids,
-			 int oid_nr)
+			 struct oidset *oids)
 {
 	struct child_process child = CHILD_PROCESS_INIT;
-	int i;
 	FILE *child_in;
+	struct oidset_iter iter;
+	const struct object_id *oid;
+	oidset_iter_init(oids, &iter);
 
 	child.git_cmd = 1;
 	child.in = -1;
@@ -31,10 +32,10 @@ static int fetch_objects(struct repository *repo,
 		die(_("promisor-remote: unable to fork off fetch subprocess"));
 	child_in = xfdopen(child.in, "w");
 
-	trace2_data_intmax("promisor", repo, "fetch_count", oid_nr);
+	trace2_data_intmax("promisor", repo, "fetch_count", oidset_size(oids));
 
-	for (i = 0; i < oid_nr; i++) {
-		if (fputs(oid_to_hex(&oids[i]), child_in) < 0)
+	while((oid = oidset_iter_next(&iter))) {
+		if (fputs(oid_to_hex(oid), child_in) < 0)
 			die_errno(_("promisor-remote: could not write to fetch subprocess"));
 		if (fputc('\n', child_in) < 0)
 			die_errno(_("promisor-remote: could not write to fetch subprocess"));
@@ -198,70 +199,43 @@ int repo_has_promisor_remote(struct repository *r)
 	return !!repo_promisor_remote_find(r, NULL);
 }
 
-static int remove_fetched_oids(struct repository *repo,
-			       struct object_id **oids,
-			       int oid_nr, int to_free)
+static void remove_fetched_oids(struct repository *repo,
+			       struct oidset *oids)
 {
-	int i, remaining_nr = 0;
-	int *remaining = xcalloc(oid_nr, sizeof(*remaining));
-	struct object_id *old_oids = *oids;
-	struct object_id *new_oids;
-
-	for (i = 0; i < oid_nr; i++)
-		if (oid_object_info_extended(repo, &old_oids[i], NULL,
-					     OBJECT_INFO_SKIP_FETCH_OBJECT)) {
-			remaining[i] = 1;
-			remaining_nr++;
-		}
-
-	if (remaining_nr) {
-		int j = 0;
-		CALLOC_ARRAY(new_oids, remaining_nr);
-		for (i = 0; i < oid_nr; i++)
-			if (remaining[i])
-				oidcpy(&new_oids[j++], &old_oids[i]);
-		*oids = new_oids;
-		if (to_free)
-			free(old_oids);
-	}
+	struct oidset_iter iter;
+	const struct object_id *oid;
 
-	free(remaining);
+	oidset_iter_init(oids, &iter);
 
-	return remaining_nr;
+	while((oid = oidset_iter_next(&iter)))
+		if (oid_object_info_extended(repo, oid, NULL,
+					     OBJECT_INFO_SKIP_FETCH_OBJECT)){
+			oidset_remove(oids, oid);
+		}
 }
 
 int promisor_remote_get_direct(struct repository *repo,
-			       const struct object_id *oids,
-			       int oid_nr)
+				       struct oidset *oids)
 {
 	struct promisor_remote *r;
-	struct object_id *remaining_oids = (struct object_id *)oids;
-	int remaining_nr = oid_nr;
-	int to_free = 0;
 	int res = -1;
 
-	if (oid_nr == 0)
+	if (oidset_size(oids) == 0)
 		return 0;
 
 	promisor_remote_init(repo);
 
 	for (r = repo->promisor_remote_config->promisors; r; r = r->next) {
-		if (fetch_objects(repo, r->name, remaining_oids, remaining_nr) < 0) {
-			if (remaining_nr == 1)
+		if (fetch_objects(repo, r->name, oids) < 0) {
+			if (oidset_size(oids) == 1)
 				continue;
-			remaining_nr = remove_fetched_oids(repo, &remaining_oids,
-							 remaining_nr, to_free);
-			if (remaining_nr) {
-				to_free = 1;
+			remove_fetched_oids(repo, oids);
+			if (oidset_size(oids))
 				continue;
-			}
 		}
 		res = 0;
 		break;
 	}
 
-	if (to_free)
-		free(remaining_oids);
-
 	return res;
 }
diff --git a/promisor-remote.h b/promisor-remote.h
index edc45ab0f5f..c90837d3592 100644
--- a/promisor-remote.h
+++ b/promisor-remote.h
@@ -3,8 +3,7 @@
 
 #include "repository.h"
 
-struct object_id;
-
+struct oidset;
 /*
  * Promisor remote linked list
  *
@@ -45,7 +44,6 @@ static inline int has_promisor_remote(void)
  * If oid_nr is 0, this function returns 0 (success) immediately.
  */
 int promisor_remote_get_direct(struct repository *repo,
-			       const struct object_id *oids,
-			       int oid_nr);
+			       struct oidset *oids);
 
 #endif /* PROMISOR_REMOTE_H */
diff --git a/read-cache.c b/read-cache.c
index cbe73f14e5e..b78801b1f20 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -3732,7 +3732,7 @@ void prefetch_cache_entries(const struct index_state *istate,
 			    must_prefetch_predicate must_prefetch)
 {
 	int i;
-	struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct oidset to_fetch = OIDSET_INIT;
 
 	for (i = 0; i < istate->cache_nr; i++) {
 		struct cache_entry *ce = istate->cache[i];
@@ -3743,9 +3743,8 @@ void prefetch_cache_entries(const struct index_state *istate,
 					      NULL,
 					      OBJECT_INFO_FOR_PREFETCH))
 			continue;
-		oid_array_append(&to_fetch, &ce->oid);
+		oidset_insert(&to_fetch, &ce->oid);
 	}
-	promisor_remote_get_direct(the_repository,
-				   to_fetch.oid, to_fetch.nr);
-	oid_array_clear(&to_fetch);
+	promisor_remote_get_direct(the_repository, &to_fetch);
+	oidset_clear(&to_fetch);
 }

base-commit: 1ffcbaa1a5f10c9f706314d77f88de20a4a498c2
-- 
gitgitgadget

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] promisor-remote.c: use oidset for deduplication
  2022-01-13 20:32 [PATCH] promisor-remote.c: use oidset for deduplication John Cai via GitGitGadget
@ 2022-01-13 23:45 ` Junio C Hamano
  2022-01-14 12:11 ` Ævar Arnfjörð Bjarmason
  1 sibling, 0 replies; 7+ messages in thread
From: Junio C Hamano @ 2022-01-13 23:45 UTC (permalink / raw)
  To: John Cai via GitGitGadget; +Cc: git, John Cai

"John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:

> diff --git a/object-file.c b/object-file.c
> index 8be57f48de7..baec8f099de 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1519,6 +1519,7 @@ static int do_oid_object_info_extended(struct repository *r,
>  	struct cached_object *co;
>  	struct pack_entry e;
>  	int rtype;
> +	struct oidset to_fetch = OIDSET_INIT;
>  	const struct object_id *real = oid;
>  	int already_retried = 0;
>  
> @@ -1587,7 +1588,8 @@ static int do_oid_object_info_extended(struct repository *r,
>  			 * TODO Investigate checking promisor_remote_get_direct()
>  			 * TODO return value and stopping on error here.
>  			 */
> -			promisor_remote_get_direct(r, real, 1);
> +			oidset_insert(&to_fetch, real);
> +			promisor_remote_get_direct(r, &to_fetch);
>  			already_retried = 1;
>  			continue;
>  		}

Everything that migrated from oid_array to oidset should be OK
because it is likely that existing oid_array_clear() would not
compile until it is replaced with oidset_clear(), at which point
the oidset will be cleared when we are done with it.

However, this one did not use an oid_array and did not have to clear
to release the resources.  So it is very likely that nobody clears
to_fetch that is introduced by this patch, no?

I suspect that this introduces a new leak.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] promisor-remote.c: use oidset for deduplication
  2022-01-13 20:32 [PATCH] promisor-remote.c: use oidset for deduplication John Cai via GitGitGadget
  2022-01-13 23:45 ` Junio C Hamano
@ 2022-01-14 12:11 ` Ævar Arnfjörð Bjarmason
  2022-01-14 19:14   ` Junio C Hamano
  2022-01-24 22:55   ` John Cai
  1 sibling, 2 replies; 7+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-01-14 12:11 UTC (permalink / raw)
  To: John Cai via GitGitGadget; +Cc: git, John Cai


On Thu, Jan 13 2022, John Cai via GitGitGadget wrote:

> From: John Cai <johncai86@gmail.com>
>
> swap out oid_array for oidset in promisor-remote.c since we get
> a deduplicated set of oids to operate on.
>
> swap our calls for oid_array for oidset in callers of
> promisor_remote_get_direct().
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

FWIW we had an off-list discussion about oidset v.s. a custom hashmap to
do the same off-list, not about s/oid_array/oidset/ in this area.

> Signed-off-by: John Cai <johncai86@gmail.com>
> ---
>     promisor-remote.c: use oidset for deduplication
>     
>     

Extra \n between $subject and $body.

>     swap out oid_array for oidset in promisor-remote.c since we get a

Nit: s/swap/Swap/, i.e. start the sentence with a capital letter...

>     deduplicated set of oids to operate on. Any direct fetch we can save
>     will be well worth it.
>     
>     oidset does use a slightly larger memory footprint than oid_array, so
>     this change is a tradeoff between memory footprint and network calls.
>     
>     What I'm not sure about is if it's worth swapping out all the calls of
>     oid_array for oidset in callers of promisor_remote_get_direct().

From what I understand of GGG I think you updated only the summary on
the PR but not the commit itself, the latter is what would go into
git.git. Here the commit should be updated so we get this message.

The part after the "---" is usually just used in this project for ad-hoc
list-only comment.

>  builtin/index-pack.c   |  9 +++---
>  builtin/pack-objects.c |  9 +++---
>  diff.c                 | 13 +++-----
>  diffcore-rename.c      | 12 +++----
>  diffcore.h             |  3 +-
>  merge-ort.c            |  8 ++---
>  object-file.c          |  4 ++-
>  promisor-remote.c      | 72 ++++++++++++++----------------------------
>  promisor-remote.h      |  6 ++--
>  read-cache.c           |  9 +++---
>  10 files changed, 57 insertions(+), 88 deletions(-)

Rather than inline comments, a comment that applies to all of these:

The difference between these two APIs is thaht oidset is hash-backed,
and you'd insert into it and we de-duplicate any duplicate OIDs on-the-fly.

The oid_array is just an realloc()'d "struct object_id *oid". On
insertion you can insert duplicates, but it has the ability to track
"I've sorted these", and "let's iterate over this sorted, and de-dup any
duplicates".

We have the two APIs for a reason, but I don't know in any of these
cases whether this change is safe.

Does e.g. index-pack.c always receive de-duplicated OIDs and we were
wasting CPU cycles using an oidset?

Do some of these like pack-objects.c receive de-duplicated OIDs from
e.g. "git repack" *now*, but we just lack test coverage to see that
they're happy to get duplicate OIDs on stdin (e.g. manually from a
user), and this would introduce a bug?

But most importantly is it worth it? What's the rationale for the
change? Less CPU/memory use? Getting e.g. "hyperfine" or "/usr/bin/time
-v" output for those (if so) would be valuable.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] promisor-remote.c: use oidset for deduplication
  2022-01-14 12:11 ` Ævar Arnfjörð Bjarmason
@ 2022-01-14 19:14   ` Junio C Hamano
  2022-01-14 23:12     ` Junio C Hamano
  2022-01-24 22:55   ` John Cai
  1 sibling, 1 reply; 7+ messages in thread
From: Junio C Hamano @ 2022-01-14 19:14 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: John Cai via GitGitGadget, git, John Cai

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> From what I understand of GGG I think you updated only the summary on
> the PR but not the commit itself, the latter is what would go into
> git.git. Here the commit should be updated so we get this message.

Ah, I have long learned to ignore the blurb after the three-dash
line when a patch comes via GGG (because the PR text is often
useless and most often people seem to use text identical to the
commit log message).  I didn't realize that the proposed commit log
message was useless in this particular patch.  I was wondering why
you were talking about an extra blank line after the title there ;-)

> The part after the "---" is usually just used in this project for ad-hoc
> list-only comment.

True.  "What I'm not sure about is ..." however does belong to the
list-only comment, though.  If it is not 

> The difference between these two APIs is thaht oidset is hash-backed,
> and you'd insert into it and we de-duplicate any duplicate OIDs on-the-fly.
> 
> The oid_array is just an realloc()'d "struct object_id *oid". On
> insertion you can insert duplicates, but it has the ability to track
> "I've sorted these", and "let's iterate over this sorted, and de-dup any
> duplicates".
>
> We have the two APIs for a reason, but I don't know in any of these
> cases whether this change is safe.
>
> Does e.g. index-pack.c always receive de-duplicated OIDs and we were
> wasting CPU cycles using an oidset?
>
> Do some of these like pack-objects.c receive de-duplicated OIDs from
> e.g. "git repack" *now*, but we just lack test coverage to see that
> they're happy to get duplicate OIDs on stdin (e.g. manually from a
> user), and this would introduce a bug?

Also, if oid_array is used to produce a de-duplicated list of object
names in the current code, it is very likely that oid_array is
sorted (perhaps the objects are fed in sorted order), and the
callers depend on the order of the objects they find in the array.
Throwing sorted list of object names at oidset and then iterating
over what is in the oidset would likely to destroy the original
ordering.  I do not offhand know if the callers are broken by such a
change (either correctness-wise or performance-wise).

> But most importantly is it worth it? What's the rationale for the
> change? Less CPU/memory use? Getting e.g. "hyperfine" or "/usr/bin/time
> -v" output for those (if so) would be valuable.

Thanks.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] promisor-remote.c: use oidset for deduplication
  2022-01-14 19:14   ` Junio C Hamano
@ 2022-01-14 23:12     ` Junio C Hamano
  0 siblings, 0 replies; 7+ messages in thread
From: Junio C Hamano @ 2022-01-14 23:12 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: John Cai via GitGitGadget, git, John Cai

Junio C Hamano <gitster@pobox.com> writes:

> Also, if oid_array is used to produce a de-duplicated list of object
> names in the current code, it is very likely that oid_array is
> sorted (perhaps the objects are fed in sorted order), and the
> callers depend on the order of the objects they find in the array.
> Throwing sorted list of object names at oidset and then iterating
> over what is in the oidset would likely to destroy the original
> ordering.  I do not offhand know if the callers are broken by such a
> change (either correctness-wise or performance-wise).

Since I had a bit of downtime waiting for CI, I took a look.

It seems that the list of objects collected in the oidset/oid_array
is fed to "git fetch --stdin" for lazy fetching, so I would be
surprised if the order of objects matter.  So I would stop worrying
about correctness or performance due to ordering change.

>> But most importantly is it worth it? What's the rationale for the
>> change? Less CPU/memory use? Getting e.g. "hyperfine" or "/usr/bin/time
>> -v" output for those (if so) would be valuable.

But I still agree with this.  How much duplication would a typical
request for a lazy fetch involve?  Would it cause duplicate "want"
to make the protocol exchange noticeably larger?

Having said all that, the change makes the code smaller, and
possibly easier to follow.

The primary reason why promisor-remote.c::remove_fetched_oids()
shrinks so much is because we used to scan the array and copied
the surviving ones into a new array, while the new code just
iterates over the oidset and removes the ones that were fetched.
I am assuming that mucking with the oidset's contents while you
are iterating over it is safe, but if that is not the case, then
the advantage of the smaller code disappears.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] promisor-remote.c: use oidset for deduplication
  2022-01-14 12:11 ` Ævar Arnfjörð Bjarmason
  2022-01-14 19:14   ` Junio C Hamano
@ 2022-01-24 22:55   ` John Cai
  2022-01-25 19:17     ` Jonathan Tan
  1 sibling, 1 reply; 7+ messages in thread
From: John Cai @ 2022-01-24 22:55 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: John Cai via GitGitGadget, git, jonathantanmy, Junio C Hamano



On 14 Jan 2022, at 7:11, Ævar Arnfjörð Bjarmason wrote:

> On Thu, Jan 13 2022, John Cai via GitGitGadget wrote:
>
>> From: John Cai <johncai86@gmail.com>
>>
>> swap out oid_array for oidset in promisor-remote.c since we get
>> a deduplicated set of oids to operate on.
>>
>> swap our calls for oid_array for oidset in callers of
>> promisor_remote_get_direct().
>>
>> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>
> FWIW we had an off-list discussion about oidset v.s. a custom hashmap to
> do the same off-list, not about s/oid_array/oidset/ in this area.
>
>> Signed-off-by: John Cai <johncai86@gmail.com>
>> ---
>>     promisor-remote.c: use oidset for deduplication
>>
>>
>
> Extra \n between $subject and $body.
>
>>     swap out oid_array for oidset in promisor-remote.c since we get a
>
> Nit: s/swap/Swap/, i.e. start the sentence with a capital letter...
>
>>     deduplicated set of oids to operate on. Any direct fetch we can save
>>     will be well worth it.
>>
>>     oidset does use a slightly larger memory footprint than oid_array, so
>>     this change is a tradeoff between memory footprint and network calls.
>>
>>     What I'm not sure about is if it's worth swapping out all the calls of
>>     oid_array for oidset in callers of promisor_remote_get_direct().
>
> From what I understand of GGG I think you updated only the summary on
> the PR but not the commit itself, the latter is what would go into
> git.git. Here the commit should be updated so we get this message.
>
> The part after the "---" is usually just used in this project for ad-hoc
> list-only comment.
>
>>  builtin/index-pack.c   |  9 +++---
>>  builtin/pack-objects.c |  9 +++---
>>  diff.c                 | 13 +++-----
>>  diffcore-rename.c      | 12 +++----
>>  diffcore.h             |  3 +-
>>  merge-ort.c            |  8 ++---
>>  object-file.c          |  4 ++-
>>  promisor-remote.c      | 72 ++++++++++++++----------------------------
>>  promisor-remote.h      |  6 ++--
>>  read-cache.c           |  9 +++---
>>  10 files changed, 57 insertions(+), 88 deletions(-)
>
> Rather than inline comments, a comment that applies to all of these:
>
> The difference between these two APIs is thaht oidset is hash-backed,
> and you'd insert into it and we de-duplicate any duplicate OIDs on-the-fly.
>
> The oid_array is just an realloc()'d "struct object_id *oid". On
> insertion you can insert duplicates, but it has the ability to track
> "I've sorted these", and "let's iterate over this sorted, and de-dup any
> duplicates".
>
> We have the two APIs for a reason, but I don't know in any of these
> cases whether this change is safe.

To reduce the footprint of this change, maybe we can just keep oid_array and sort it
in diff.c before passing it off to promisor_remote_get_direct(), in which we then iterate
over unique entries.
>
> Does e.g. index-pack.c always receive de-duplicated OIDs and we were
> wasting CPU cycles using an oidset?

Good question. In fact a more immediate question is how often does diff.c receive dupe oids,
which was the original motivation for the change. cc’ing Jonathan as he was involved with
95acf11a that added the NEEDSWORK block.

Jonathan, how often do we expect diff to pass duplicate oids to promisor_remote_get_direct()?
>
> Do some of these like pack-objects.c receive de-duplicated OIDs from
> e.g. "git repack" *now*, but we just lack test coverage to see that
> they're happy to get duplicate OIDs on stdin (e.g. manually from a
> user), and this would introduce a bug?
>
> But most importantly is it worth it? What's the rationale for the
> change? Less CPU/memory use? Getting e.g. "hyperfine" or "/usr/bin/time
> -v" output for those (if so) would be valuable.

I agree this would be very valuable, but I’m thinking more so once I get a better idea of
what kind of situations would lead to duplicate oids in diff.c

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] promisor-remote.c: use oidset for deduplication
  2022-01-24 22:55   ` John Cai
@ 2022-01-25 19:17     ` Jonathan Tan
  0 siblings, 0 replies; 7+ messages in thread
From: Jonathan Tan @ 2022-01-25 19:17 UTC (permalink / raw)
  To: johncai86; +Cc: avarab, gitgitgadget, git, jonathantanmy, gitster

John Cai <johncai86@gmail.com> writes:
> Jonathan, how often do we expect diff to pass duplicate oids to
> promisor_remote_get_direct()?

I don't recall all the details, but not often, I believe. In a partial
clone, I envision that you would typically diff one tree that you have
all blobs for and one tree that you don't (or two trees that you have
all blobs for). If a user is in a sparse checkout and/or they're diffing
two trees that they have never checked out, I would recommend disabling
inexact copy detection to reduce the number of blobs that Git would need
to fetch. In these cases, I think that diff would only pass duplicate
OIDs if one of the trees references the same object more than once
(directly or indirectly).

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-01-25 19:17 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-13 20:32 [PATCH] promisor-remote.c: use oidset for deduplication John Cai via GitGitGadget
2022-01-13 23:45 ` Junio C Hamano
2022-01-14 12:11 ` Ævar Arnfjörð Bjarmason
2022-01-14 19:14   ` Junio C Hamano
2022-01-14 23:12     ` Junio C Hamano
2022-01-24 22:55   ` John Cai
2022-01-25 19:17     ` Jonathan Tan

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).