git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH] diff: restrict when prefetching occurs
@ 2020-03-31  2:04 Jonathan Tan
  2020-03-31 12:14 ` Derrick Stolee
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Jonathan Tan @ 2020-03-31  2:04 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, peff

Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In particular, if
(1) no blob data is included in the output of the diff, (2)
break-rewrite detection is not requested, and (3) no inexact rename
detection is needed, then no blobs are read at all.

Therefore, in such a case, do not prefetch. Change diffcore_std() to
only prefetch if (1) and/or (2) is not true (currently, it always
prefetches); change diffcore_rename() to prefetch if (3) is not true and
no prefetch has yet occurred.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
I decided to revisit [1] because at $DAYJOB there was a new case that
required this. I used Peff's code from [2] for the rebase part, and also
automatically prefetch if blob data will be included in the output of
the diff and/or break-rewrite detection is requested.

[1] https://lore.kernel.org/git/20200128213508.31661-1-jonathantanmy@google.com/
[2] https://lore.kernel.org/git/20200130055136.GA2184413@coredump.intra.peff.net/
---
 diff.c                        | 26 +++++++++++++++----
 diffcore-rename.c             | 40 +++++++++++++++++++++++++++-
 diffcore.h                    |  2 +-
 t/t4067-diff-partial-clone.sh | 49 +++++++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 7 deletions(-)

diff --git a/diff.c b/diff.c
index 1010d806f5..19c5d638d6 100644
--- a/diff.c
+++ b/diff.c
@@ -6507,10 +6507,24 @@ static void add_if_missing(struct repository *r,
 
 void diffcore_std(struct diff_options *options)
 {
-	if (options->repo == the_repository && has_promisor_remote()) {
-		/*
-		 * Prefetch the diff pairs that are about to be flushed.
-		 */
+	int prefetched = 0;
+	int output_formats_to_prefetch = DIFF_FORMAT_DIFFSTAT |
+		DIFF_FORMAT_NUMSTAT |
+		DIFF_FORMAT_PATCH |
+		DIFF_FORMAT_SHORTSTAT |
+		DIFF_FORMAT_DIRSTAT;
+
+	/*
+	 * Check if the user requested a blob-data-requiring diff output and/or
+	 * break-rewrite detection (which requires blob data). If yes, prefetch
+	 * the diff pairs.
+	 *
+	 * If no prefetching occurs, diffcore_rename() will prefetch if it
+	 * decides that it needs inexact rename detection.
+	 */
+	if (options->repo == the_repository && has_promisor_remote() &&
+	    (options->output_format & output_formats_to_prefetch ||
+	     (!options->found_follow && options->break_opt != -1))) {
 		int i;
 		struct diff_queue_struct *q = &diff_queued_diff;
 		struct oid_array to_fetch = OID_ARRAY_INIT;
@@ -6520,6 +6534,8 @@ void diffcore_std(struct diff_options *options)
 			add_if_missing(options->repo, &to_fetch, p->one);
 			add_if_missing(options->repo, &to_fetch, p->two);
 		}
+		prefetched = 1;
+
 		if (to_fetch.nr)
 			/*
 			 * NEEDSWORK: Consider deduplicating the OIDs sent.
@@ -6538,7 +6554,7 @@ void diffcore_std(struct diff_options *options)
 			diffcore_break(options->repo,
 				       options->break_opt);
 		if (options->detect_rename)
-			diffcore_rename(options);
+			diffcore_rename(options, prefetched);
 		if (options->break_opt != -1)
 			diffcore_merge_broken();
 	}
diff --git a/diffcore-rename.c b/diffcore-rename.c
index e189f407af..962565f066 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -7,6 +7,7 @@
 #include "object-store.h"
 #include "hashmap.h"
 #include "progress.h"
+#include "promisor-remote.h"
 
 /* Table of rename/copy destinations */
 
@@ -448,7 +449,18 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
 	return count;
 }
 
-void diffcore_rename(struct diff_options *options)
+static void add_if_missing(struct repository *r,
+			   struct oid_array *to_fetch,
+			   const struct diff_filespec *filespec)
+{
+	if (filespec && filespec->oid_valid &&
+	    !S_ISGITLINK(filespec->mode) &&
+	    oid_object_info_extended(r, &filespec->oid, NULL,
+				     OBJECT_INFO_FOR_PREFETCH))
+		oid_array_append(to_fetch, &filespec->oid);
+}
+
+void diffcore_rename(struct diff_options *options, int prefetched)
 {
 	int detect_rename = options->detect_rename;
 	int minimum_score = options->rename_score;
@@ -538,6 +550,32 @@ void diffcore_rename(struct diff_options *options)
 		break;
 	}
 
+	if (!prefetched) {
+		/*
+		 * At this point we know there's actual work to do: we have rename
+		 * destinations that didn't find an exact match, and we have potential
+		 * sources. So we'll have to do inexact rename detection, which
+		 * requires looking at the blobs.
+		 *
+		 * If we haven't already prefetched, it's worth pre-fetching
+		 * them as a group now.
+		 */
+		int i;
+		struct oid_array to_fetch = OID_ARRAY_INIT;
+
+		for (i = 0; i < rename_dst_nr; i++) {
+			if (rename_dst[i].pair)
+				continue; /* already found exact match */
+			add_if_missing(options->repo, &to_fetch, rename_dst[i].two);
+		}
+		for (i = 0; i < rename_src_nr; i++)
+			add_if_missing(options->repo, &to_fetch, rename_src[i].p->one);
+		if (to_fetch.nr)
+			promisor_remote_get_direct(options->repo,
+						   to_fetch.oid, to_fetch.nr);
+		oid_array_clear(&to_fetch);
+	}
+
 	if (options->show_rename_progress) {
 		progress = start_delayed_progress(
 				_("Performing inexact rename detection"),
diff --git a/diffcore.h b/diffcore.h
index 7c07347e42..9f69506574 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -144,7 +144,7 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *,
 void diff_q(struct diff_queue_struct *, struct diff_filepair *);
 
 void diffcore_break(struct repository *, int);
-void diffcore_rename(struct diff_options *);
+void diffcore_rename(struct diff_options *, int prefetched);
 void diffcore_merge_broken(void);
 void diffcore_pickaxe(struct diff_options *);
 void diffcore_order(const char *orderfile);
diff --git a/t/t4067-diff-partial-clone.sh b/t/t4067-diff-partial-clone.sh
index 4831ad35e6..7acb64727d 100755
--- a/t/t4067-diff-partial-clone.sh
+++ b/t/t4067-diff-partial-clone.sh
@@ -131,4 +131,53 @@ test_expect_success 'diff with rename detection batches blobs' '
 	test_line_count = 1 done_lines
 '
 
+test_expect_success 'diff does not fetch anything if inexact rename detection is not needed' '
+	test_when_finished "rm -rf server client trace" &&
+
+	test_create_repo server &&
+	echo a >server/a &&
+	printf "b\nb\nb\nb\nb\n" >server/b &&
+	git -C server add a b &&
+	git -C server commit -m x &&
+	rm server/b &&
+	printf "b\nb\nb\nb\nb\n" >server/c &&
+	git -C server add c &&
+	git -C server commit -a -m x &&
+
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+	# Ensure no fetches.
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
+	! test_path_exists trace
+'
+
+test_expect_success 'diff --break-rewrites fetches only if necessary, and batches blobs if it does' '
+	test_when_finished "rm -rf server client trace" &&
+
+	test_create_repo server &&
+	echo a >server/a &&
+	printf "b\nb\nb\nb\nb\n" >server/b &&
+	git -C server add a b &&
+	git -C server commit -m x &&
+	printf "c\nc\nc\nc\nc\n" >server/b &&
+	git -C server commit -a -m x &&
+
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+	# Ensure no fetches.
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
+	! test_path_exists trace &&
+
+	# But with --break-rewrites, ensure that there is exactly 1 negotiation
+	# by checking that there is only 1 "done" line sent. ("done" marks the
+	# end of negotiation.)
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --break-rewrites --raw -M HEAD^ HEAD &&
+	grep "git> done" trace >done_lines &&
+	test_line_count = 1 done_lines
+'
+
 test_done
-- 
2.26.0.rc2.310.g2932bb562d-goog


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] diff: restrict when prefetching occurs
  2020-03-31  2:04 [PATCH] diff: restrict when prefetching occurs Jonathan Tan
@ 2020-03-31 12:14 ` Derrick Stolee
  2020-03-31 16:50   ` Jonathan Tan
  2020-03-31 18:15 ` Junio C Hamano
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 26+ messages in thread
From: Derrick Stolee @ 2020-03-31 12:14 UTC (permalink / raw)
  To: Jonathan Tan, git; +Cc: peff

On 3/30/2020 10:04 PM, Jonathan Tan wrote:
> Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
> optimized "diff" by prefetching blobs in a partial clone, but there are
> some cases wherein blobs do not need to be prefetched. In particular, if
> (1) no blob data is included in the output of the diff, (2)
> break-rewrite detection is not requested, and (3) no inexact rename
> detection is needed, then no blobs are read at all.
> 
> Therefore, in such a case, do not prefetch. Change diffcore_std() to
> only prefetch if (1) and/or (2) is not true (currently, it always
> prefetches); change diffcore_rename() to prefetch if (3) is not true and
> no prefetch has yet occurred.

This conflicts with [3], so please keep that in mind.

Maybe [3] should be adjusted to assume this patch, because that change
is mostly about disabling the batch download when no renames are required.
As Peff said [2] the full rename detection trigger is "overly broad".

However, the changed-path Bloom filters are an excellent test for this
patch, as computing them in a partial clone will trigger downloading all
blobs without [3].

[3] https://lore.kernel.org/git/55824cda89c1dca7756c8c2d831d6e115f4a9ddb.1585528298.git.gitgitgadget@gmail.com/T/#u

> [1] https://lore.kernel.org/git/20200128213508.31661-1-jonathantanmy@google.com/
> [2] https://lore.kernel.org/git/20200130055136.GA2184413@coredump.intra.peff.net/
> ---
>  diff.c                        | 26 +++++++++++++++----
>  diffcore-rename.c             | 40 +++++++++++++++++++++++++++-
>  diffcore.h                    |  2 +-
>  t/t4067-diff-partial-clone.sh | 49 +++++++++++++++++++++++++++++++++++
>  4 files changed, 110 insertions(+), 7 deletions(-)
> 
> diff --git a/diff.c b/diff.c
> index 1010d806f5..19c5d638d6 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -6507,10 +6507,24 @@ static void add_if_missing(struct repository *r,
>  
>  void diffcore_std(struct diff_options *options)
>  {
> -	if (options->repo == the_repository && has_promisor_remote()) {
> -		/*
> -		 * Prefetch the diff pairs that are about to be flushed.
> -		 */
> +	int prefetched = 0;
> +	int output_formats_to_prefetch = DIFF_FORMAT_DIFFSTAT |
> +		DIFF_FORMAT_NUMSTAT |
> +		DIFF_FORMAT_PATCH |
> +		DIFF_FORMAT_SHORTSTAT |
> +		DIFF_FORMAT_DIRSTAT;
> +
> +	/*
> +	 * Check if the user requested a blob-data-requiring diff output and/or
> +	 * break-rewrite detection (which requires blob data). If yes, prefetch
> +	 * the diff pairs.
> +	 *
> +	 * If no prefetching occurs, diffcore_rename() will prefetch if it
> +	 * decides that it needs inexact rename detection.
> +	 */
> +	if (options->repo == the_repository && has_promisor_remote() &&
> +	    (options->output_format & output_formats_to_prefetch ||
> +	     (!options->found_follow && options->break_opt != -1))) {
>  		int i;
>  		struct diff_queue_struct *q = &diff_queued_diff;
>  		struct oid_array to_fetch = OID_ARRAY_INIT;
> @@ -6520,6 +6534,8 @@ void diffcore_std(struct diff_options *options)
>  			add_if_missing(options->repo, &to_fetch, p->one);
>  			add_if_missing(options->repo, &to_fetch, p->two);
>  		}
> +		prefetched = 1;
> +
>  		if (to_fetch.nr)

It was difficult to see from the context, but the next line is
"do the prefetch", so "prefetched = 1" makes sense here, even
if to_fetch.nr is zero. We've already done the work to see that
a batch download is not needed.

>  			/*
>  			 * NEEDSWORK: Consider deduplicating the OIDs sent.
> @@ -6538,7 +6554,7 @@ void diffcore_std(struct diff_options *options)
>  			diffcore_break(options->repo,
>  				       options->break_opt);
>  		if (options->detect_rename)
> -			diffcore_rename(options);
> +			diffcore_rename(options, prefetched);
>  		if (options->break_opt != -1)
>  			diffcore_merge_broken();
>  	}
> diff --git a/diffcore-rename.c b/diffcore-rename.c
> index e189f407af..962565f066 100644
> --- a/diffcore-rename.c
> +++ b/diffcore-rename.c
> @@ -7,6 +7,7 @@
>  #include "object-store.h"
>  #include "hashmap.h"
>  #include "progress.h"
> +#include "promisor-remote.h"
>  
>  /* Table of rename/copy destinations */
>  
> @@ -448,7 +449,18 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
>  	return count;
>  }
>  
> -void diffcore_rename(struct diff_options *options)
> +static void add_if_missing(struct repository *r,
> +			   struct oid_array *to_fetch,
> +			   const struct diff_filespec *filespec)
> +{
> +	if (filespec && filespec->oid_valid &&
> +	    !S_ISGITLINK(filespec->mode) &&
> +	    oid_object_info_extended(r, &filespec->oid, NULL,
> +				     OBJECT_INFO_FOR_PREFETCH))
> +		oid_array_append(to_fetch, &filespec->oid);
> +}
> +
> +void diffcore_rename(struct diff_options *options, int prefetched)
>  {
>  	int detect_rename = options->detect_rename;
>  	int minimum_score = options->rename_score;
> @@ -538,6 +550,32 @@ void diffcore_rename(struct diff_options *options)
>  		break;
>  	}
>  
> +	if (!prefetched) {
> +		/*
> +		 * At this point we know there's actual work to do: we have rename
> +		 * destinations that didn't find an exact match, and we have potential
> +		 * sources. So we'll have to do inexact rename detection, which
> +		 * requires looking at the blobs.
> +		 *
> +		 * If we haven't already prefetched, it's worth pre-fetching
> +		 * them as a group now.
> +		 */
> +		int i;
> +		struct oid_array to_fetch = OID_ARRAY_INIT;
> +
> +		for (i = 0; i < rename_dst_nr; i++) {
> +			if (rename_dst[i].pair)
> +				continue; /* already found exact match */
> +			add_if_missing(options->repo, &to_fetch, rename_dst[i].two);

Could this be reversed instead to avoid the "continue"?

	if (!rename_dst[i].pair)
		add_if_missing(options->repo, &to_fetch, rename_dst[i].two);

> +		}
> +		for (i = 0; i < rename_src_nr; i++)
> +			add_if_missing(options->repo, &to_fetch, rename_src[i].p->one);

Does this not have the equivalent "rename_src[i].pair" logic for exact
matches?

> +		if (to_fetch.nr)
> +			promisor_remote_get_direct(options->repo,
> +						   to_fetch.oid, to_fetch.nr);

Perhaps promisor_remote_get_direct() could have the check for
nr == 0 to exit early instead of putting that upon all the
callers?

> +		oid_array_clear(&to_fetch);
> +	}
> +
>  	if (options->show_rename_progress) {
>  		progress = start_delayed_progress(
>  				_("Performing inexact rename detection"),
> diff --git a/diffcore.h b/diffcore.h
> index 7c07347e42..9f69506574 100644
> --- a/diffcore.h
> +++ b/diffcore.h
> @@ -144,7 +144,7 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *,
>  void diff_q(struct diff_queue_struct *, struct diff_filepair *);
>  
>  void diffcore_break(struct repository *, int);
> -void diffcore_rename(struct diff_options *);
> +void diffcore_rename(struct diff_options *, int prefetched);
>  void diffcore_merge_broken(void);
>  void diffcore_pickaxe(struct diff_options *);
>  void diffcore_order(const char *orderfile);
> diff --git a/t/t4067-diff-partial-clone.sh b/t/t4067-diff-partial-clone.sh
> index 4831ad35e6..7acb64727d 100755
> --- a/t/t4067-diff-partial-clone.sh
> +++ b/t/t4067-diff-partial-clone.sh
> @@ -131,4 +131,53 @@ test_expect_success 'diff with rename detection batches blobs' '
>  	test_line_count = 1 done_lines
>  '
>  
> +test_expect_success 'diff does not fetch anything if inexact rename detection is not needed' '
> +	test_when_finished "rm -rf server client trace" &&
> +
> +	test_create_repo server &&
> +	echo a >server/a &&
> +	printf "b\nb\nb\nb\nb\n" >server/b &&
> +	git -C server add a b &&
> +	git -C server commit -m x &&

> +	rm server/b &&
> +	printf "b\nb\nb\nb\nb\n" >server/c &&

Would "mv server/b server/c" make it more clear that
this is an exact rename?

> +	git -C server add c &&
> +	git -C server commit -a -m x &&
> +
> +	test_config -C server uploadpack.allowfilter 1 &&
> +	test_config -C server uploadpack.allowanysha1inwant 1 &&
> +	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
> +
> +	# Ensure no fetches.
> +	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
> +	! test_path_exists trace
> +'
> +
> +test_expect_success 'diff --break-rewrites fetches only if necessary, and batches blobs if it does' '
> +	test_when_finished "rm -rf server client trace" &&
> +
> +	test_create_repo server &&
> +	echo a >server/a &&
> +	printf "b\nb\nb\nb\nb\n" >server/b &&
> +	git -C server add a b &&
> +	git -C server commit -m x &&
> +	printf "c\nc\nc\nc\nc\n" >server/b &&
> +	git -C server commit -a -m x &&
> +
> +	test_config -C server uploadpack.allowfilter 1 &&
> +	test_config -C server uploadpack.allowanysha1inwant 1 &&
> +	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
> +
> +	# Ensure no fetches.
> +	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
> +	! test_path_exists trace &&
> +
> +	# But with --break-rewrites, ensure that there is exactly 1 negotiation
> +	# by checking that there is only 1 "done" line sent. ("done" marks the
> +	# end of negotiation.)
> +	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --break-rewrites --raw -M HEAD^ HEAD &&
> +	grep "git> done" trace >done_lines &&
> +	test_line_count = 1 done_lines
> +'
> +
>  test_done
> 

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] diff: restrict when prefetching occurs
  2020-03-31 12:14 ` Derrick Stolee
@ 2020-03-31 16:50   ` Jonathan Tan
  2020-03-31 17:48     ` Derrick Stolee
  0 siblings, 1 reply; 26+ messages in thread
From: Jonathan Tan @ 2020-03-31 16:50 UTC (permalink / raw)
  To: stolee; +Cc: jonathantanmy, git, peff

> This conflicts with [3], so please keep that in mind.
> 
> Maybe [3] should be adjusted to assume this patch, because that change
> is mostly about disabling the batch download when no renames are required.
> As Peff said [2] the full rename detection trigger is "overly broad".
> 
> However, the changed-path Bloom filters are an excellent test for this
> patch, as computing them in a partial clone will trigger downloading all
> blobs without [3].
> 
> [3] https://lore.kernel.org/git/55824cda89c1dca7756c8c2d831d6e115f4a9ddb.1585528298.git.gitgitgadget@gmail.com/T/#u
> 
> > [1] https://lore.kernel.org/git/20200128213508.31661-1-jonathantanmy@google.com/
> > [2] https://lore.kernel.org/git/20200130055136.GA2184413@coredump.intra.peff.net/

Thanks for the pointer. Yes, I think that [3] should be adjusted to
assume this patch.

> > +		for (i = 0; i < rename_dst_nr; i++) {
> > +			if (rename_dst[i].pair)
> > +				continue; /* already found exact match */
> > +			add_if_missing(options->repo, &to_fetch, rename_dst[i].two);
> 
> Could this be reversed instead to avoid the "continue"?

Hmm...I prefer the "return early" approach, but can change it if others
prefer to avoid the "continue" here.

> 	if (!rename_dst[i].pair)
> 		add_if_missing(options->repo, &to_fetch, rename_dst[i].two);
> 
> > +		}
> > +		for (i = 0; i < rename_src_nr; i++)
> > +			add_if_missing(options->repo, &to_fetch, rename_src[i].p->one);
> 
> Does this not have the equivalent "rename_src[i].pair" logic for exact
> matches?

Thanks for the catch. There's no "pair" in rename_src[i], but the
equivalent is "if (skip_unmodified &&
diff_unmodified_pair(rename_src[i].p))", which you can see in the "for"
loop later in the function. I've added this.

> > +		if (to_fetch.nr)
> > +			promisor_remote_get_direct(options->repo,
> > +						   to_fetch.oid, to_fetch.nr);
> 
> Perhaps promisor_remote_get_direct() could have the check for
> nr == 0 to exit early instead of putting that upon all the
> callers?

The 2nd param is a pointer to an array, and I think it would be strange
to pass a pointer to a 0-size region of memory anywhere, so I'll leave
it as it is.

> > +test_expect_success 'diff does not fetch anything if inexact rename detection is not needed' '
> > +	test_when_finished "rm -rf server client trace" &&
> > +
> > +	test_create_repo server &&
> > +	echo a >server/a &&
> > +	printf "b\nb\nb\nb\nb\n" >server/b &&
> > +	git -C server add a b &&
> > +	git -C server commit -m x &&
> 
> > +	rm server/b &&
> > +	printf "b\nb\nb\nb\nb\n" >server/c &&
> 
> Would "mv server/b server/c" make it more clear that
> this is an exact rename?

True. Will do.

Thanks for the review.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] diff: restrict when prefetching occurs
  2020-03-31 16:50   ` Jonathan Tan
@ 2020-03-31 17:48     ` Derrick Stolee
  2020-03-31 18:21       ` Junio C Hamano
  0 siblings, 1 reply; 26+ messages in thread
From: Derrick Stolee @ 2020-03-31 17:48 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, peff

On 3/31/2020 12:50 PM, Jonathan Tan wrote:
>> This conflicts with [3], so please keep that in mind.
>>
>> Maybe [3] should be adjusted to assume this patch, because that change
>> is mostly about disabling the batch download when no renames are required.
>> As Peff said [2] the full rename detection trigger is "overly broad".
>>
>> However, the changed-path Bloom filters are an excellent test for this
>> patch, as computing them in a partial clone will trigger downloading all
>> blobs without [3].
>>
>> [3] https://lore.kernel.org/git/55824cda89c1dca7756c8c2d831d6e115f4a9ddb.1585528298.git.gitgitgadget@gmail.com/T/#u
>>
>>> [1] https://lore.kernel.org/git/20200128213508.31661-1-jonathantanmy@google.com/
>>> [2] https://lore.kernel.org/git/20200130055136.GA2184413@coredump.intra.peff.net/
> 
> Thanks for the pointer. Yes, I think that [3] should be adjusted to
> assume this patch.
> 
>>> +		for (i = 0; i < rename_dst_nr; i++) {
>>> +			if (rename_dst[i].pair)
>>> +				continue; /* already found exact match */
>>> +			add_if_missing(options->repo, &to_fetch, rename_dst[i].two);
>>
>> Could this be reversed instead to avoid the "continue"?
> 
> Hmm...I prefer the "return early" approach, but can change it if others
> prefer to avoid the "continue" here.

The "return early" approach is great and makes sense unless there is
only one line of code happening in the other case. Not sure if there
is any potential that the non-continue case grows in size or not.

Doesn't hurt that much to have the "return early" approach, as you
wrote it.

>> 	if (!rename_dst[i].pair)
>> 		add_if_missing(options->repo, &to_fetch, rename_dst[i].two);
>>
>>> +		}
>>> +		for (i = 0; i < rename_src_nr; i++)
>>> +			add_if_missing(options->repo, &to_fetch, rename_src[i].p->one);
>>
>> Does this not have the equivalent "rename_src[i].pair" logic for exact
>> matches?
> 
> Thanks for the catch. There's no "pair" in rename_src[i], but the
> equivalent is "if (skip_unmodified &&
> diff_unmodified_pair(rename_src[i].p))", which you can see in the "for"
> loop later in the function. I've added this.
> 
>>> +		if (to_fetch.nr)
>>> +			promisor_remote_get_direct(options->repo,
>>> +						   to_fetch.oid, to_fetch.nr);
>>
>> Perhaps promisor_remote_get_direct() could have the check for
>> nr == 0 to exit early instead of putting that upon all the
>> callers?
> 
> The 2nd param is a pointer to an array, and I think it would be strange
> to pass a pointer to a 0-size region of memory anywhere, so I'll leave
> it as it is.

Well, I would assume that to_fetch.oid is either NULL or is alloc'd
larger than to_fetch.nr when there are no added objects.

This is now the fourth location where we if (to_fetch.nr) promisor_remote_get_direct()
so we have already violated the rule of three.

My preference would be to insert a patch before this that halts the
promisor_remote_get_direct() call on an nr of 0 and deletes the "if (nr)"
conditions from the three existing callers. Then this patch could use
the logic without ever adding the "if (nr)".

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] diff: restrict when prefetching occurs
  2020-03-31  2:04 [PATCH] diff: restrict when prefetching occurs Jonathan Tan
  2020-03-31 12:14 ` Derrick Stolee
@ 2020-03-31 18:15 ` Junio C Hamano
  2020-04-02 19:19 ` [PATCH v2 0/2] Restrict when prefetcing occurs Jonathan Tan
  2020-04-07 22:11 ` [PATCH v3 0/4] " Jonathan Tan
  3 siblings, 0 replies; 26+ messages in thread
From: Junio C Hamano @ 2020-03-31 18:15 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, peff, Derrick Stolee

Jonathan Tan <jonathantanmy@google.com> writes:

> diff --git a/diffcore-rename.c b/diffcore-rename.c
> index e189f407af..962565f066 100644
> --- a/diffcore-rename.c
> +++ b/diffcore-rename.c
> ...
> @@ -448,7 +449,18 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
>  	return count;
>  }
>  
> +static void add_if_missing(struct repository *r,
> +			   struct oid_array *to_fetch,
> +			   const struct diff_filespec *filespec)
> +{
> +	if (filespec && filespec->oid_valid &&
> +	    !S_ISGITLINK(filespec->mode) &&
> +	    oid_object_info_extended(r, &filespec->oid, NULL,
> +				     OBJECT_INFO_FOR_PREFETCH))
> +		oid_array_append(to_fetch, &filespec->oid);
> +}

Do not copy&paste the exact code from elsewhere.  It is a sure way
to guarantee that they will drift apart over time.  Rename the one
in diff.c to something a bit more appropriate to be a global name
(e.g. diff_prepare_prefetch() or somesuch), make it extern in
<diffcore.h> and use it here.

Thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] diff: restrict when prefetching occurs
  2020-03-31 17:48     ` Derrick Stolee
@ 2020-03-31 18:21       ` Junio C Hamano
  0 siblings, 0 replies; 26+ messages in thread
From: Junio C Hamano @ 2020-03-31 18:21 UTC (permalink / raw)
  To: Derrick Stolee; +Cc: Jonathan Tan, git, peff

Derrick Stolee <stolee@gmail.com> writes:

>>>> +		for (i = 0; i < rename_dst_nr; i++) {
>>>> +			if (rename_dst[i].pair)
>>>> +				continue; /* already found exact match */
>>>> +			add_if_missing(options->repo, &to_fetch, rename_dst[i].two);
>>>
>>> Could this be reversed instead to avoid the "continue"?
>> 
>> Hmm...I prefer the "return early" approach, but can change it if others
>> prefer to avoid the "continue" here.
>
> The "return early" approach is great and makes sense unless there is
> only one line of code happening in the other case. Not sure if there
> is any potential that the non-continue case grows in size or not.
>
> Doesn't hurt that much to have the "return early" approach, as you
> wrote it.

Even with just one statement after the continue, in this particular
case, the logic seems to flow a bit more naturally.  "Let's see each
item in this list.  ah, this has already been processed so let's
move on.  otherwise, we may need to do something a bit more."  It
also saves one indentation level for the logic that matters ;-)

>>>> +		for (i = 0; i < rename_src_nr; i++)
>>>> +			add_if_missing(options->repo, &to_fetch, rename_src[i].p->one);
>>>
>>> Does this not have the equivalent "rename_src[i].pair" logic for exact
>>> matches?

One source blob can be copied to multiple destination path, with and
without modification, but we currently do not detect the case where
a destination blob is a concatenation of two source blobs.  So we
can optimize the destination side ("we are done with it, no need to
look---we won't find anything better anyway as we've found the exact
copy source") but we cannot do the same optimization on the source
side ("yes, this one was copied to path A, but path B may have a
copy with slight modification), I would think.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 0/2] Restrict when prefetcing occurs
  2020-03-31  2:04 [PATCH] diff: restrict when prefetching occurs Jonathan Tan
  2020-03-31 12:14 ` Derrick Stolee
  2020-03-31 18:15 ` Junio C Hamano
@ 2020-04-02 19:19 ` Jonathan Tan
  2020-04-02 19:19   ` [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
                     ` (2 more replies)
  2020-04-07 22:11 ` [PATCH v3 0/4] " Jonathan Tan
  3 siblings, 3 replies; 26+ messages in thread
From: Jonathan Tan @ 2020-04-02 19:19 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, gitster, stolee

Thanks, everyone, for your review.

New in v2:
 - added restriction on fetching rename_src's blob, following Stolee's
   comment
 - folded oid_nr==0 check into promisor_remote_get_direct(), following
   Stolee's comment
 - used "mv server/b server/c", following Stolee's comment
 - made diff_add_if_missing() public function, following Junio's comment

I didn't change the "continue" part that Stolee suggested [1].

[1] https://lore.kernel.org/git/xmqqlfng75cl.fsf@gitster.c.googlers.com/

Jonathan Tan (2):
  promisor-remote: accept 0 as oid_nr in function
  diff: restrict when prefetching occurs

 builtin/index-pack.c          |  5 ++--
 diff.c                        | 49 +++++++++++++++++++++++------------
 diffcore-rename.c             | 37 +++++++++++++++++++++++++-
 diffcore.h                    | 10 ++++++-
 promisor-remote.c             |  3 +++
 promisor-remote.h             |  8 ++++++
 t/t4067-diff-partial-clone.sh | 48 ++++++++++++++++++++++++++++++++++
 unpack-trees.c                |  5 ++--
 8 files changed, 141 insertions(+), 24 deletions(-)

-- 
2.26.0.292.g33ef6b2f38-goog


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function
  2020-04-02 19:19 ` [PATCH v2 0/2] Restrict when prefetcing occurs Jonathan Tan
@ 2020-04-02 19:19   ` Jonathan Tan
  2020-04-02 19:46     ` Junio C Hamano
  2020-04-02 19:19   ` [PATCH v2 2/2] diff: restrict when prefetching occurs Jonathan Tan
  2020-04-02 20:28   ` [PATCH v2 0/2] Restrict when prefetcing occurs Junio C Hamano
  2 siblings, 1 reply; 26+ messages in thread
From: Jonathan Tan @ 2020-04-02 19:19 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, gitster, stolee

There are 3 callers to promisor_remote_get_direct() that first check if
the number of objects to be fetched is equal to 0. Fold that check into
promisor_remote_get_direct(), and in doing so, be explicit as to what
promisor_remote_get_direct() does if oid_nr is 0 (it returns 0, success,
immediately).

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 builtin/index-pack.c |  5 ++---
 diff.c               | 11 +++++------
 promisor-remote.c    |  3 +++
 promisor-remote.h    |  8 ++++++++
 unpack-trees.c       |  5 ++---
 5 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index d967d188a3..f176dd28c8 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1368,9 +1368,8 @@ static void fix_unresolved_deltas(struct hashfile *f)
 				continue;
 			oid_array_append(&to_fetch, &d->oid);
 		}
-		if (to_fetch.nr)
-			promisor_remote_get_direct(the_repository,
-						   to_fetch.oid, to_fetch.nr);
+		promisor_remote_get_direct(the_repository,
+					   to_fetch.oid, to_fetch.nr);
 		oid_array_clear(&to_fetch);
 	}
 
diff --git a/diff.c b/diff.c
index 1010d806f5..f01b4d91b8 100644
--- a/diff.c
+++ b/diff.c
@@ -6520,12 +6520,11 @@ void diffcore_std(struct diff_options *options)
 			add_if_missing(options->repo, &to_fetch, p->one);
 			add_if_missing(options->repo, &to_fetch, p->two);
 		}
-		if (to_fetch.nr)
-			/*
-			 * NEEDSWORK: Consider deduplicating the OIDs sent.
-			 */
-			promisor_remote_get_direct(options->repo,
-						   to_fetch.oid, to_fetch.nr);
+		/*
+		 * NEEDSWORK: Consider deduplicating the OIDs sent.
+		 */
+		promisor_remote_get_direct(options->repo,
+					   to_fetch.oid, to_fetch.nr);
 		oid_array_clear(&to_fetch);
 	}
 
diff --git a/promisor-remote.c b/promisor-remote.c
index 9f338c945f..2155dfe657 100644
--- a/promisor-remote.c
+++ b/promisor-remote.c
@@ -241,6 +241,9 @@ int promisor_remote_get_direct(struct repository *repo,
 	int to_free = 0;
 	int res = -1;
 
+	if (oid_nr == 0)
+		return 0;
+
 	promisor_remote_init();
 
 	for (r = promisors; r; r = r->next) {
diff --git a/promisor-remote.h b/promisor-remote.h
index 737bac3a33..6343c47d18 100644
--- a/promisor-remote.h
+++ b/promisor-remote.h
@@ -20,6 +20,14 @@ struct promisor_remote {
 void promisor_remote_reinit(void);
 struct promisor_remote *promisor_remote_find(const char *remote_name);
 int has_promisor_remote(void);
+
+/*
+ * Fetches all requested objects from all promisor remotes, trying them one at
+ * a time until all objects are fetched. Returns 0 upon success, and non-zero
+ * otherwise.
+ *
+ * If oid_nr is 0, this function returns 0 (success) immediately.
+ */
 int promisor_remote_get_direct(struct repository *repo,
 			       const struct object_id *oids,
 			       int oid_nr);
diff --git a/unpack-trees.c b/unpack-trees.c
index f618a644ef..4c3191b947 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -423,9 +423,8 @@ static int check_updates(struct unpack_trees_options *o)
 				continue;
 			oid_array_append(&to_fetch, &ce->oid);
 		}
-		if (to_fetch.nr)
-			promisor_remote_get_direct(the_repository,
-						   to_fetch.oid, to_fetch.nr);
+		promisor_remote_get_direct(the_repository,
+					   to_fetch.oid, to_fetch.nr);
 		oid_array_clear(&to_fetch);
 	}
 	for (i = 0; i < index->cache_nr; i++) {
-- 
2.26.0.292.g33ef6b2f38-goog


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 2/2] diff: restrict when prefetching occurs
  2020-04-02 19:19 ` [PATCH v2 0/2] Restrict when prefetcing occurs Jonathan Tan
  2020-04-02 19:19   ` [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
@ 2020-04-02 19:19   ` Jonathan Tan
  2020-04-02 20:08     ` Junio C Hamano
  2020-04-02 20:28   ` [PATCH v2 0/2] Restrict when prefetcing occurs Junio C Hamano
  2 siblings, 1 reply; 26+ messages in thread
From: Jonathan Tan @ 2020-04-02 19:19 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, gitster, stolee, Jeff King

Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In particular, if
(1) no blob data is included in the output of the diff, (2)
break-rewrite detection is not requested, and (3) no inexact rename
detection is needed, then no blobs are read at all.

Therefore, in such a case, do not prefetch. Change diffcore_std() to
only prefetch if (1) and/or (2) is not true (currently, it always
prefetches); change diffcore_rename() to prefetch if (3) is not true and
no prefetch has yet occurred.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 diff.c                        | 38 +++++++++++++++++++--------
 diffcore-rename.c             | 37 ++++++++++++++++++++++++++-
 diffcore.h                    | 10 +++++++-
 t/t4067-diff-partial-clone.sh | 48 +++++++++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+), 12 deletions(-)

diff --git a/diff.c b/diff.c
index f01b4d91b8..857f02f481 100644
--- a/diff.c
+++ b/diff.c
@@ -6494,9 +6494,9 @@ void diffcore_fix_diff_index(void)
 	QSORT(q->queue, q->nr, diffnamecmp);
 }
 
-static void add_if_missing(struct repository *r,
-			   struct oid_array *to_fetch,
-			   const struct diff_filespec *filespec)
+void diff_add_if_missing(struct repository *r,
+			 struct oid_array *to_fetch,
+			 const struct diff_filespec *filespec)
 {
 	if (filespec && filespec->oid_valid &&
 	    !S_ISGITLINK(filespec->mode) &&
@@ -6507,24 +6507,42 @@ static void add_if_missing(struct repository *r,
 
 void diffcore_std(struct diff_options *options)
 {
-	if (options->repo == the_repository && has_promisor_remote()) {
-		/*
-		 * Prefetch the diff pairs that are about to be flushed.
-		 */
+	int prefetched = 0;
+	int output_formats_to_prefetch = DIFF_FORMAT_DIFFSTAT |
+		DIFF_FORMAT_NUMSTAT |
+		DIFF_FORMAT_PATCH |
+		DIFF_FORMAT_SHORTSTAT |
+		DIFF_FORMAT_DIRSTAT;
+
+	/*
+	 * Check if the user requested a blob-data-requiring diff output and/or
+	 * break-rewrite detection (which requires blob data). If yes, prefetch
+	 * the diff pairs.
+	 *
+	 * If no prefetching occurs, diffcore_rename() will prefetch if it
+	 * decides that it needs inexact rename detection.
+	 */
+	if (options->repo == the_repository && has_promisor_remote() &&
+	    (options->output_format & output_formats_to_prefetch ||
+	     (!options->found_follow && options->break_opt != -1))) {
 		int i;
 		struct diff_queue_struct *q = &diff_queued_diff;
 		struct oid_array to_fetch = OID_ARRAY_INIT;
 
 		for (i = 0; i < q->nr; i++) {
 			struct diff_filepair *p = q->queue[i];
-			add_if_missing(options->repo, &to_fetch, p->one);
-			add_if_missing(options->repo, &to_fetch, p->two);
+			diff_add_if_missing(options->repo, &to_fetch, p->one);
+			diff_add_if_missing(options->repo, &to_fetch, p->two);
 		}
+
+		prefetched = 1;
+
 		/*
 		 * NEEDSWORK: Consider deduplicating the OIDs sent.
 		 */
 		promisor_remote_get_direct(options->repo,
 					   to_fetch.oid, to_fetch.nr);
+
 		oid_array_clear(&to_fetch);
 	}
 
@@ -6537,7 +6555,7 @@ void diffcore_std(struct diff_options *options)
 			diffcore_break(options->repo,
 				       options->break_opt);
 		if (options->detect_rename)
-			diffcore_rename(options);
+			diffcore_rename(options, prefetched);
 		if (options->break_opt != -1)
 			diffcore_merge_broken();
 	}
diff --git a/diffcore-rename.c b/diffcore-rename.c
index e189f407af..79ac1b4bee 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -7,6 +7,7 @@
 #include "object-store.h"
 #include "hashmap.h"
 #include "progress.h"
+#include "promisor-remote.h"
 
 /* Table of rename/copy destinations */
 
@@ -448,7 +449,7 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
 	return count;
 }
 
-void diffcore_rename(struct diff_options *options)
+void diffcore_rename(struct diff_options *options, int prefetched)
 {
 	int detect_rename = options->detect_rename;
 	int minimum_score = options->rename_score;
@@ -538,6 +539,40 @@ void diffcore_rename(struct diff_options *options)
 		break;
 	}
 
+	if (!prefetched) {
+		/*
+		 * At this point we know there's actual work to do: we have rename
+		 * destinations that didn't find an exact match, and we have potential
+		 * sources. So we'll have to do inexact rename detection, which
+		 * requires looking at the blobs.
+		 *
+		 * If we haven't already prefetched, it's worth pre-fetching
+		 * them as a group now.
+		 */
+		int i;
+		struct oid_array to_fetch = OID_ARRAY_INIT;
+
+		for (i = 0; i < rename_dst_nr; i++) {
+			if (rename_dst[i].pair)
+				continue; /* already found exact match */
+			diff_add_if_missing(options->repo, &to_fetch, rename_dst[i].two);
+		}
+		for (i = 0; i < rename_src_nr; i++) {
+			if (skip_unmodified &&
+			    diff_unmodified_pair(rename_src[i].p))
+				/*
+				 * The "for" loop below will not need these
+				 * blobs, so skip prefetching.
+				 */
+				continue;
+			diff_add_if_missing(options->repo, &to_fetch, rename_src[i].p->one);
+		}
+		if (to_fetch.nr)
+			promisor_remote_get_direct(options->repo,
+						   to_fetch.oid, to_fetch.nr);
+		oid_array_clear(&to_fetch);
+	}
+
 	if (options->show_rename_progress) {
 		progress = start_delayed_progress(
 				_("Performing inexact rename detection"),
diff --git a/diffcore.h b/diffcore.h
index 7c07347e42..d7af6ab018 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -144,7 +144,7 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *,
 void diff_q(struct diff_queue_struct *, struct diff_filepair *);
 
 void diffcore_break(struct repository *, int);
-void diffcore_rename(struct diff_options *);
+void diffcore_rename(struct diff_options *, int prefetched);
 void diffcore_merge_broken(void);
 void diffcore_pickaxe(struct diff_options *);
 void diffcore_order(const char *orderfile);
@@ -182,4 +182,12 @@ int diffcore_count_changes(struct repository *r,
 			   unsigned long *src_copied,
 			   unsigned long *literal_added);
 
+/*
+ * If filespec contains an OID and if that object is missing from the given
+ * repository, add that OID to to_fetch.
+ */
+void diff_add_if_missing(struct repository *r,
+			 struct oid_array *to_fetch,
+			 const struct diff_filespec *filespec);
+
 #endif
diff --git a/t/t4067-diff-partial-clone.sh b/t/t4067-diff-partial-clone.sh
index 4831ad35e6..c1ed1c2fc4 100755
--- a/t/t4067-diff-partial-clone.sh
+++ b/t/t4067-diff-partial-clone.sh
@@ -131,4 +131,52 @@ test_expect_success 'diff with rename detection batches blobs' '
 	test_line_count = 1 done_lines
 '
 
+test_expect_success 'diff does not fetch anything if inexact rename detection is not needed' '
+	test_when_finished "rm -rf server client trace" &&
+
+	test_create_repo server &&
+	echo a >server/a &&
+	printf "b\nb\nb\nb\nb\n" >server/b &&
+	git -C server add a b &&
+	git -C server commit -m x &&
+	mv server/b server/c &&
+	git -C server add c &&
+	git -C server commit -a -m x &&
+
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+	# Ensure no fetches.
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
+	! test_path_exists trace
+'
+
+test_expect_success 'diff --break-rewrites fetches only if necessary, and batches blobs if it does' '
+	test_when_finished "rm -rf server client trace" &&
+
+	test_create_repo server &&
+	echo a >server/a &&
+	printf "b\nb\nb\nb\nb\n" >server/b &&
+	git -C server add a b &&
+	git -C server commit -m x &&
+	printf "c\nc\nc\nc\nc\n" >server/b &&
+	git -C server commit -a -m x &&
+
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+	# Ensure no fetches.
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
+	! test_path_exists trace &&
+
+	# But with --break-rewrites, ensure that there is exactly 1 negotiation
+	# by checking that there is only 1 "done" line sent. ("done" marks the
+	# end of negotiation.)
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --break-rewrites --raw -M HEAD^ HEAD &&
+	grep "git> done" trace >done_lines &&
+	test_line_count = 1 done_lines
+'
+
 test_done
-- 
2.26.0.292.g33ef6b2f38-goog


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function
  2020-04-02 19:19   ` [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
@ 2020-04-02 19:46     ` Junio C Hamano
  2020-04-02 23:01       ` Jonathan Tan
  0 siblings, 1 reply; 26+ messages in thread
From: Junio C Hamano @ 2020-04-02 19:46 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, stolee

Jonathan Tan <jonathantanmy@google.com> writes:

> There are 3 callers to promisor_remote_get_direct() that first check if
> the number of objects to be fetched is equal to 0. Fold that check into
> promisor_remote_get_direct(), and in doing so, be explicit as to what
> promisor_remote_get_direct() does if oid_nr is 0 (it returns 0, success,
> immediately).

>
> Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> ---
>  builtin/index-pack.c |  5 ++---
>  diff.c               | 11 +++++------
>  promisor-remote.c    |  3 +++
>  promisor-remote.h    |  8 ++++++++
>  unpack-trees.c       |  5 ++---
>  5 files changed, 20 insertions(+), 12 deletions(-)

Nice simplification.

> +/*
> + * Fetches all requested objects from all promisor remotes, trying them one at
> + * a time until all objects are fetched. Returns 0 upon success, and non-zero
> + * otherwise.

Good.

> + * If oid_nr is 0, this function returns 0 (success) immediately.

Is this worth saying?  If you ask to lazily grab 0 objects, it is
probably clear that no object would be read before the helper
returns.

When oid_nr==0 you are allowed to pass oids==NULL, but otherwise,
oids==NULL would be an error.  Is that the kind of difference you
wanted to point out, I wonder?

> + */
>  int promisor_remote_get_direct(struct repository *repo,
>  			       const struct object_id *oids,
>  			       int oid_nr);

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 2/2] diff: restrict when prefetching occurs
  2020-04-02 19:19   ` [PATCH v2 2/2] diff: restrict when prefetching occurs Jonathan Tan
@ 2020-04-02 20:08     ` Junio C Hamano
  2020-04-02 23:09       ` Jonathan Tan
  0 siblings, 1 reply; 26+ messages in thread
From: Junio C Hamano @ 2020-04-02 20:08 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, stolee, Jeff King

Jonathan Tan <jonathantanmy@google.com> writes:

> +	int prefetched = 0;
> +	int output_formats_to_prefetch = DIFF_FORMAT_DIFFSTAT |
> +		DIFF_FORMAT_NUMSTAT |
> +		DIFF_FORMAT_PATCH |
> +		DIFF_FORMAT_SHORTSTAT |
> +		DIFF_FORMAT_DIRSTAT;

Would this want to be a "const int" (or even #define), I wonder.  I
do not care too much between the two, but leaving it as a variable
makes me a bit nervous.

> +	/*
> +	 * Check if the user requested a blob-data-requiring diff output and/or
> +	 * break-rewrite detection (which requires blob data). If yes, prefetch
> +	 * the diff pairs.
> +	 *
> +	 * If no prefetching occurs, diffcore_rename() will prefetch if it
> +	 * decides that it needs inexact rename detection.
> +	 */

Name-only etc. that Derrick mentioned in the other thread would be
relevant only when rename detection is active, and you'd do that in
diffcore_rename().  Good.

> +	if (options->repo == the_repository && has_promisor_remote() &&
> +	    (options->output_format & output_formats_to_prefetch ||
> +	     (!options->found_follow && options->break_opt != -1))) {
>  		int i;
>  		struct diff_queue_struct *q = &diff_queued_diff;
>  		struct oid_array to_fetch = OID_ARRAY_INIT;
>  
>  		for (i = 0; i < q->nr; i++) {
>  			struct diff_filepair *p = q->queue[i];
> -			add_if_missing(options->repo, &to_fetch, p->one);
> -			add_if_missing(options->repo, &to_fetch, p->two);
> +			diff_add_if_missing(options->repo, &to_fetch, p->one);
> +			diff_add_if_missing(options->repo, &to_fetch, p->two);
>  		}
> +
> +		prefetched = 1;
> +

Wouldn't it logically make more sense to do this after calling
promisor_remote_get_direct() and if to_fetch.nr is not 0, ...

>  		/*
>  		 * NEEDSWORK: Consider deduplicating the OIDs sent.
>  		 */
>  		promisor_remote_get_direct(options->repo,
>  					   to_fetch.oid, to_fetch.nr);
> +

... namely, here?

When (q->nr != 0), to_fetch.nr may not be zero, I suspect, but the
original code before [1/2] protected against to_fetch.nr==0 case, so
...?

>  		oid_array_clear(&to_fetch);
>  	}
>  
> @@ -6537,7 +6555,7 @@ void diffcore_std(struct diff_options *options)
>  			diffcore_break(options->repo,
>  				       options->break_opt);
>  		if (options->detect_rename)
> -			diffcore_rename(options);
> +			diffcore_rename(options, prefetched);
>  		if (options->break_opt != -1)
>  			diffcore_merge_broken();
>  	}
> diff --git a/diffcore-rename.c b/diffcore-rename.c
> index e189f407af..79ac1b4bee 100644
> --- a/diffcore-rename.c
> +++ b/diffcore-rename.c
> @@ -7,6 +7,7 @@
>  #include "object-store.h"
>  #include "hashmap.h"
>  #include "progress.h"
> +#include "promisor-remote.h"
>  
>  /* Table of rename/copy destinations */
>  
> @@ -448,7 +449,7 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
>  	return count;
>  }
>  
> -void diffcore_rename(struct diff_options *options)
> +void diffcore_rename(struct diff_options *options, int prefetched)
>  {
>  	int detect_rename = options->detect_rename;
>  	int minimum_score = options->rename_score;
> @@ -538,6 +539,40 @@ void diffcore_rename(struct diff_options *options)
>  		break;
>  	}
>  
> +	if (!prefetched) {
> +		/*
> +		 * At this point we know there's actual work to do: we have rename
> +		 * destinations that didn't find an exact match, and we have potential
> +		 * sources. So we'll have to do inexact rename detection, which
> +		 * requires looking at the blobs.
> +		 *
> +		 * If we haven't already prefetched, it's worth pre-fetching
> +		 * them as a group now.
> +		 */

This comment makes me wonder if it would be even better to

 - prepare an empty to_fetch OID array in the caller,

 - if the output format is one of the ones that wants prefetch, add
   object names to to_fetch in the caller, BUT not fetch there.

 - pass &to_fetch by the caller to this function, and this code here
   may add even more objects,

 - then do the prefetch here (so a single promisor interaction will
   grab objects the caller would have fetched before calling us and
   the ones we want here), and then clear the to_fetch array.

 - the caller, after seeing this function returns, checks to_fetch
   and if it is not empty, fetches (i.e. the caller prepared list of
   objects based on the output type, we ended up not calling this
   helper, and then finally the caller does the prefetch).

That way, the "unless we have already prefetched" logic can go, and
we can lose one indentation level, no?


> +		int i;
> +		struct oid_array to_fetch = OID_ARRAY_INIT;
> +
> +		for (i = 0; i < rename_dst_nr; i++) {
> +			if (rename_dst[i].pair)
> +				continue; /* already found exact match */
> +			diff_add_if_missing(options->repo, &to_fetch, rename_dst[i].two);
> +		}
> +		for (i = 0; i < rename_src_nr; i++) {
> +			if (skip_unmodified &&
> +			    diff_unmodified_pair(rename_src[i].p))
> +				/*
> +				 * The "for" loop below will not need these
> +				 * blobs, so skip prefetching.
> +				 */
> +				continue;
> +			diff_add_if_missing(options->repo, &to_fetch, rename_src[i].p->one);
> +		}
> +		if (to_fetch.nr)
> +			promisor_remote_get_direct(options->repo,
> +						   to_fetch.oid, to_fetch.nr);

You no longer need the if(), no?

> +		oid_array_clear(&to_fetch);
> +	}

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 0/2] Restrict when prefetcing occurs
  2020-04-02 19:19 ` [PATCH v2 0/2] Restrict when prefetcing occurs Jonathan Tan
  2020-04-02 19:19   ` [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
  2020-04-02 19:19   ` [PATCH v2 2/2] diff: restrict when prefetching occurs Jonathan Tan
@ 2020-04-02 20:28   ` Junio C Hamano
  2020-04-06 11:44     ` Derrick Stolee
  2 siblings, 1 reply; 26+ messages in thread
From: Junio C Hamano @ 2020-04-02 20:28 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, stolee, Garima Singh

Jonathan Tan <jonathantanmy@google.com> writes:

> Thanks, everyone, for your review.
>
> New in v2:
>  - added restriction on fetching rename_src's blob, following Stolee's
>    comment
>  - folded oid_nr==0 check into promisor_remote_get_direct(), following
>    Stolee's comment
>  - used "mv server/b server/c", following Stolee's comment
>  - made diff_add_if_missing() public function, following Junio's comment
>
> I didn't change the "continue" part that Stolee suggested [1].
>
> [1] https://lore.kernel.org/git/xmqqlfng75cl.fsf@gitster.c.googlers.com/
>
> Jonathan Tan (2):
>   promisor-remote: accept 0 as oid_nr in function
>   diff: restrict when prefetching occurs
>
>  builtin/index-pack.c          |  5 ++--
>  diff.c                        | 49 +++++++++++++++++++++++------------
>  diffcore-rename.c             | 37 +++++++++++++++++++++++++-
>  diffcore.h                    | 10 ++++++-
>  promisor-remote.c             |  3 +++
>  promisor-remote.h             |  8 ++++++
>  t/t4067-diff-partial-clone.sh | 48 ++++++++++++++++++++++++++++++++++
>  unpack-trees.c                |  5 ++--
>  8 files changed, 141 insertions(+), 24 deletions(-)

I notice that a439b4ef (diff: skip batch object download when
possible, 2020-03-30) by Garima seems to aim for something similar.

I'll for now keep both topics with conflict resolution, but it may
make sense for you two to compare notes.  

I especially like the way this series enumerates the formats that
matter to prefetching and the way the change is localized in
diffcore_std(); the other topic splits a similar logic (with
different criteria) between diff_setup_done() and diffcore_std(),
which I found suboptimal.

Thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function
  2020-04-02 19:46     ` Junio C Hamano
@ 2020-04-02 23:01       ` Jonathan Tan
  0 siblings, 0 replies; 26+ messages in thread
From: Jonathan Tan @ 2020-04-02 23:01 UTC (permalink / raw)
  To: gitster; +Cc: jonathantanmy, git, stolee

> > + * If oid_nr is 0, this function returns 0 (success) immediately.
> 
> Is this worth saying?  If you ask to lazily grab 0 objects, it is
> probably clear that no object would be read before the helper
> returns.
> 
> When oid_nr==0 you are allowed to pass oids==NULL, but otherwise,
> oids==NULL would be an error.  Is that the kind of difference you
> wanted to point out, I wonder?

Thanks for taking a look. Yes that was the difference I wanted to point
out. After some thought, maybe I'll replace it with "oids points to an
array of OIDs of size oid_nr. If oid_nr is 0, oids can be anything.".

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 2/2] diff: restrict when prefetching occurs
  2020-04-02 20:08     ` Junio C Hamano
@ 2020-04-02 23:09       ` Jonathan Tan
  2020-04-02 23:25         ` Junio C Hamano
  2020-04-02 23:54         ` Junio C Hamano
  0 siblings, 2 replies; 26+ messages in thread
From: Jonathan Tan @ 2020-04-02 23:09 UTC (permalink / raw)
  To: gitster; +Cc: jonathantanmy, git, stolee, peff

> > +	int output_formats_to_prefetch = DIFF_FORMAT_DIFFSTAT |
> > +		DIFF_FORMAT_NUMSTAT |
> > +		DIFF_FORMAT_PATCH |
> > +		DIFF_FORMAT_SHORTSTAT |
> > +		DIFF_FORMAT_DIRSTAT;
> 
> Would this want to be a "const int" (or even #define), I wonder.  I
> do not care too much between the two, but leaving it as a variable
> makes me a bit nervous.

OK, will switch to "const int".

> > +	if (options->repo == the_repository && has_promisor_remote() &&
> > +	    (options->output_format & output_formats_to_prefetch ||
> > +	     (!options->found_follow && options->break_opt != -1))) {
> >  		int i;
> >  		struct diff_queue_struct *q = &diff_queued_diff;
> >  		struct oid_array to_fetch = OID_ARRAY_INIT;
> >  
> >  		for (i = 0; i < q->nr; i++) {
> >  			struct diff_filepair *p = q->queue[i];
> > -			add_if_missing(options->repo, &to_fetch, p->one);
> > -			add_if_missing(options->repo, &to_fetch, p->two);
> > +			diff_add_if_missing(options->repo, &to_fetch, p->one);
> > +			diff_add_if_missing(options->repo, &to_fetch, p->two);
> >  		}
> > +
> > +		prefetched = 1;
> > +
> 
> Wouldn't it logically make more sense to do this after calling
> promisor_remote_get_direct() and if to_fetch.nr is not 0, ...
> 
> >  		/*
> >  		 * NEEDSWORK: Consider deduplicating the OIDs sent.
> >  		 */
> >  		promisor_remote_get_direct(options->repo,
> >  					   to_fetch.oid, to_fetch.nr);
> > +
> 
> ... namely, here?
> 
> When (q->nr != 0), to_fetch.nr may not be zero, I suspect, but the
> original code before [1/2] protected against to_fetch.nr==0 case, so
> ...?

My idea is that this prefetch is a superset of what diffcore_rebase()
wants to prefetch, so if we have already done the necessary logic here
(even if nothing gets prefetched - which might be the case if we have
all objects), we do not need to do it in diffcore_rebase().

> > +	if (!prefetched) {
> > +		/*
> > +		 * At this point we know there's actual work to do: we have rename
> > +		 * destinations that didn't find an exact match, and we have potential
> > +		 * sources. So we'll have to do inexact rename detection, which
> > +		 * requires looking at the blobs.
> > +		 *
> > +		 * If we haven't already prefetched, it's worth pre-fetching
> > +		 * them as a group now.
> > +		 */
> 
> This comment makes me wonder if it would be even better to
> 
>  - prepare an empty to_fetch OID array in the caller,
> 
>  - if the output format is one of the ones that wants prefetch, add
>    object names to to_fetch in the caller, BUT not fetch there.
> 
>  - pass &to_fetch by the caller to this function, and this code here
>    may add even more objects,
> 
>  - then do the prefetch here (so a single promisor interaction will
>    grab objects the caller would have fetched before calling us and
>    the ones we want here), and then clear the to_fetch array.
> 
>  - the caller, after seeing this function returns, checks to_fetch
>    and if it is not empty, fetches (i.e. the caller prepared list of
>    objects based on the output type, we ended up not calling this
>    helper, and then finally the caller does the prefetch).
> 
> That way, the "unless we have already prefetched" logic can go, and
> we can lose one indentation level, no?

This means that the only prefetch occurs in diffcore_rename()? I don't
think this will work for 2 reasons:

 - diffcore_std() calls diffcore_break() (which also reads blobs) before
   diffcore_rename()
 - (more importantly) there's a code path in diffcore_std() that does
   not call diffcore_rename(), so we would still need some prefetching
   logic in diffcore_std() in case diffcore_rename() is not called

> > +		if (to_fetch.nr)
> > +			promisor_remote_get_direct(options->repo,
> > +						   to_fetch.oid, to_fetch.nr);
> 
> You no longer need the if(), no?

Ah...I'll remove the if().

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 2/2] diff: restrict when prefetching occurs
  2020-04-02 23:09       ` Jonathan Tan
@ 2020-04-02 23:25         ` Junio C Hamano
  2020-04-02 23:54         ` Junio C Hamano
  1 sibling, 0 replies; 26+ messages in thread
From: Junio C Hamano @ 2020-04-02 23:25 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, stolee, peff

Jonathan Tan <jonathantanmy@google.com> writes:

>> This comment makes me wonder if it would be even better to
>> 
>>  - prepare an empty to_fetch OID array in the caller,
>> 
>>  - if the output format is one of the ones that wants prefetch, add
>>    object names to to_fetch in the caller, BUT not fetch there.
>> 
>>  - pass &to_fetch by the caller to this function, and this code here
>>    may add even more objects,
>> 
>>  - then do the prefetch here (so a single promisor interaction will
>>    grab objects the caller would have fetched before calling us and
>>    the ones we want here), and then clear the to_fetch array.
>> 
>>  - the caller, after seeing this function returns, checks to_fetch
>>    and if it is not empty, fetches (i.e. the caller prepared list of
>>    objects based on the output type, we ended up not calling this
>>    helper, and then finally the caller does the prefetch).
>> 
>> That way, the "unless we have already prefetched" logic can go, and
>> we can lose one indentation level, no?
>
> This means that the only prefetch occurs in diffcore_rename()?

No, but I phrased the last bullet item incorrectly.  "after seeing
this function returns" is wrong, but what is in parentheses (i.e. if
we didn't call diffcore_rename) is correct.

> I don't
> think this will work for 2 reasons:
>
>  - diffcore_std() calls diffcore_break() (which also reads blobs) before
>    diffcore_rename()

Ahh, I missed that part.

>  - (more importantly) there's a code path in diffcore_std() that does
>    not call diffcore_rename(), so we would still need some prefetching
>    logic in diffcore_std() in case diffcore_rename() is not called

That one I think is already covered.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 2/2] diff: restrict when prefetching occurs
  2020-04-02 23:09       ` Jonathan Tan
  2020-04-02 23:25         ` Junio C Hamano
@ 2020-04-02 23:54         ` Junio C Hamano
  2020-04-03 21:35           ` Jonathan Tan
  1 sibling, 1 reply; 26+ messages in thread
From: Junio C Hamano @ 2020-04-02 23:54 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, stolee, peff

Jonathan Tan <jonathantanmy@google.com> writes:

> My idea is that this prefetch is a superset of what diffcore_rebase()
> wants to prefetch, so if we have already done the necessary logic here
> (even if nothing gets prefetched - which might be the case if we have
> all objects), we do not need to do it in diffcore_rebase().

s/rebase/rename/ I presume, but the above reasoning, while it may
happen to hold true right now, feels brittle.  In other words

 - how do we know it would stay to be "a superset"?

 - would it change the picture if we later added a prefetch in
   diffcore_break(), just like you are doing so to diffcore_rename()
   in this patch?

So the revised version of my earlier "wondering" is if it would be
more future-proof (easier to teach different steps to prefetch for
their own needs, without having to make an assumption like "what
this step needs is sufficient for the other step") to arrange the
codepath from diffcore_std() to its helpers like so:

    - prepare an empty to_fetch OID array in the caller,

    - if the output format is one of the ones that wants prefetch,
      add object names to to_fetch in the caller, but not fetch as
      long as the caller does not yet need the contents of the
      blobs.

    - pass &to_fetch from diffcore_std() to the helper functions in
      the diffcore family like diffcore_{break,rename}() have them
      also batch what they (may) want to prefetch in there.  Delay
      fetching until they actually need to look at the blobs, and
      when they fetch, clear &to_fetch for the next helper.

    - diffcore_std() also would need to look at the blob eventually,
      perhaps after all the helpers it may call returns.  Do the
      final prefetch if to_fetch is still not empty before it has to
      look at the blobs.

Thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 2/2] diff: restrict when prefetching occurs
  2020-04-02 23:54         ` Junio C Hamano
@ 2020-04-03 21:35           ` Jonathan Tan
  2020-04-03 22:12             ` Junio C Hamano
  0 siblings, 1 reply; 26+ messages in thread
From: Jonathan Tan @ 2020-04-03 21:35 UTC (permalink / raw)
  To: gitster; +Cc: jonathantanmy, git, stolee, peff

> Jonathan Tan <jonathantanmy@google.com> writes:
> 
> > My idea is that this prefetch is a superset of what diffcore_rebase()
> > wants to prefetch, so if we have already done the necessary logic here
> > (even if nothing gets prefetched - which might be the case if we have
> > all objects), we do not need to do it in diffcore_rebase().
> 
> s/rebase/rename/ I presume,

Ah, yes.

> but the above reasoning, while it may
> happen to hold true right now, feels brittle.  In other words
> 
>  - how do we know it would stay to be "a superset"?
> 
>  - would it change the picture if we later added a prefetch in
>    diffcore_break(), just like you are doing so to diffcore_rename()
>    in this patch?
> 
> So the revised version of my earlier "wondering" is if it would be
> more future-proof (easier to teach different steps to prefetch for
> their own needs, without having to make an assumption like "what
> this step needs is sufficient for the other step") to arrange the
> codepath from diffcore_std() to its helpers like so:
> 
>     - prepare an empty to_fetch OID array in the caller,
> 
>     - if the output format is one of the ones that wants prefetch,
>       add object names to to_fetch in the caller, but not fetch as
>       long as the caller does not yet need the contents of the
>       blobs.
> 
>     - pass &to_fetch from diffcore_std() to the helper functions in
>       the diffcore family like diffcore_{break,rename}() have them
>       also batch what they (may) want to prefetch in there.  Delay
>       fetching until they actually need to look at the blobs, and
>       when they fetch, clear &to_fetch for the next helper.
> 
>     - diffcore_std() also would need to look at the blob eventually,
>       perhaps after all the helpers it may call returns.  Do the
>       final prefetch if to_fetch is still not empty before it has to
>       look at the blobs.

Ah...that makes sense. Besides the accumulating of prefetch targets
(which makes deduplication more necessary - I might make a
"sort_and_uniq" function on oid_array that updates the oid_array
in-place), looking at the code, it's not only diffcore_break() which
might need prefetching, but diffcore_skip_stat_unmatch() too (not to
speak of the functions that come after diffcore_rename()). The least
brittle way is probably to have diff_populate_filespec() do the
prefetching. I'll take a further look.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 2/2] diff: restrict when prefetching occurs
  2020-04-03 21:35           ` Jonathan Tan
@ 2020-04-03 22:12             ` Junio C Hamano
  0 siblings, 0 replies; 26+ messages in thread
From: Junio C Hamano @ 2020-04-03 22:12 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, stolee, peff

Jonathan Tan <jonathantanmy@google.com> writes:

> Ah...that makes sense. Besides the accumulating of prefetch targets
> (which makes deduplication more necessary - I might make a
> "sort_and_uniq" function on oid_array that updates the oid_array
> in-place), looking at the code, it's not only diffcore_break() which
> might need prefetching, but diffcore_skip_stat_unmatch() too (not to
> speak of the functions that come after diffcore_rename()). The least
> brittle way is probably to have diff_populate_filespec() do the
> prefetching. I'll take a further look.

It might be a losing battle, unless we can somehow cleanly have a
two pass approach where we ask various codepaths "enumerate blobs
that you think you would need prefetching in this oid list" before
letting any of them actually look at blobs and perform their main
tasks, do a single prefetch and then let the existing age-old code
call the diffcore transformations as if there is no need for it to
worry about prefetching ;-)

Thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 0/2] Restrict when prefetcing occurs
  2020-04-02 20:28   ` [PATCH v2 0/2] Restrict when prefetcing occurs Junio C Hamano
@ 2020-04-06 11:44     ` Derrick Stolee
  2020-04-06 11:57       ` Garima Singh
  0 siblings, 1 reply; 26+ messages in thread
From: Derrick Stolee @ 2020-04-06 11:44 UTC (permalink / raw)
  To: Junio C Hamano, Jonathan Tan; +Cc: git, Garima Singh

On 4/2/2020 4:28 PM, Junio C Hamano wrote:
> I notice that a439b4ef (diff: skip batch object download when
> possible, 2020-03-30) by Garima seems to aim for something similar.
> 
> I'll for now keep both topics with conflict resolution, but it may
> make sense for you two to compare notes.

I pointed this out in [1]. I think the right thing to do is for
Garima's/my patch to rely on Jonathan's change. The commit needs
to be modified, not simply ejected, but it could be separated from
the rest of Garima's series. It is only a performance fix for
normal clones, but is critical for partial clones.

Garima: do you think it would be easy to remove that patch if/when
you do a v4 and I can make a new series based on yours and Jonathan's
with the rename setting?

[1] https://lore.kernel.org/git/b956528c-412b-2f38-bd90-1fa2ae4b8604@gmail.com/
Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 0/2] Restrict when prefetcing occurs
  2020-04-06 11:44     ` Derrick Stolee
@ 2020-04-06 11:57       ` Garima Singh
  0 siblings, 0 replies; 26+ messages in thread
From: Garima Singh @ 2020-04-06 11:57 UTC (permalink / raw)
  To: Derrick Stolee, Junio C Hamano, Jonathan Tan; +Cc: git, Garima Singh


On 4/6/2020 7:44 AM, Derrick Stolee wrote:
> On 4/2/2020 4:28 PM, Junio C Hamano wrote:
>> I notice that a439b4ef (diff: skip batch object download when
>> possible, 2020-03-30) by Garima seems to aim for something similar.
>>
>> I'll for now keep both topics with conflict resolution, but it may
>> make sense for you two to compare notes.
> 
> I pointed this out in [1]. I think the right thing to do is for
> Garima's/my patch to rely on Jonathan's change. The commit needs
> to be modified, not simply ejected, but it could be separated from
> the rest of Garima's series. It is only a performance fix for
> normal clones, but is critical for partial clones.
> 
> Garima: do you think it would be easy to remove that patch if/when
> you do a v4 and I can make a new series based on yours and Jonathan's
> with the rename setting?
> 

Sure. I was thinking about rebasing my series on top of Jonathan's 
and adjusting as necessary, but it might be easier to just remove it 
and then have a new series based on mine and Jonathan's, like you
suggested. 

There hasn't been any feedback since I sent out v3. I will just re-roll
v4 without this patch, to make sure pu no longer requires conflict
resolution around the Bloom filter series. 

Cheers! 
Garima Singh

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 0/4] Restrict when prefetcing occurs
  2020-03-31  2:04 [PATCH] diff: restrict when prefetching occurs Jonathan Tan
                   ` (2 preceding siblings ...)
  2020-04-02 19:19 ` [PATCH v2 0/2] Restrict when prefetcing occurs Jonathan Tan
@ 2020-04-07 22:11 ` Jonathan Tan
  2020-04-07 22:11   ` [PATCH v3 1/4] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
                     ` (3 more replies)
  3 siblings, 4 replies; 26+ messages in thread
From: Jonathan Tan @ 2020-04-07 22:11 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, garimasigit, gitster

Junio wrote in [1]:

> s/rebase/rename/ I presume, but the above reasoning, while it may
> happen to hold true right now, feels brittle.  In other words
> 
>  - how do we know it would stay to be "a superset"?
> 
>  - would it change the picture if we later added a prefetch in
>    diffcore_break(), just like you are doing so to diffcore_rename()
>    in this patch?

and suggested that each function be capable of prefetching. I've done
that for most functions in this new version.

To avoid the potential slowdown of each function doing its own object
existence checks (that is, looping through all the relevant OIDs and
then prefetching based on whether it found one missing), what I did is
to teach diff_populate_filespec() to retry whenever it attempts to read
a missing object, calling a callback before its 2nd (and final) try. I
then taught the functions called by diffcore_std() to pass a prefetching
function as this callback.

The functions I've taught include diffcore_skip_stat_unmatch(). I
couldn't figure out how to trigger this behavior in a test (I can see
that the function is being run, but not how to make it read an object),
but I included the prefetching mechanism in this function anyway for
completeness.

The previous version of my patch [2] made the assumption that the
fetching done at the start of diffcore_std() is a superset of the
fetching done by diffcore_rebase() - hence Junio's comment above about
how we would know that it would stay a superset. With this series, if
ever that no longer holds (and we miss fixing it), rebase would only do
one additional bulk fetch (instead of fetching once for every missing
object).

[1] https://lore.kernel.org/git/xmqqsghl1m0p.fsf@gitster.c.googlers.com/
[2] https://lore.kernel.org/git/a3322cdedf019126305fcead5918d523a1b2dfbc.1585854639.git.jonathantanmy@google.com/

Jonathan Tan (4):
  promisor-remote: accept 0 as oid_nr in function
  diff: make diff_populate_filespec_options struct
  diff: refactor object read
  diff: restrict when prefetching occurs

 builtin/index-pack.c          |   5 +-
 diff.c                        | 157 +++++++++++++++++++++++-----------
 diffcore-break.c              |  12 ++-
 diffcore-rename.c             |  64 ++++++++++++--
 diffcore.h                    |  30 ++++++-
 line-log.c                    |   6 +-
 promisor-remote.c             |   3 +
 promisor-remote.h             |   8 ++
 t/t4067-diff-partial-clone.sh |  48 +++++++++++
 unpack-trees.c                |   5 +-
 10 files changed, 267 insertions(+), 71 deletions(-)

-- 
2.26.0.292.g33ef6b2f38-goog


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 1/4] promisor-remote: accept 0 as oid_nr in function
  2020-04-07 22:11 ` [PATCH v3 0/4] " Jonathan Tan
@ 2020-04-07 22:11   ` Jonathan Tan
  2020-04-07 22:11   ` [PATCH v3 2/4] diff: make diff_populate_filespec_options struct Jonathan Tan
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Jonathan Tan @ 2020-04-07 22:11 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, garimasigit, gitster

There are 3 callers to promisor_remote_get_direct() that first check if
the number of objects to be fetched is equal to 0. Fold that check into
promisor_remote_get_direct(), and in doing so, be explicit as to what
promisor_remote_get_direct() does if oid_nr is 0 (it returns 0, success,
immediately).

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 builtin/index-pack.c |  5 ++---
 diff.c               | 11 +++++------
 promisor-remote.c    |  3 +++
 promisor-remote.h    |  8 ++++++++
 unpack-trees.c       |  5 ++---
 5 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index d967d188a3..f176dd28c8 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1368,9 +1368,8 @@ static void fix_unresolved_deltas(struct hashfile *f)
 				continue;
 			oid_array_append(&to_fetch, &d->oid);
 		}
-		if (to_fetch.nr)
-			promisor_remote_get_direct(the_repository,
-						   to_fetch.oid, to_fetch.nr);
+		promisor_remote_get_direct(the_repository,
+					   to_fetch.oid, to_fetch.nr);
 		oid_array_clear(&to_fetch);
 	}
 
diff --git a/diff.c b/diff.c
index 1010d806f5..f01b4d91b8 100644
--- a/diff.c
+++ b/diff.c
@@ -6520,12 +6520,11 @@ void diffcore_std(struct diff_options *options)
 			add_if_missing(options->repo, &to_fetch, p->one);
 			add_if_missing(options->repo, &to_fetch, p->two);
 		}
-		if (to_fetch.nr)
-			/*
-			 * NEEDSWORK: Consider deduplicating the OIDs sent.
-			 */
-			promisor_remote_get_direct(options->repo,
-						   to_fetch.oid, to_fetch.nr);
+		/*
+		 * NEEDSWORK: Consider deduplicating the OIDs sent.
+		 */
+		promisor_remote_get_direct(options->repo,
+					   to_fetch.oid, to_fetch.nr);
 		oid_array_clear(&to_fetch);
 	}
 
diff --git a/promisor-remote.c b/promisor-remote.c
index 9f338c945f..2155dfe657 100644
--- a/promisor-remote.c
+++ b/promisor-remote.c
@@ -241,6 +241,9 @@ int promisor_remote_get_direct(struct repository *repo,
 	int to_free = 0;
 	int res = -1;
 
+	if (oid_nr == 0)
+		return 0;
+
 	promisor_remote_init();
 
 	for (r = promisors; r; r = r->next) {
diff --git a/promisor-remote.h b/promisor-remote.h
index 737bac3a33..6343c47d18 100644
--- a/promisor-remote.h
+++ b/promisor-remote.h
@@ -20,6 +20,14 @@ struct promisor_remote {
 void promisor_remote_reinit(void);
 struct promisor_remote *promisor_remote_find(const char *remote_name);
 int has_promisor_remote(void);
+
+/*
+ * Fetches all requested objects from all promisor remotes, trying them one at
+ * a time until all objects are fetched. Returns 0 upon success, and non-zero
+ * otherwise.
+ *
+ * If oid_nr is 0, this function returns 0 (success) immediately.
+ */
 int promisor_remote_get_direct(struct repository *repo,
 			       const struct object_id *oids,
 			       int oid_nr);
diff --git a/unpack-trees.c b/unpack-trees.c
index f618a644ef..4c3191b947 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -423,9 +423,8 @@ static int check_updates(struct unpack_trees_options *o)
 				continue;
 			oid_array_append(&to_fetch, &ce->oid);
 		}
-		if (to_fetch.nr)
-			promisor_remote_get_direct(the_repository,
-						   to_fetch.oid, to_fetch.nr);
+		promisor_remote_get_direct(the_repository,
+					   to_fetch.oid, to_fetch.nr);
 		oid_array_clear(&to_fetch);
 	}
 	for (i = 0; i < index->cache_nr; i++) {
-- 
2.26.0.292.g33ef6b2f38-goog


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 2/4] diff: make diff_populate_filespec_options struct
  2020-04-07 22:11 ` [PATCH v3 0/4] " Jonathan Tan
  2020-04-07 22:11   ` [PATCH v3 1/4] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
@ 2020-04-07 22:11   ` Jonathan Tan
  2020-04-07 23:44     ` Junio C Hamano
  2020-04-07 22:11   ` [PATCH v3 3/4] diff: refactor object read Jonathan Tan
  2020-04-07 22:11   ` [PATCH v3 4/4] diff: restrict when prefetching occurs Jonathan Tan
  3 siblings, 1 reply; 26+ messages in thread
From: Jonathan Tan @ 2020-04-07 22:11 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, garimasigit, gitster

The behavior of diff_populate_filespec() currently can be customized
through a bitflag, but a subsequent patch requires it to support a
non-boolean option. Replace the bitflag with an options struct.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 diff.c            | 54 ++++++++++++++++++++++++++++++-----------------
 diffcore-break.c  |  4 ++--
 diffcore-rename.c | 13 +++++++-----
 diffcore.h        |  9 +++++---
 line-log.c        |  6 +++---
 5 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/diff.c b/diff.c
index f01b4d91b8..f337d837ac 100644
--- a/diff.c
+++ b/diff.c
@@ -573,7 +573,7 @@ static int fill_mmfile(struct repository *r, mmfile_t *mf,
 		mf->size = 0;
 		return 0;
 	}
-	else if (diff_populate_filespec(r, one, 0))
+	else if (diff_populate_filespec(r, one, NULL))
 		return -1;
 
 	mf->ptr = one->data;
@@ -585,9 +585,13 @@ static int fill_mmfile(struct repository *r, mmfile_t *mf,
 static unsigned long diff_filespec_size(struct repository *r,
 					struct diff_filespec *one)
 {
+	struct diff_populate_filespec_options dpf_options = {
+		.check_size_only = 1,
+	};
+
 	if (!DIFF_FILE_VALID(one))
 		return 0;
-	diff_populate_filespec(r, one, CHECK_SIZE_ONLY);
+	diff_populate_filespec(r, one, &dpf_options);
 	return one->size;
 }
 
@@ -3020,6 +3024,9 @@ static void show_dirstat(struct diff_options *options)
 		struct diff_filepair *p = q->queue[i];
 		const char *name;
 		unsigned long copied, added, damage;
+		struct diff_populate_filespec_options dpf_options = {
+			.check_size_only = 1,
+		};
 
 		name = p->two->path ? p->two->path : p->one->path;
 
@@ -3047,19 +3054,19 @@ static void show_dirstat(struct diff_options *options)
 		}
 
 		if (DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
-			diff_populate_filespec(options->repo, p->one, 0);
-			diff_populate_filespec(options->repo, p->two, 0);
+			diff_populate_filespec(options->repo, p->one, NULL);
+			diff_populate_filespec(options->repo, p->two, NULL);
 			diffcore_count_changes(options->repo,
 					       p->one, p->two, NULL, NULL,
 					       &copied, &added);
 			diff_free_filespec_data(p->one);
 			diff_free_filespec_data(p->two);
 		} else if (DIFF_FILE_VALID(p->one)) {
-			diff_populate_filespec(options->repo, p->one, CHECK_SIZE_ONLY);
+			diff_populate_filespec(options->repo, p->one, &dpf_options);
 			copied = added = 0;
 			diff_free_filespec_data(p->one);
 		} else if (DIFF_FILE_VALID(p->two)) {
-			diff_populate_filespec(options->repo, p->two, CHECK_SIZE_ONLY);
+			diff_populate_filespec(options->repo, p->two, &dpf_options);
 			copied = 0;
 			added = p->two->size;
 			diff_free_filespec_data(p->two);
@@ -3339,13 +3346,17 @@ static void emit_binary_diff(struct diff_options *o,
 int diff_filespec_is_binary(struct repository *r,
 			    struct diff_filespec *one)
 {
+	struct diff_populate_filespec_options dpf_options = {
+		.check_binary = 1,
+	};
+
 	if (one->is_binary == -1) {
 		diff_filespec_load_driver(one, r->index);
 		if (one->driver->binary != -1)
 			one->is_binary = one->driver->binary;
 		else {
 			if (!one->data && DIFF_FILE_VALID(one))
-				diff_populate_filespec(r, one, CHECK_BINARY);
+				diff_populate_filespec(r, one, &dpf_options);
 			if (one->is_binary == -1 && one->data)
 				one->is_binary = buffer_is_binary(one->data,
 						one->size);
@@ -3677,8 +3688,8 @@ static void builtin_diffstat(const char *name_a, const char *name_b,
 	}
 
 	else if (complete_rewrite) {
-		diff_populate_filespec(o->repo, one, 0);
-		diff_populate_filespec(o->repo, two, 0);
+		diff_populate_filespec(o->repo, one, NULL);
+		diff_populate_filespec(o->repo, two, NULL);
 		data->deleted = count_lines(one->data, one->size);
 		data->added = count_lines(two->data, two->size);
 	}
@@ -3914,9 +3925,10 @@ static int diff_populate_gitlink(struct diff_filespec *s, int size_only)
  */
 int diff_populate_filespec(struct repository *r,
 			   struct diff_filespec *s,
-			   unsigned int flags)
+			   const struct diff_populate_filespec_options *options)
 {
-	int size_only = flags & CHECK_SIZE_ONLY;
+	int size_only = options ? options->check_size_only : 0;
+	int check_binary = options ? options->check_binary : 0;
 	int err = 0;
 	int conv_flags = global_conv_flags_eol;
 	/*
@@ -3986,7 +3998,7 @@ int diff_populate_filespec(struct repository *r,
 		 * opening the file and inspecting the contents, this
 		 * is probably fine.
 		 */
-		if ((flags & CHECK_BINARY) &&
+		if (check_binary &&
 		    s->size > big_file_threshold && s->is_binary == -1) {
 			s->is_binary = 1;
 			return 0;
@@ -4012,7 +4024,7 @@ int diff_populate_filespec(struct repository *r,
 	}
 	else {
 		enum object_type type;
-		if (size_only || (flags & CHECK_BINARY)) {
+		if (size_only || check_binary) {
 			type = oid_object_info(r, &s->oid, &s->size);
 			if (type < 0)
 				die("unable to read %s",
@@ -4144,7 +4156,7 @@ static struct diff_tempfile *prepare_temp_file(struct repository *r,
 		return temp;
 	}
 	else {
-		if (diff_populate_filespec(r, one, 0))
+		if (diff_populate_filespec(r, one, NULL))
 			die("cannot read data blob for %s", one->path);
 		prep_temp_blob(r->index, name, temp,
 			       one->data, one->size,
@@ -6410,9 +6422,9 @@ static int diff_filespec_is_identical(struct repository *r,
 {
 	if (S_ISGITLINK(one->mode))
 		return 0;
-	if (diff_populate_filespec(r, one, 0))
+	if (diff_populate_filespec(r, one, NULL))
 		return 0;
-	if (diff_populate_filespec(r, two, 0))
+	if (diff_populate_filespec(r, two, NULL))
 		return 0;
 	return !memcmp(one->data, two->data, one->size);
 }
@@ -6420,6 +6432,10 @@ static int diff_filespec_is_identical(struct repository *r,
 static int diff_filespec_check_stat_unmatch(struct repository *r,
 					    struct diff_filepair *p)
 {
+	struct diff_populate_filespec_options dpf_options = {
+		.check_size_only = 1,
+	};
+
 	if (p->done_skip_stat_unmatch)
 		return p->skip_stat_unmatch_result;
 
@@ -6442,8 +6458,8 @@ static int diff_filespec_check_stat_unmatch(struct repository *r,
 	    !DIFF_FILE_VALID(p->two) ||
 	    (p->one->oid_valid && p->two->oid_valid) ||
 	    (p->one->mode != p->two->mode) ||
-	    diff_populate_filespec(r, p->one, CHECK_SIZE_ONLY) ||
-	    diff_populate_filespec(r, p->two, CHECK_SIZE_ONLY) ||
+	    diff_populate_filespec(r, p->one, &dpf_options) ||
+	    diff_populate_filespec(r, p->two, &dpf_options) ||
 	    (p->one->size != p->two->size) ||
 	    !diff_filespec_is_identical(r, p->one, p->two)) /* (2) */
 		p->skip_stat_unmatch_result = 1;
@@ -6773,7 +6789,7 @@ size_t fill_textconv(struct repository *r,
 			*outbuf = "";
 			return 0;
 		}
-		if (diff_populate_filespec(r, df, 0))
+		if (diff_populate_filespec(r, df, NULL))
 			die("unable to read files to diff");
 		*outbuf = df->data;
 		return df->size;
diff --git a/diffcore-break.c b/diffcore-break.c
index 9d20a6a6fc..e8f6322c6a 100644
--- a/diffcore-break.c
+++ b/diffcore-break.c
@@ -62,8 +62,8 @@ static int should_break(struct repository *r,
 	    oideq(&src->oid, &dst->oid))
 		return 0; /* they are the same */
 
-	if (diff_populate_filespec(r, src, 0) ||
-	    diff_populate_filespec(r, dst, 0))
+	if (diff_populate_filespec(r, src, NULL) ||
+	    diff_populate_filespec(r, dst, NULL))
 		return 0; /* error but caught downstream */
 
 	max_size = ((src->size > dst->size) ? src->size : dst->size);
diff --git a/diffcore-rename.c b/diffcore-rename.c
index e189f407af..bf4c0b8740 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -148,6 +148,9 @@ static int estimate_similarity(struct repository *r,
 	 */
 	unsigned long max_size, delta_size, base_size, src_copied, literal_added;
 	int score;
+	struct diff_populate_filespec_options dpf_options = {
+		.check_size_only = 1
+	};
 
 	/* We deal only with regular files.  Symlink renames are handled
 	 * only when they are exact matches --- in other words, no edits
@@ -166,10 +169,10 @@ static int estimate_similarity(struct repository *r,
 	 * say whether the size is valid or not!)
 	 */
 	if (!src->cnt_data &&
-	    diff_populate_filespec(r, src, CHECK_SIZE_ONLY))
+	    diff_populate_filespec(r, src, &dpf_options))
 		return 0;
 	if (!dst->cnt_data &&
-	    diff_populate_filespec(r, dst, CHECK_SIZE_ONLY))
+	    diff_populate_filespec(r, dst, &dpf_options))
 		return 0;
 
 	max_size = ((src->size > dst->size) ? src->size : dst->size);
@@ -187,9 +190,9 @@ static int estimate_similarity(struct repository *r,
 	if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
 		return 0;
 
-	if (!src->cnt_data && diff_populate_filespec(r, src, 0))
+	if (!src->cnt_data && diff_populate_filespec(r, src, NULL))
 		return 0;
-	if (!dst->cnt_data && diff_populate_filespec(r, dst, 0))
+	if (!dst->cnt_data && diff_populate_filespec(r, dst, NULL))
 		return 0;
 
 	if (diffcore_count_changes(r, src, dst,
@@ -261,7 +264,7 @@ static unsigned int hash_filespec(struct repository *r,
 				  struct diff_filespec *filespec)
 {
 	if (!filespec->oid_valid) {
-		if (diff_populate_filespec(r, filespec, 0))
+		if (diff_populate_filespec(r, filespec, NULL))
 			return 0;
 		hash_object_file(r->hash_algo, filespec->data, filespec->size,
 				 "blob", &filespec->oid);
diff --git a/diffcore.h b/diffcore.h
index 7c07347e42..3b2020ce93 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -65,9 +65,12 @@ void free_filespec(struct diff_filespec *);
 void fill_filespec(struct diff_filespec *, const struct object_id *,
 		   int, unsigned short);
 
-#define CHECK_SIZE_ONLY 1
-#define CHECK_BINARY    2
-int diff_populate_filespec(struct repository *, struct diff_filespec *, unsigned int);
+struct diff_populate_filespec_options {
+	unsigned check_size_only : 1;
+	unsigned check_binary : 1;
+};
+int diff_populate_filespec(struct repository *, struct diff_filespec *,
+			   const struct diff_populate_filespec_options *);
 void diff_free_filespec_data(struct diff_filespec *);
 void diff_free_filespec_blob(struct diff_filespec *);
 int diff_filespec_is_binary(struct repository *, struct diff_filespec *);
diff --git a/line-log.c b/line-log.c
index 9010e00950..40e1738dbb 100644
--- a/line-log.c
+++ b/line-log.c
@@ -519,7 +519,7 @@ static void fill_line_ends(struct repository *r,
 	unsigned long *ends = NULL;
 	char *data = NULL;
 
-	if (diff_populate_filespec(r, spec, 0))
+	if (diff_populate_filespec(r, spec, NULL))
 		die("Cannot read blob %s", oid_to_hex(&spec->oid));
 
 	ALLOC_ARRAY(ends, size);
@@ -1045,12 +1045,12 @@ static int process_diff_filepair(struct rev_info *rev,
 		return 0;
 
 	assert(pair->two->oid_valid);
-	diff_populate_filespec(rev->diffopt.repo, pair->two, 0);
+	diff_populate_filespec(rev->diffopt.repo, pair->two, NULL);
 	file_target.ptr = pair->two->data;
 	file_target.size = pair->two->size;
 
 	if (pair->one->oid_valid) {
-		diff_populate_filespec(rev->diffopt.repo, pair->one, 0);
+		diff_populate_filespec(rev->diffopt.repo, pair->one, NULL);
 		file_parent.ptr = pair->one->data;
 		file_parent.size = pair->one->size;
 	} else {
-- 
2.26.0.292.g33ef6b2f38-goog


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 3/4] diff: refactor object read
  2020-04-07 22:11 ` [PATCH v3 0/4] " Jonathan Tan
  2020-04-07 22:11   ` [PATCH v3 1/4] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
  2020-04-07 22:11   ` [PATCH v3 2/4] diff: make diff_populate_filespec_options struct Jonathan Tan
@ 2020-04-07 22:11   ` Jonathan Tan
  2020-04-07 22:11   ` [PATCH v3 4/4] diff: restrict when prefetching occurs Jonathan Tan
  3 siblings, 0 replies; 26+ messages in thread
From: Jonathan Tan @ 2020-04-07 22:11 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, garimasigit, gitster

Refactor the object reads in diff_populate_filespec() to have the first
object read not be in an if/else branch, because in a future patch, a
retry will be added to that first object read.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 diff.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/diff.c b/diff.c
index f337d837ac..8db981b906 100644
--- a/diff.c
+++ b/diff.c
@@ -4023,12 +4023,22 @@ int diff_populate_filespec(struct repository *r,
 		}
 	}
 	else {
-		enum object_type type;
+		struct object_info info = {
+			.sizep = &s->size
+		};
+
+		if (!(size_only || check_binary))
+			/*
+			 * Set contentp, since there is no chance that merely
+			 * the size is sufficient.
+			 */
+			info.contentp = &s->data;
+
+		if (oid_object_info_extended(r, &s->oid, &info,
+					     OBJECT_INFO_LOOKUP_REPLACE))
+			die("unable to read %s", oid_to_hex(&s->oid));
+
 		if (size_only || check_binary) {
-			type = oid_object_info(r, &s->oid, &s->size);
-			if (type < 0)
-				die("unable to read %s",
-				    oid_to_hex(&s->oid));
 			if (size_only)
 				return 0;
 			if (s->size > big_file_threshold && s->is_binary == -1) {
@@ -4036,9 +4046,12 @@ int diff_populate_filespec(struct repository *r,
 				return 0;
 			}
 		}
-		s->data = repo_read_object_file(r, &s->oid, &type, &s->size);
-		if (!s->data)
-			die("unable to read %s", oid_to_hex(&s->oid));
+		if (!info.contentp) {
+			info.contentp = &s->data;
+			if (oid_object_info_extended(r, &s->oid, &info,
+						     OBJECT_INFO_LOOKUP_REPLACE))
+				die("unable to read %s", oid_to_hex(&s->oid));
+		}
 		s->should_free = 1;
 	}
 	return 0;
-- 
2.26.0.292.g33ef6b2f38-goog


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 4/4] diff: restrict when prefetching occurs
  2020-04-07 22:11 ` [PATCH v3 0/4] " Jonathan Tan
                     ` (2 preceding siblings ...)
  2020-04-07 22:11   ` [PATCH v3 3/4] diff: refactor object read Jonathan Tan
@ 2020-04-07 22:11   ` Jonathan Tan
  3 siblings, 0 replies; 26+ messages in thread
From: Jonathan Tan @ 2020-04-07 22:11 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, garimasigit, gitster, Jeff King

Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.

diffcore_std() may read blobs when it calls the following functions:
 (1) diffcore_skip_stat_unmatch() (controlled by the config variable
     diff.autorefreshindex)
 (2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
     detection)
 (3) diffcore_rename() (for rename detection)
 (4) diffcore_pickaxe() (for detecting addition/deletion of specified
     string)

Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 diff.c                        | 73 ++++++++++++++++++++++++-----------
 diffcore-break.c              | 12 +++++-
 diffcore-rename.c             | 55 ++++++++++++++++++++++++--
 diffcore.h                    | 21 ++++++++++
 t/t4067-diff-partial-clone.sh | 48 +++++++++++++++++++++++
 5 files changed, 181 insertions(+), 28 deletions(-)

diff --git a/diff.c b/diff.c
index 8db981b906..d1ad6a3c4a 100644
--- a/diff.c
+++ b/diff.c
@@ -4034,10 +4034,18 @@ int diff_populate_filespec(struct repository *r,
 			 */
 			info.contentp = &s->data;
 
+		if (options && options->missing_object_cb) {
+			if (!oid_object_info_extended(r, &s->oid, &info,
+						      OBJECT_INFO_LOOKUP_REPLACE |
+						      OBJECT_INFO_SKIP_FETCH_OBJECT))
+				goto object_read;
+			options->missing_object_cb(options->missing_object_data);
+		}
 		if (oid_object_info_extended(r, &s->oid, &info,
 					     OBJECT_INFO_LOOKUP_REPLACE))
 			die("unable to read %s", oid_to_hex(&s->oid));
 
+object_read:
 		if (size_only || check_binary) {
 			if (size_only)
 				return 0;
@@ -6447,6 +6455,8 @@ static int diff_filespec_check_stat_unmatch(struct repository *r,
 {
 	struct diff_populate_filespec_options dpf_options = {
 		.check_size_only = 1,
+		.missing_object_cb = diff_queued_diff_prefetch,
+		.missing_object_data = r,
 	};
 
 	if (p->done_skip_stat_unmatch)
@@ -6523,9 +6533,9 @@ void diffcore_fix_diff_index(void)
 	QSORT(q->queue, q->nr, diffnamecmp);
 }
 
-static void add_if_missing(struct repository *r,
-			   struct oid_array *to_fetch,
-			   const struct diff_filespec *filespec)
+void diff_add_if_missing(struct repository *r,
+			 struct oid_array *to_fetch,
+			 const struct diff_filespec *filespec)
 {
 	if (filespec && filespec->oid_valid &&
 	    !S_ISGITLINK(filespec->mode) &&
@@ -6534,29 +6544,48 @@ static void add_if_missing(struct repository *r,
 		oid_array_append(to_fetch, &filespec->oid);
 }
 
-void diffcore_std(struct diff_options *options)
+void diff_queued_diff_prefetch(void *repository)
 {
-	if (options->repo == the_repository && has_promisor_remote()) {
-		/*
-		 * Prefetch the diff pairs that are about to be flushed.
-		 */
-		int i;
-		struct diff_queue_struct *q = &diff_queued_diff;
-		struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct repository *repo = repository;
+	int i;
+	struct diff_queue_struct *q = &diff_queued_diff;
+	struct oid_array to_fetch = OID_ARRAY_INIT;
 
-		for (i = 0; i < q->nr; i++) {
-			struct diff_filepair *p = q->queue[i];
-			add_if_missing(options->repo, &to_fetch, p->one);
-			add_if_missing(options->repo, &to_fetch, p->two);
-		}
-		/*
-		 * NEEDSWORK: Consider deduplicating the OIDs sent.
-		 */
-		promisor_remote_get_direct(options->repo,
-					   to_fetch.oid, to_fetch.nr);
-		oid_array_clear(&to_fetch);
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		diff_add_if_missing(repo, &to_fetch, p->one);
+		diff_add_if_missing(repo, &to_fetch, p->two);
 	}
 
+	/*
+	 * NEEDSWORK: Consider deduplicating the OIDs sent.
+	 */
+	promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);
+
+	oid_array_clear(&to_fetch);
+}
+
+void diffcore_std(struct diff_options *options)
+{
+	int output_formats_to_prefetch = DIFF_FORMAT_DIFFSTAT |
+		DIFF_FORMAT_NUMSTAT |
+		DIFF_FORMAT_PATCH |
+		DIFF_FORMAT_SHORTSTAT |
+		DIFF_FORMAT_DIRSTAT;
+
+	/*
+	 * Check if the user requested a blob-data-requiring diff output and/or
+	 * break-rewrite detection (which requires blob data). If yes, prefetch
+	 * the diff pairs.
+	 *
+	 * If no prefetching occurs, diffcore_rename() will prefetch if it
+	 * decides that it needs inexact rename detection.
+	 */
+	if (options->repo == the_repository && has_promisor_remote() &&
+	    (options->output_format & output_formats_to_prefetch ||
+	     options->pickaxe_opts & DIFF_PICKAXE_KINDS_MASK))
+		diff_queued_diff_prefetch(options->repo);
+
 	/* NOTE please keep the following in sync with diff_tree_combined() */
 	if (options->skip_stat_unmatch)
 		diffcore_skip_stat_unmatch(options);
diff --git a/diffcore-break.c b/diffcore-break.c
index e8f6322c6a..0d4a14964d 100644
--- a/diffcore-break.c
+++ b/diffcore-break.c
@@ -4,6 +4,7 @@
 #include "cache.h"
 #include "diff.h"
 #include "diffcore.h"
+#include "promisor-remote.h"
 
 static int should_break(struct repository *r,
 			struct diff_filespec *src,
@@ -49,6 +50,8 @@ static int should_break(struct repository *r,
 	unsigned long delta_size, max_size;
 	unsigned long src_copied, literal_added, src_removed;
 
+	struct diff_populate_filespec_options options = { 0 };
+
 	*merge_score_p = 0; /* assume no deletion --- "do not break"
 			     * is the default.
 			     */
@@ -62,8 +65,13 @@ static int should_break(struct repository *r,
 	    oideq(&src->oid, &dst->oid))
 		return 0; /* they are the same */
 
-	if (diff_populate_filespec(r, src, NULL) ||
-	    diff_populate_filespec(r, dst, NULL))
+	if (r == the_repository && has_promisor_remote()) {
+		options.missing_object_cb = diff_queued_diff_prefetch;
+		options.missing_object_data = r;
+	}
+
+	if (diff_populate_filespec(r, src, &options) ||
+	    diff_populate_filespec(r, dst, &options))
 		return 0; /* error but caught downstream */
 
 	max_size = ((src->size > dst->size) ? src->size : dst->size);
diff --git a/diffcore-rename.c b/diffcore-rename.c
index bf4c0b8740..99e63e90f8 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -1,4 +1,5 @@
 /*
+ *
  * Copyright (C) 2005 Junio C Hamano
  */
 #include "cache.h"
@@ -7,6 +8,7 @@
 #include "object-store.h"
 #include "hashmap.h"
 #include "progress.h"
+#include "promisor-remote.h"
 
 /* Table of rename/copy destinations */
 
@@ -128,10 +130,46 @@ struct diff_score {
 	short name_score;
 };
 
+struct prefetch_options {
+	struct repository *repo;
+	int skip_unmodified;
+};
+static void prefetch(void *prefetch_options)
+{
+	struct prefetch_options *options = prefetch_options;
+	int i;
+	struct oid_array to_fetch = OID_ARRAY_INIT;
+
+	for (i = 0; i < rename_dst_nr; i++) {
+		if (rename_dst[i].pair)
+			/*
+			 * The loop in diffcore_rename() will not need these
+			 * blobs, so skip prefetching.
+			 */
+			continue; /* already found exact match */
+		diff_add_if_missing(options->repo, &to_fetch,
+				    rename_dst[i].two);
+	}
+	for (i = 0; i < rename_src_nr; i++) {
+		if (options->skip_unmodified &&
+		    diff_unmodified_pair(rename_src[i].p))
+			/*
+			 * The loop in diffcore_rename() will not need these
+			 * blobs, so skip prefetching.
+			 */
+			continue;
+		diff_add_if_missing(options->repo, &to_fetch,
+				    rename_src[i].p->one);
+	}
+	promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr);
+	oid_array_clear(&to_fetch);
+}
+
 static int estimate_similarity(struct repository *r,
 			       struct diff_filespec *src,
 			       struct diff_filespec *dst,
-			       int minimum_score)
+			       int minimum_score,
+			       int skip_unmodified)
 {
 	/* src points at a file that existed in the original tree (or
 	 * optionally a file in the destination tree) and dst points
@@ -151,6 +189,12 @@ static int estimate_similarity(struct repository *r,
 	struct diff_populate_filespec_options dpf_options = {
 		.check_size_only = 1
 	};
+	struct prefetch_options prefetch_options = {r, skip_unmodified};
+
+	if (r == the_repository && has_promisor_remote()) {
+		dpf_options.missing_object_cb = prefetch;
+		dpf_options.missing_object_data = &prefetch_options;
+	}
 
 	/* We deal only with regular files.  Symlink renames are handled
 	 * only when they are exact matches --- in other words, no edits
@@ -190,9 +234,11 @@ static int estimate_similarity(struct repository *r,
 	if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
 		return 0;
 
-	if (!src->cnt_data && diff_populate_filespec(r, src, NULL))
+	dpf_options.check_size_only = 0;
+
+	if (!src->cnt_data && diff_populate_filespec(r, src, &dpf_options))
 		return 0;
-	if (!dst->cnt_data && diff_populate_filespec(r, dst, NULL))
+	if (!dst->cnt_data && diff_populate_filespec(r, dst, &dpf_options))
 		return 0;
 
 	if (diffcore_count_changes(r, src, dst,
@@ -569,7 +615,8 @@ void diffcore_rename(struct diff_options *options)
 
 			this_src.score = estimate_similarity(options->repo,
 							     one, two,
-							     minimum_score);
+							     minimum_score,
+							     skip_unmodified);
 			this_src.name_score = basename_same(one, two);
 			this_src.dst = i;
 			this_src.src = j;
diff --git a/diffcore.h b/diffcore.h
index 3b2020ce93..d2a63c5c71 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -65,9 +65,22 @@ void free_filespec(struct diff_filespec *);
 void fill_filespec(struct diff_filespec *, const struct object_id *,
 		   int, unsigned short);
 
+/*
+ * Prefetch the entries in diff_queued_diff. The parameter is a pointer to a
+ * struct repository.
+ */
+void diff_queued_diff_prefetch(void *repository);
+
 struct diff_populate_filespec_options {
 	unsigned check_size_only : 1;
 	unsigned check_binary : 1;
+
+	/*
+	 * If an object is missing, diff_populate_filespec() will invoke this
+	 * callback before attempting to read that object again.
+	 */
+	void (*missing_object_cb)(void *);
+	void *missing_object_data;
 };
 int diff_populate_filespec(struct repository *, struct diff_filespec *,
 			   const struct diff_populate_filespec_options *);
@@ -185,4 +198,12 @@ int diffcore_count_changes(struct repository *r,
 			   unsigned long *src_copied,
 			   unsigned long *literal_added);
 
+/*
+ * If filespec contains an OID and if that object is missing from the given
+ * repository, add that OID to to_fetch.
+ */
+void diff_add_if_missing(struct repository *r,
+			 struct oid_array *to_fetch,
+			 const struct diff_filespec *filespec);
+
 #endif
diff --git a/t/t4067-diff-partial-clone.sh b/t/t4067-diff-partial-clone.sh
index 4831ad35e6..c1ed1c2fc4 100755
--- a/t/t4067-diff-partial-clone.sh
+++ b/t/t4067-diff-partial-clone.sh
@@ -131,4 +131,52 @@ test_expect_success 'diff with rename detection batches blobs' '
 	test_line_count = 1 done_lines
 '
 
+test_expect_success 'diff does not fetch anything if inexact rename detection is not needed' '
+	test_when_finished "rm -rf server client trace" &&
+
+	test_create_repo server &&
+	echo a >server/a &&
+	printf "b\nb\nb\nb\nb\n" >server/b &&
+	git -C server add a b &&
+	git -C server commit -m x &&
+	mv server/b server/c &&
+	git -C server add c &&
+	git -C server commit -a -m x &&
+
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+	# Ensure no fetches.
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
+	! test_path_exists trace
+'
+
+test_expect_success 'diff --break-rewrites fetches only if necessary, and batches blobs if it does' '
+	test_when_finished "rm -rf server client trace" &&
+
+	test_create_repo server &&
+	echo a >server/a &&
+	printf "b\nb\nb\nb\nb\n" >server/b &&
+	git -C server add a b &&
+	git -C server commit -m x &&
+	printf "c\nc\nc\nc\nc\n" >server/b &&
+	git -C server commit -a -m x &&
+
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+	# Ensure no fetches.
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
+	! test_path_exists trace &&
+
+	# But with --break-rewrites, ensure that there is exactly 1 negotiation
+	# by checking that there is only 1 "done" line sent. ("done" marks the
+	# end of negotiation.)
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --break-rewrites --raw -M HEAD^ HEAD &&
+	grep "git> done" trace >done_lines &&
+	test_line_count = 1 done_lines
+'
+
 test_done
-- 
2.26.0.292.g33ef6b2f38-goog


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/4] diff: make diff_populate_filespec_options struct
  2020-04-07 22:11   ` [PATCH v3 2/4] diff: make diff_populate_filespec_options struct Jonathan Tan
@ 2020-04-07 23:44     ` Junio C Hamano
  0 siblings, 0 replies; 26+ messages in thread
From: Junio C Hamano @ 2020-04-07 23:44 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, garimasigit

Jonathan Tan <jonathantanmy@google.com> writes:

> The behavior of diff_populate_filespec() currently can be customized
> through a bitflag, but a subsequent patch requires it to support a
> non-boolean option. Replace the bitflag with an options struct.

Hmph, clever :-).

> +	struct diff_populate_filespec_options dpf_options = {
> +		.check_size_only = 1,
> +	};

I would have called this instance of d-p-f-o "check_size_only",
which would make the site that uses it ...

>  	if (!DIFF_FILE_VALID(one))
>  		return 0;
> -	diff_populate_filespec(r, one, CHECK_SIZE_ONLY);
> +	diff_populate_filespec(r, one, &dpf_options);

... easier to understand, especially if we can made it constant, but
that would probably contradict the plan to add more fields to the
structure, so let's see how it goes.

> @@ -3339,13 +3346,17 @@ static void emit_binary_diff(struct diff_options *o,
>  int diff_filespec_is_binary(struct repository *r,
>  			    struct diff_filespec *one)
>  {
> +	struct diff_populate_filespec_options dpf_options = {
> +		.check_binary = 1,
> +	};
> +

The same comment applies to here, too.


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2020-04-07 23:44 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-31  2:04 [PATCH] diff: restrict when prefetching occurs Jonathan Tan
2020-03-31 12:14 ` Derrick Stolee
2020-03-31 16:50   ` Jonathan Tan
2020-03-31 17:48     ` Derrick Stolee
2020-03-31 18:21       ` Junio C Hamano
2020-03-31 18:15 ` Junio C Hamano
2020-04-02 19:19 ` [PATCH v2 0/2] Restrict when prefetcing occurs Jonathan Tan
2020-04-02 19:19   ` [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
2020-04-02 19:46     ` Junio C Hamano
2020-04-02 23:01       ` Jonathan Tan
2020-04-02 19:19   ` [PATCH v2 2/2] diff: restrict when prefetching occurs Jonathan Tan
2020-04-02 20:08     ` Junio C Hamano
2020-04-02 23:09       ` Jonathan Tan
2020-04-02 23:25         ` Junio C Hamano
2020-04-02 23:54         ` Junio C Hamano
2020-04-03 21:35           ` Jonathan Tan
2020-04-03 22:12             ` Junio C Hamano
2020-04-02 20:28   ` [PATCH v2 0/2] Restrict when prefetcing occurs Junio C Hamano
2020-04-06 11:44     ` Derrick Stolee
2020-04-06 11:57       ` Garima Singh
2020-04-07 22:11 ` [PATCH v3 0/4] " Jonathan Tan
2020-04-07 22:11   ` [PATCH v3 1/4] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
2020-04-07 22:11   ` [PATCH v3 2/4] diff: make diff_populate_filespec_options struct Jonathan Tan
2020-04-07 23:44     ` Junio C Hamano
2020-04-07 22:11   ` [PATCH v3 3/4] diff: refactor object read Jonathan Tan
2020-04-07 22:11   ` [PATCH v3 4/4] diff: restrict when prefetching occurs Jonathan Tan

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).