git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Jonathan Tan <jonathantanmy@google.com>
To: git@vger.kernel.org
Cc: Jonathan Tan <jonathantanmy@google.com>,
	garimasigit@gmail.com, gitster@pobox.com,
	Jeff King <peff@peff.net>
Subject: [PATCH v3 4/4] diff: restrict when prefetching occurs
Date: Tue,  7 Apr 2020 15:11:43 -0700	[thread overview]
Message-ID: <ee3373bc6a27ee1c4015abc84b634c036048e3f0.1586296510.git.jonathantanmy@google.com> (raw)
In-Reply-To: <cover.1586296510.git.jonathantanmy@google.com>

Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.

diffcore_std() may read blobs when it calls the following functions:
 (1) diffcore_skip_stat_unmatch() (controlled by the config variable
     diff.autorefreshindex)
 (2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
     detection)
 (3) diffcore_rename() (for rename detection)
 (4) diffcore_pickaxe() (for detecting addition/deletion of specified
     string)

Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 diff.c                        | 73 ++++++++++++++++++++++++-----------
 diffcore-break.c              | 12 +++++-
 diffcore-rename.c             | 55 ++++++++++++++++++++++++--
 diffcore.h                    | 21 ++++++++++
 t/t4067-diff-partial-clone.sh | 48 +++++++++++++++++++++++
 5 files changed, 181 insertions(+), 28 deletions(-)

diff --git a/diff.c b/diff.c
index 8db981b906..d1ad6a3c4a 100644
--- a/diff.c
+++ b/diff.c
@@ -4034,10 +4034,18 @@ int diff_populate_filespec(struct repository *r,
 			 */
 			info.contentp = &s->data;
 
+		if (options && options->missing_object_cb) {
+			if (!oid_object_info_extended(r, &s->oid, &info,
+						      OBJECT_INFO_LOOKUP_REPLACE |
+						      OBJECT_INFO_SKIP_FETCH_OBJECT))
+				goto object_read;
+			options->missing_object_cb(options->missing_object_data);
+		}
 		if (oid_object_info_extended(r, &s->oid, &info,
 					     OBJECT_INFO_LOOKUP_REPLACE))
 			die("unable to read %s", oid_to_hex(&s->oid));
 
+object_read:
 		if (size_only || check_binary) {
 			if (size_only)
 				return 0;
@@ -6447,6 +6455,8 @@ static int diff_filespec_check_stat_unmatch(struct repository *r,
 {
 	struct diff_populate_filespec_options dpf_options = {
 		.check_size_only = 1,
+		.missing_object_cb = diff_queued_diff_prefetch,
+		.missing_object_data = r,
 	};
 
 	if (p->done_skip_stat_unmatch)
@@ -6523,9 +6533,9 @@ void diffcore_fix_diff_index(void)
 	QSORT(q->queue, q->nr, diffnamecmp);
 }
 
-static void add_if_missing(struct repository *r,
-			   struct oid_array *to_fetch,
-			   const struct diff_filespec *filespec)
+void diff_add_if_missing(struct repository *r,
+			 struct oid_array *to_fetch,
+			 const struct diff_filespec *filespec)
 {
 	if (filespec && filespec->oid_valid &&
 	    !S_ISGITLINK(filespec->mode) &&
@@ -6534,29 +6544,48 @@ static void add_if_missing(struct repository *r,
 		oid_array_append(to_fetch, &filespec->oid);
 }
 
-void diffcore_std(struct diff_options *options)
+void diff_queued_diff_prefetch(void *repository)
 {
-	if (options->repo == the_repository && has_promisor_remote()) {
-		/*
-		 * Prefetch the diff pairs that are about to be flushed.
-		 */
-		int i;
-		struct diff_queue_struct *q = &diff_queued_diff;
-		struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct repository *repo = repository;
+	int i;
+	struct diff_queue_struct *q = &diff_queued_diff;
+	struct oid_array to_fetch = OID_ARRAY_INIT;
 
-		for (i = 0; i < q->nr; i++) {
-			struct diff_filepair *p = q->queue[i];
-			add_if_missing(options->repo, &to_fetch, p->one);
-			add_if_missing(options->repo, &to_fetch, p->two);
-		}
-		/*
-		 * NEEDSWORK: Consider deduplicating the OIDs sent.
-		 */
-		promisor_remote_get_direct(options->repo,
-					   to_fetch.oid, to_fetch.nr);
-		oid_array_clear(&to_fetch);
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		diff_add_if_missing(repo, &to_fetch, p->one);
+		diff_add_if_missing(repo, &to_fetch, p->two);
 	}
 
+	/*
+	 * NEEDSWORK: Consider deduplicating the OIDs sent.
+	 */
+	promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);
+
+	oid_array_clear(&to_fetch);
+}
+
+void diffcore_std(struct diff_options *options)
+{
+	int output_formats_to_prefetch = DIFF_FORMAT_DIFFSTAT |
+		DIFF_FORMAT_NUMSTAT |
+		DIFF_FORMAT_PATCH |
+		DIFF_FORMAT_SHORTSTAT |
+		DIFF_FORMAT_DIRSTAT;
+
+	/*
+	 * Check if the user requested a blob-data-requiring diff output and/or
+	 * break-rewrite detection (which requires blob data). If yes, prefetch
+	 * the diff pairs.
+	 *
+	 * If no prefetching occurs, diffcore_rename() will prefetch if it
+	 * decides that it needs inexact rename detection.
+	 */
+	if (options->repo == the_repository && has_promisor_remote() &&
+	    (options->output_format & output_formats_to_prefetch ||
+	     options->pickaxe_opts & DIFF_PICKAXE_KINDS_MASK))
+		diff_queued_diff_prefetch(options->repo);
+
 	/* NOTE please keep the following in sync with diff_tree_combined() */
 	if (options->skip_stat_unmatch)
 		diffcore_skip_stat_unmatch(options);
diff --git a/diffcore-break.c b/diffcore-break.c
index e8f6322c6a..0d4a14964d 100644
--- a/diffcore-break.c
+++ b/diffcore-break.c
@@ -4,6 +4,7 @@
 #include "cache.h"
 #include "diff.h"
 #include "diffcore.h"
+#include "promisor-remote.h"
 
 static int should_break(struct repository *r,
 			struct diff_filespec *src,
@@ -49,6 +50,8 @@ static int should_break(struct repository *r,
 	unsigned long delta_size, max_size;
 	unsigned long src_copied, literal_added, src_removed;
 
+	struct diff_populate_filespec_options options = { 0 };
+
 	*merge_score_p = 0; /* assume no deletion --- "do not break"
 			     * is the default.
 			     */
@@ -62,8 +65,13 @@ static int should_break(struct repository *r,
 	    oideq(&src->oid, &dst->oid))
 		return 0; /* they are the same */
 
-	if (diff_populate_filespec(r, src, NULL) ||
-	    diff_populate_filespec(r, dst, NULL))
+	if (r == the_repository && has_promisor_remote()) {
+		options.missing_object_cb = diff_queued_diff_prefetch;
+		options.missing_object_data = r;
+	}
+
+	if (diff_populate_filespec(r, src, &options) ||
+	    diff_populate_filespec(r, dst, &options))
 		return 0; /* error but caught downstream */
 
 	max_size = ((src->size > dst->size) ? src->size : dst->size);
diff --git a/diffcore-rename.c b/diffcore-rename.c
index bf4c0b8740..99e63e90f8 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -1,4 +1,5 @@
 /*
+ *
  * Copyright (C) 2005 Junio C Hamano
  */
 #include "cache.h"
@@ -7,6 +8,7 @@
 #include "object-store.h"
 #include "hashmap.h"
 #include "progress.h"
+#include "promisor-remote.h"
 
 /* Table of rename/copy destinations */
 
@@ -128,10 +130,46 @@ struct diff_score {
 	short name_score;
 };
 
+struct prefetch_options {
+	struct repository *repo;
+	int skip_unmodified;
+};
+static void prefetch(void *prefetch_options)
+{
+	struct prefetch_options *options = prefetch_options;
+	int i;
+	struct oid_array to_fetch = OID_ARRAY_INIT;
+
+	for (i = 0; i < rename_dst_nr; i++) {
+		if (rename_dst[i].pair)
+			/*
+			 * The loop in diffcore_rename() will not need these
+			 * blobs, so skip prefetching.
+			 */
+			continue; /* already found exact match */
+		diff_add_if_missing(options->repo, &to_fetch,
+				    rename_dst[i].two);
+	}
+	for (i = 0; i < rename_src_nr; i++) {
+		if (options->skip_unmodified &&
+		    diff_unmodified_pair(rename_src[i].p))
+			/*
+			 * The loop in diffcore_rename() will not need these
+			 * blobs, so skip prefetching.
+			 */
+			continue;
+		diff_add_if_missing(options->repo, &to_fetch,
+				    rename_src[i].p->one);
+	}
+	promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr);
+	oid_array_clear(&to_fetch);
+}
+
 static int estimate_similarity(struct repository *r,
 			       struct diff_filespec *src,
 			       struct diff_filespec *dst,
-			       int minimum_score)
+			       int minimum_score,
+			       int skip_unmodified)
 {
 	/* src points at a file that existed in the original tree (or
 	 * optionally a file in the destination tree) and dst points
@@ -151,6 +189,12 @@ static int estimate_similarity(struct repository *r,
 	struct diff_populate_filespec_options dpf_options = {
 		.check_size_only = 1
 	};
+	struct prefetch_options prefetch_options = {r, skip_unmodified};
+
+	if (r == the_repository && has_promisor_remote()) {
+		dpf_options.missing_object_cb = prefetch;
+		dpf_options.missing_object_data = &prefetch_options;
+	}
 
 	/* We deal only with regular files.  Symlink renames are handled
 	 * only when they are exact matches --- in other words, no edits
@@ -190,9 +234,11 @@ static int estimate_similarity(struct repository *r,
 	if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
 		return 0;
 
-	if (!src->cnt_data && diff_populate_filespec(r, src, NULL))
+	dpf_options.check_size_only = 0;
+
+	if (!src->cnt_data && diff_populate_filespec(r, src, &dpf_options))
 		return 0;
-	if (!dst->cnt_data && diff_populate_filespec(r, dst, NULL))
+	if (!dst->cnt_data && diff_populate_filespec(r, dst, &dpf_options))
 		return 0;
 
 	if (diffcore_count_changes(r, src, dst,
@@ -569,7 +615,8 @@ void diffcore_rename(struct diff_options *options)
 
 			this_src.score = estimate_similarity(options->repo,
 							     one, two,
-							     minimum_score);
+							     minimum_score,
+							     skip_unmodified);
 			this_src.name_score = basename_same(one, two);
 			this_src.dst = i;
 			this_src.src = j;
diff --git a/diffcore.h b/diffcore.h
index 3b2020ce93..d2a63c5c71 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -65,9 +65,22 @@ void free_filespec(struct diff_filespec *);
 void fill_filespec(struct diff_filespec *, const struct object_id *,
 		   int, unsigned short);
 
+/*
+ * Prefetch the entries in diff_queued_diff. The parameter is a pointer to a
+ * struct repository.
+ */
+void diff_queued_diff_prefetch(void *repository);
+
 struct diff_populate_filespec_options {
 	unsigned check_size_only : 1;
 	unsigned check_binary : 1;
+
+	/*
+	 * If an object is missing, diff_populate_filespec() will invoke this
+	 * callback before attempting to read that object again.
+	 */
+	void (*missing_object_cb)(void *);
+	void *missing_object_data;
 };
 int diff_populate_filespec(struct repository *, struct diff_filespec *,
 			   const struct diff_populate_filespec_options *);
@@ -185,4 +198,12 @@ int diffcore_count_changes(struct repository *r,
 			   unsigned long *src_copied,
 			   unsigned long *literal_added);
 
+/*
+ * If filespec contains an OID and if that object is missing from the given
+ * repository, add that OID to to_fetch.
+ */
+void diff_add_if_missing(struct repository *r,
+			 struct oid_array *to_fetch,
+			 const struct diff_filespec *filespec);
+
 #endif
diff --git a/t/t4067-diff-partial-clone.sh b/t/t4067-diff-partial-clone.sh
index 4831ad35e6..c1ed1c2fc4 100755
--- a/t/t4067-diff-partial-clone.sh
+++ b/t/t4067-diff-partial-clone.sh
@@ -131,4 +131,52 @@ test_expect_success 'diff with rename detection batches blobs' '
 	test_line_count = 1 done_lines
 '
 
+test_expect_success 'diff does not fetch anything if inexact rename detection is not needed' '
+	test_when_finished "rm -rf server client trace" &&
+
+	test_create_repo server &&
+	echo a >server/a &&
+	printf "b\nb\nb\nb\nb\n" >server/b &&
+	git -C server add a b &&
+	git -C server commit -m x &&
+	mv server/b server/c &&
+	git -C server add c &&
+	git -C server commit -a -m x &&
+
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+	# Ensure no fetches.
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
+	! test_path_exists trace
+'
+
+test_expect_success 'diff --break-rewrites fetches only if necessary, and batches blobs if it does' '
+	test_when_finished "rm -rf server client trace" &&
+
+	test_create_repo server &&
+	echo a >server/a &&
+	printf "b\nb\nb\nb\nb\n" >server/b &&
+	git -C server add a b &&
+	git -C server commit -m x &&
+	printf "c\nc\nc\nc\nc\n" >server/b &&
+	git -C server commit -a -m x &&
+
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+	# Ensure no fetches.
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --raw -M HEAD^ HEAD &&
+	! test_path_exists trace &&
+
+	# But with --break-rewrites, ensure that there is exactly 1 negotiation
+	# by checking that there is only 1 "done" line sent. ("done" marks the
+	# end of negotiation.)
+	GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff --break-rewrites --raw -M HEAD^ HEAD &&
+	grep "git> done" trace >done_lines &&
+	test_line_count = 1 done_lines
+'
+
 test_done
-- 
2.26.0.292.g33ef6b2f38-goog


      parent reply	other threads:[~2020-04-07 22:11 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-31  2:04 [PATCH] diff: restrict when prefetching occurs Jonathan Tan
2020-03-31 12:14 ` Derrick Stolee
2020-03-31 16:50   ` Jonathan Tan
2020-03-31 17:48     ` Derrick Stolee
2020-03-31 18:21       ` Junio C Hamano
2020-03-31 18:15 ` Junio C Hamano
2020-04-02 19:19 ` [PATCH v2 0/2] Restrict when prefetcing occurs Jonathan Tan
2020-04-02 19:19   ` [PATCH v2 1/2] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
2020-04-02 19:46     ` Junio C Hamano
2020-04-02 23:01       ` Jonathan Tan
2020-04-02 19:19   ` [PATCH v2 2/2] diff: restrict when prefetching occurs Jonathan Tan
2020-04-02 20:08     ` Junio C Hamano
2020-04-02 23:09       ` Jonathan Tan
2020-04-02 23:25         ` Junio C Hamano
2020-04-02 23:54         ` Junio C Hamano
2020-04-03 21:35           ` Jonathan Tan
2020-04-03 22:12             ` Junio C Hamano
2020-04-02 20:28   ` [PATCH v2 0/2] Restrict when prefetcing occurs Junio C Hamano
2020-04-06 11:44     ` Derrick Stolee
2020-04-06 11:57       ` Garima Singh
2020-04-07 22:11 ` [PATCH v3 0/4] " Jonathan Tan
2020-04-07 22:11   ` [PATCH v3 1/4] promisor-remote: accept 0 as oid_nr in function Jonathan Tan
2020-04-07 22:11   ` [PATCH v3 2/4] diff: make diff_populate_filespec_options struct Jonathan Tan
2020-04-07 23:44     ` Junio C Hamano
2020-04-07 22:11   ` [PATCH v3 3/4] diff: refactor object read Jonathan Tan
2020-04-07 22:11   ` Jonathan Tan [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ee3373bc6a27ee1c4015abc84b634c036048e3f0.1586296510.git.jonathantanmy@google.com \
    --to=jonathantanmy@google.com \
    --cc=garimasigit@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).