[PATCH 0/2] Prefetch objects in pack-objects

git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed

* [PATCH 0/2] Prefetch objects in pack-objects
@ 2020-07-21  0:21 Jonathan Tan
  2020-07-21  0:21 ` [PATCH 1/2] pack-objects: refactor to oid_object_info_extended Jonathan Tan
  2020-07-21  0:21 ` [PATCH 2/2] pack-objects: prefetch objects to be packed Jonathan Tan
  0 siblings, 2 replies; 13+ messages in thread
From: Jonathan Tan @ 2020-07-21  0:21 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, sluongng

Patch 1 is refactoring, and patch 2 is the main patch which introduces
the feature.

Jonathan Tan (2):
  pack-objects: refactor to oid_object_info_extended
  pack-objects: prefetch objects to be packed

 builtin/pack-objects.c | 40 ++++++++++++++++++++++++++++++++++++----
 t/t5300-pack-object.sh | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 4 deletions(-)

-- 
2.28.0.rc0.105.gf9edc3c819-goog


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/2] pack-objects: refactor to oid_object_info_extended
  2020-07-21  0:21 [PATCH 0/2] Prefetch objects in pack-objects Jonathan Tan
@ 2020-07-21  0:21 ` Jonathan Tan
  2020-07-21  0:21 ` [PATCH 2/2] pack-objects: prefetch objects to be packed Jonathan Tan
  1 sibling, 0 replies; 13+ messages in thread
From: Jonathan Tan @ 2020-07-21  0:21 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, sluongng

Use oid_object_info_extended() instead of oid_object_info() because a
subsequent commit needs to specify an additional flag here.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 builtin/pack-objects.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7016b28485..e09d140eed 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1707,6 +1707,8 @@ static int can_reuse_delta(const struct object_id *base_oid,
 static void check_object(struct object_entry *entry)
 {
 	unsigned long canonical_size;
+	enum object_type type;
+	struct object_info oi = {.typep = &type, .sizep = &canonical_size};
 
 	if (IN_PACK(entry)) {
 		struct packed_git *p = IN_PACK(entry);
@@ -1840,8 +1842,10 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	oe_set_type(entry,
-		    oid_object_info(the_repository, &entry->idx.oid, &canonical_size));
+	if (oid_object_info_extended(the_repository, &entry->idx.oid, &oi,
+				     OBJECT_INFO_LOOKUP_REPLACE) < 0)
+		type = -1;
+	oe_set_type(entry, type);
 	if (entry->type_valid) {
 		SET_SIZE(entry, canonical_size);
 	} else {
-- 
2.28.0.rc0.105.gf9edc3c819-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21  0:21 [PATCH 0/2] Prefetch objects in pack-objects Jonathan Tan
  2020-07-21  0:21 ` [PATCH 1/2] pack-objects: refactor to oid_object_info_extended Jonathan Tan
@ 2020-07-21  0:21 ` Jonathan Tan
  2020-07-21  1:00   ` Junio C Hamano
  1 sibling, 1 reply; 13+ messages in thread
From: Jonathan Tan @ 2020-07-21  0:21 UTC (permalink / raw)
  To: git; +Cc: Jonathan Tan, sluongng

When an object to be packed is noticed to be missing, prefetch all
to-be-packed objects in one batch.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
There have been recent discussions about using QUICK whenever we use
SKIP_FETCH_OBJECT. I don't think it fully applies here, since here we
fully expect the object to be present in the non-partial-clone case.
Having said that, I wouldn't be opposed to adding QUICK and then, if the
object read fails and if the repo is not a partial clone, to retry the
object load (before setting the type to -1).
---
 builtin/pack-objects.c | 36 ++++++++++++++++++++++++++++++++----
 t/t5300-pack-object.sh | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index e09d140eed..ecef5cda44 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -35,6 +35,7 @@
 #include "midx.h"
 #include "trace2.h"
 #include "shallow.h"
+#include "promisor-remote.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
 #define SIZE(obj) oe_size(&to_pack, obj)
@@ -1704,7 +1705,26 @@ static int can_reuse_delta(const struct object_id *base_oid,
 	return 0;
 }
 
-static void check_object(struct object_entry *entry)
+static void prefetch_to_pack(uint32_t object_index_start) {
+	struct oid_array to_fetch = OID_ARRAY_INIT;
+	uint32_t i;
+
+	for (i = object_index_start; i < to_pack.nr_objects; i++) {
+		struct object_entry *entry = to_pack.objects + i;
+
+		if (!oid_object_info_extended(the_repository,
+					      &entry->idx.oid,
+					      NULL,
+					      OBJECT_INFO_FOR_PREFETCH))
+			continue;
+		oid_array_append(&to_fetch, &entry->idx.oid);
+	}
+	promisor_remote_get_direct(the_repository,
+				   to_fetch.oid, to_fetch.nr);
+	oid_array_clear(&to_fetch);
+}
+
+static void check_object(struct object_entry *entry, uint32_t object_index)
 {
 	unsigned long canonical_size;
 	enum object_type type;
@@ -1843,8 +1863,16 @@ static void check_object(struct object_entry *entry)
 	}
 
 	if (oid_object_info_extended(the_repository, &entry->idx.oid, &oi,
-				     OBJECT_INFO_LOOKUP_REPLACE) < 0)
-		type = -1;
+				     OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_LOOKUP_REPLACE) < 0) {
+		if (has_promisor_remote()) {
+			prefetch_to_pack(object_index);
+			if (oid_object_info_extended(the_repository, &entry->idx.oid, &oi,
+						     OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_LOOKUP_REPLACE) < 0)
+				type = -1;
+		} else {
+			type = -1;
+		}
+	}
 	oe_set_type(entry, type);
 	if (entry->type_valid) {
 		SET_SIZE(entry, canonical_size);
@@ -2065,7 +2093,7 @@ static void get_object_details(void)
 
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
-		check_object(entry);
+		check_object(entry, i);
 		if (entry->type_valid &&
 		    oe_size_greater_than(&to_pack, entry, big_file_threshold))
 			entry->no_try_delta = 1;
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index 746cdb626e..d553d0ca46 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -497,4 +497,40 @@ test_expect_success 'make sure index-pack detects the SHA1 collision (large blob
 	)
 '
 
+test_expect_success 'prefetch objects' '
+	rm -rf server client &&
+
+	git init server &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server protocol.version 2 &&
+
+	echo one >server/one &&
+	git -C server add one &&
+	git -C server commit -m one &&
+	git -C server branch one_branch &&
+
+	echo two_a >server/two_a &&
+	echo two_b >server/two_b &&
+	git -C server add two_a two_b &&
+	git -C server commit -m two &&
+
+	echo three >server/three &&
+	git -C server add three &&
+	git -C server commit -m three &&
+	git -C server branch three_branch &&
+
+	# Clone, fetch "two" with blobs excluded, and re-push it. This requires
+	# the client to have the blobs of "two" - verify that these are
+	# prefetched in one batch.
+	git clone --filter=blob:none --single-branch -b one_branch \
+		"file://$(pwd)/server" client &&
+	test_config -C client protocol.version 2 &&
+	TWO=$(git -C server rev-parse three_branch^) &&
+	git -C client fetch --filter=blob:none origin "$TWO" &&
+	GIT_TRACE_PACKET=$(pwd)/trace git -C client push origin "$TWO":refs/heads/two_branch &&
+	grep "git> done" trace >donelines &&
+	test_line_count = 1 donelines
+'
+
 test_done
-- 
2.28.0.rc0.105.gf9edc3c819-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21  0:21 ` [PATCH 2/2] pack-objects: prefetch objects to be packed Jonathan Tan
@ 2020-07-21  1:00   ` Junio C Hamano
  2020-07-21 16:37     ` Jonathan Tan
  0 siblings, 1 reply; 13+ messages in thread
From: Junio C Hamano @ 2020-07-21  1:00 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, sluongng

Jonathan Tan <jonathantanmy@google.com> writes:

> When an object to be packed is noticed to be missing, prefetch all
> to-be-packed objects in one batch.
>
> Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> ---

Hmph, the resulting codeflow structure feels somewhat iffy.  Perhaps
I am not reading the code correctly, but

 * There is a loop that scans from 0..to_pack.nr_objects and calls
   check_object() for each and every one of them;

 * The called check_object(), when it notices that a missing and
   promised (i.e. to be lazily fetched) object is in the to_pack
   array, asks prefetch_to_pack() to scan from that point to the end
   of that array and grabs all of them that are missing.

It almost feels a lot cleaner to see what is going on in the
resulting code, instead of the way the new "loop" was added, if a
new loop is added _before_ the loop to call check_object() on all
objects in to_pack array as a pre-processing phase when there is a
promisor remote.  That is, after reverting all the change this patch
makes to check_object(), add a new loop in get_object_details() that
looks more or less like so:

	QSORT(sorted_by_offset, to_pack.nr_objects, pack_offset_sort);

+	if (has_promisor_remote())
+		prefetch_to_pack(0);
+
	for (i = 0; i < to_pack.nr_objects; i++) {


Was the patch done this way because scanning the entire array twice
is expensive?  The optimization makes sense to me if certain
conditions are met, like...

 - Most of the time there is no missing object due to promisor, even
   if has_promissor_to_remote() is true;

 - When there are missing objects due to promisor, pack_offset_sort
   will keep them near the end of the array; and

 - Given the oid, oid_object_info_extended() on it with
   OBJECT_INFO_FOR_PREFETCH is expensive.

Only when all these conditions are met, it would avoid unnecessary
overhead by scanning only a very later part of the array by delaying
the point in the array where prefetch_to_pack() starts scanning.

Thanks.

> There have been recent discussions about using QUICK whenever we use
> SKIP_FETCH_OBJECT. I don't think it fully applies here, since here we
> fully expect the object to be present in the non-partial-clone case.
> Having said that, I wouldn't be opposed to adding QUICK and then, if the
> object read fails and if the repo is not a partial clone, to retry the
> object load (before setting the type to -1).
> ---
>  builtin/pack-objects.c | 36 ++++++++++++++++++++++++++++++++----
>  t/t5300-pack-object.sh | 36 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 68 insertions(+), 4 deletions(-)
>
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index e09d140eed..ecef5cda44 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -35,6 +35,7 @@
>  #include "midx.h"
>  #include "trace2.h"
>  #include "shallow.h"
> +#include "promisor-remote.h"
>  
>  #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
>  #define SIZE(obj) oe_size(&to_pack, obj)
> @@ -1704,7 +1705,26 @@ static int can_reuse_delta(const struct object_id *base_oid,
>  	return 0;
>  }
>  
> -static void check_object(struct object_entry *entry)
> +static void prefetch_to_pack(uint32_t object_index_start) {
> +	struct oid_array to_fetch = OID_ARRAY_INIT;
> +	uint32_t i;
> +
> +	for (i = object_index_start; i < to_pack.nr_objects; i++) {
> +		struct object_entry *entry = to_pack.objects + i;
> +
> +		if (!oid_object_info_extended(the_repository,
> +					      &entry->idx.oid,
> +					      NULL,
> +					      OBJECT_INFO_FOR_PREFETCH))
> +			continue;
> +		oid_array_append(&to_fetch, &entry->idx.oid);
> +	}
> +	promisor_remote_get_direct(the_repository,
> +				   to_fetch.oid, to_fetch.nr);
> +	oid_array_clear(&to_fetch);
> +}
> +
> +static void check_object(struct object_entry *entry, uint32_t object_index)
>  {
>  	unsigned long canonical_size;
>  	enum object_type type;
> @@ -1843,8 +1863,16 @@ static void check_object(struct object_entry *entry)
>  	}
>  
>  	if (oid_object_info_extended(the_repository, &entry->idx.oid, &oi,
> -				     OBJECT_INFO_LOOKUP_REPLACE) < 0)
> -		type = -1;
> +				     OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_LOOKUP_REPLACE) < 0) {
> +		if (has_promisor_remote()) {
> +			prefetch_to_pack(object_index);
> +			if (oid_object_info_extended(the_repository, &entry->idx.oid, &oi,
> +						     OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_LOOKUP_REPLACE) < 0)
> +				type = -1;
> +		} else {
> +			type = -1;
> +		}
> +	}
>  	oe_set_type(entry, type);
>  	if (entry->type_valid) {
>  		SET_SIZE(entry, canonical_size);
> @@ -2065,7 +2093,7 @@ static void get_object_details(void)
>  
>  	for (i = 0; i < to_pack.nr_objects; i++) {
>  		struct object_entry *entry = sorted_by_offset[i];
> -		check_object(entry);
> +		check_object(entry, i);
>  		if (entry->type_valid &&
>  		    oe_size_greater_than(&to_pack, entry, big_file_threshold))
>  			entry->no_try_delta = 1;
> diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
> index 746cdb626e..d553d0ca46 100755
> --- a/t/t5300-pack-object.sh
> +++ b/t/t5300-pack-object.sh
> @@ -497,4 +497,40 @@ test_expect_success 'make sure index-pack detects the SHA1 collision (large blob
>  	)
>  '
>  
> +test_expect_success 'prefetch objects' '
> +	rm -rf server client &&
> +
> +	git init server &&
> +	test_config -C server uploadpack.allowanysha1inwant 1 &&
> +	test_config -C server uploadpack.allowfilter 1 &&
> +	test_config -C server protocol.version 2 &&
> +
> +	echo one >server/one &&
> +	git -C server add one &&
> +	git -C server commit -m one &&
> +	git -C server branch one_branch &&
> +
> +	echo two_a >server/two_a &&
> +	echo two_b >server/two_b &&
> +	git -C server add two_a two_b &&
> +	git -C server commit -m two &&
> +
> +	echo three >server/three &&
> +	git -C server add three &&
> +	git -C server commit -m three &&
> +	git -C server branch three_branch &&
> +
> +	# Clone, fetch "two" with blobs excluded, and re-push it. This requires
> +	# the client to have the blobs of "two" - verify that these are
> +	# prefetched in one batch.
> +	git clone --filter=blob:none --single-branch -b one_branch \
> +		"file://$(pwd)/server" client &&
> +	test_config -C client protocol.version 2 &&
> +	TWO=$(git -C server rev-parse three_branch^) &&
> +	git -C client fetch --filter=blob:none origin "$TWO" &&
> +	GIT_TRACE_PACKET=$(pwd)/trace git -C client push origin "$TWO":refs/heads/two_branch &&
> +	grep "git> done" trace >donelines &&
> +	test_line_count = 1 donelines
> +'
> +
>  test_done

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21  1:00   ` Junio C Hamano
@ 2020-07-21 16:37     ` Jonathan Tan
  2020-07-21 19:23       ` Junio C Hamano
  0 siblings, 1 reply; 13+ messages in thread
From: Jonathan Tan @ 2020-07-21 16:37 UTC (permalink / raw)
  To: gitster; +Cc: jonathantanmy, git, sluongng

> Hmph, the resulting codeflow structure feels somewhat iffy.  Perhaps
> I am not reading the code correctly, but
> 
>  * There is a loop that scans from 0..to_pack.nr_objects and calls
>    check_object() for each and every one of them;
> 
>  * The called check_object(), when it notices that a missing and
>    promised (i.e. to be lazily fetched) object is in the to_pack
>    array, asks prefetch_to_pack() to scan from that point to the end
>    of that array and grabs all of them that are missing.
> 
> It almost feels a lot cleaner to see what is going on in the
> resulting code, instead of the way the new "loop" was added, if a
> new loop is added _before_ the loop to call check_object() on all
> objects in to_pack array as a pre-processing phase when there is a
> promisor remote.  That is, after reverting all the change this patch
> makes to check_object(), add a new loop in get_object_details() that
> looks more or less like so:
> 
> 	QSORT(sorted_by_offset, to_pack.nr_objects, pack_offset_sort);
> 
> +	if (has_promisor_remote())
> +		prefetch_to_pack(0);
> +
> 	for (i = 0; i < to_pack.nr_objects; i++) {
> 
> 
> Was the patch done this way because scanning the entire array twice
> is expensive?

Yes. If we called prefetch_to_pack(0) first (as you suggest), this first
scan involves checking the existence of all objects in the array, so I
thought it would be expensive. (Checking the existence of an object
probably brings the corresponding pack index into disk cache on
platforms like Linux, so 2 object reads might not take much more time
than 1 object read, but I didn't want to rely on this when I could just
avoid the extra read.)

> The optimization makes sense to me if certain
> conditions are met, like...
> 
>  - Most of the time there is no missing object due to promisor, even
>    if has_promissor_to_remote() is true;

I think that optimizing for this condition makes sense - most pushes (I
would think) are pushes of objects we create locally, and thus no
objects are missing.

>  - When there are missing objects due to promisor, pack_offset_sort
>    will keep them near the end of the array; and
> 
>  - Given the oid, oid_object_info_extended() on it with
>    OBJECT_INFO_FOR_PREFETCH is expensive.

I see this as expensive since it involves checking of object existence.

> Only when all these conditions are met, it would avoid unnecessary
> overhead by scanning only a very later part of the array by delaying
> the point in the array where prefetch_to_pack() starts scanning.

Yes (and when there are no missing objects at all, there is no
double-scanning).

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21 16:37     ` Jonathan Tan
@ 2020-07-21 19:23       ` Junio C Hamano
  2020-07-21 21:27         ` Junio C Hamano
  2020-07-21 23:20         ` Jonathan Tan
  0 siblings, 2 replies; 13+ messages in thread
From: Junio C Hamano @ 2020-07-21 19:23 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, sluongng

Jonathan Tan <jonathantanmy@google.com> writes:

>> +	if (has_promisor_remote())
>> +		prefetch_to_pack(0);
>> +
>> 	for (i = 0; i < to_pack.nr_objects; i++) {
>> 
>> 
>> Was the patch done this way because scanning the entire array twice
>> is expensive?
>
> Yes. If we called prefetch_to_pack(0) first (as you suggest), this first
> scan involves checking the existence of all objects in the array, so I
> thought it would be expensive. (Checking the existence of an object
> probably brings the corresponding pack index into disk cache on
> platforms like Linux, so 2 object reads might not take much more time
> than 1 object read, but I didn't want to rely on this when I could just
> avoid the extra read.)
>
>> The optimization makes sense to me if certain
>> conditions are met, like...
>> 
>>  - Most of the time there is no missing object due to promisor, even
>>    if has_promissor_to_remote() is true;
>
> I think that optimizing for this condition makes sense - most pushes (I
> would think) are pushes of objects we create locally, and thus no
> objects are missing.
>
>>  - When there are missing objects due to promisor, pack_offset_sort
>>    will keep them near the end of the array; and

I do not see this one got answered, but it is crucial if you want to
argue that the "lazy decision to prefetch at the last moment" is a
good optimization.  If an object in the early part of to_pack array
is missing, you'd end up doing the same amount of work as the
simpler "if promissor is there, prefetch what is missing".

>>  - Given the oid, oid_object_info_extended() on it with
>>    OBJECT_INFO_FOR_PREFETCH is expensive.
>
> I see this as expensive since it involves checking of object existence.

But doesn't the "prefetch before starting the main loop" change the
equation? If we prefetch, we can mark the objects to be prefetched
in prefetch_to_pack() so that the main loop do not even have to
check, so the non-lazy loop taken outside the check_object() and
placed before the main loop would have to run .nr_objects times, in
addition to the main loop that runs .nr_objects times, but you won't
have to call oid_object_info_extended() twice on the same object?

>> Only when all these conditions are met, it would avoid unnecessary
>> overhead by scanning only a very later part of the array by delaying
>> the point in the array where prefetch_to_pack() starts scanning.
>
> Yes (and when there are no missing objects at all, there is no
> double-scanning).

In any case, the design choice needs to be justified in the log
message.  I am not sure if the lazy decision to prefetch at the last
moment is really worth the code smell.  Perhaps it is, if there is a
reason to believe that it would save extra work compared to the more
naive "if we have promissor remote, prefetch what is missing", but I
do not think I've heard that reason yet.

Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21 19:23       ` Junio C Hamano
@ 2020-07-21 21:27         ` Junio C Hamano
  2020-07-21 23:37           ` Jonathan Tan
  2020-07-21 23:20         ` Jonathan Tan
  1 sibling, 1 reply; 13+ messages in thread
From: Junio C Hamano @ 2020-07-21 21:27 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, sluongng

Junio C Hamano <gitster@pobox.com> writes:

>>> The optimization makes sense to me if certain
>>> conditions are met, like...
>>> 
>>>  - Most of the time there is no missing object due to promisor, even
>>>    if has_promissor_to_remote() is true;
>>
>> I think that optimizing for this condition makes sense - most pushes (I
>> would think) are pushes of objects we create locally, and thus no
>> objects are missing.

Another simple thing I missed.  Why do we specifically refer to
"push" here?  Is this codepath in pack-objects not used or less
often used by upload-pack (which is what serves "fetch")?  

I just wanted to make sure that we are not optimizing for "push",
trading efficiency for "fetch" off.

Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21 19:23       ` Junio C Hamano
  2020-07-21 21:27         ` Junio C Hamano
@ 2020-07-21 23:20         ` Jonathan Tan
  2020-07-21 23:51           ` Junio C Hamano
  1 sibling, 1 reply; 13+ messages in thread
From: Jonathan Tan @ 2020-07-21 23:20 UTC (permalink / raw)
  To: gitster; +Cc: jonathantanmy, git, sluongng

> Jonathan Tan <jonathantanmy@google.com> writes:
> 
> >> +	if (has_promisor_remote())
> >> +		prefetch_to_pack(0);
> >> +
> >> 	for (i = 0; i < to_pack.nr_objects; i++) {
> >> 
> >> 
> >> Was the patch done this way because scanning the entire array twice
> >> is expensive?
> >
> > Yes. If we called prefetch_to_pack(0) first (as you suggest), this first
> > scan involves checking the existence of all objects in the array, so I
> > thought it would be expensive. (Checking the existence of an object
> > probably brings the corresponding pack index into disk cache on
> > platforms like Linux, so 2 object reads might not take much more time
> > than 1 object read, but I didn't want to rely on this when I could just
> > avoid the extra read.)
> >
> >> The optimization makes sense to me if certain
> >> conditions are met, like...
> >> 
> >>  - Most of the time there is no missing object due to promisor, even
> >>    if has_promissor_to_remote() is true;
> >
> > I think that optimizing for this condition makes sense - most pushes (I
> > would think) are pushes of objects we create locally, and thus no
> > objects are missing.
> >
> >>  - When there are missing objects due to promisor, pack_offset_sort
> >>    will keep them near the end of the array; and
> 
> I do not see this one got answered, but it is crucial if you want to
> argue that the "lazy decision to prefetch at the last moment" is a
> good optimization.  If an object in the early part of to_pack array
> is missing, you'd end up doing the same amount of work as the
> simpler "if promissor is there, prefetch what is missing".

My argument is that typically *no* objects are missing, so we should
delay the prefetch as much as possible in the hope that we don't need it
at all. I admit that if some objects are missing, I don't know where
they will be in the to_pack list.

> >>  - Given the oid, oid_object_info_extended() on it with
> >>    OBJECT_INFO_FOR_PREFETCH is expensive.
> >
> > I see this as expensive since it involves checking of object existence.
> 
> But doesn't the "prefetch before starting the main loop" change the
> equation? If we prefetch, we can mark the objects to be prefetched
> in prefetch_to_pack() so that the main loop do not even have to
> check, so the non-lazy loop taken outside the check_object() and
> placed before the main loop would have to run .nr_objects times, in
> addition to the main loop that runs .nr_objects times, but you won't
> have to call oid_object_info_extended() twice on the same object?

The main loop (in get_object_details()) calls check_object() for each
iteration, and check_object() calls oid_object_info_extended()
(oid_object_info() before patch 1 of this series) in order to get the
object's type. I don't see how the prefetch oid_object_info_extended()
(in order to check existence) would eliminate the need for the main-loop
oid_object_info_extended() (which obtains the object type), unless we
record the type somewhere during the prefetch - but that would make
things more complicated than they are now, I think.

> >> Only when all these conditions are met, it would avoid unnecessary
> >> overhead by scanning only a very later part of the array by delaying
> >> the point in the array where prefetch_to_pack() starts scanning.
> >
> > Yes (and when there are no missing objects at all, there is no
> > double-scanning).
> 
> In any case, the design choice needs to be justified in the log
> message.  I am not sure if the lazy decision to prefetch at the last
> moment is really worth the code smell.  Perhaps it is, if there is a
> reason to believe that it would save extra work compared to the more
> naive "if we have promissor remote, prefetch what is missing", but I
> do not think I've heard that reason yet.

I still think that there is a reason (the extra existence check), but if
we think that the extra existence check is fast enough (compared to the
other operations in pack-objects) or that there is a way to avoid
calling oid_object_info_extended() twice for the same object (even with
moving the prefetch loop to the beginning), then I agree that we don't
need the lazy decision. (Or if we want to write the simpler code now and
only improve the performance if we need it later, that's fine with me
too.)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21 21:27         ` Junio C Hamano
@ 2020-07-21 23:37           ` Jonathan Tan
  2020-07-21 23:56             ` Junio C Hamano
  0 siblings, 1 reply; 13+ messages in thread
From: Jonathan Tan @ 2020-07-21 23:37 UTC (permalink / raw)
  To: gitster; +Cc: jonathantanmy, git, sluongng

> Junio C Hamano <gitster@pobox.com> writes:
> 
> >>> The optimization makes sense to me if certain
> >>> conditions are met, like...
> >>> 
> >>>  - Most of the time there is no missing object due to promisor, even
> >>>    if has_promissor_to_remote() is true;
> >>
> >> I think that optimizing for this condition makes sense - most pushes (I
> >> would think) are pushes of objects we create locally, and thus no
> >> objects are missing.
> 
> Another simple thing I missed.  Why do we specifically refer to
> "push" here?  Is this codepath in pack-objects not used or less
> often used by upload-pack (which is what serves "fetch")?  
> 
> I just wanted to make sure that we are not optimizing for "push",
> trading efficiency for "fetch" off.

Ah...that's true. For "fetch" I think it's less important because the
multiple roundtrips of the pack negotiation means that the packfiles are
usually more specific, but this optimization will help "fetch" too. I
don't think we're optimizing at the expense of "fetch" - any
improvements should benefit both similarly.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21 23:20         ` Jonathan Tan
@ 2020-07-21 23:51           ` Junio C Hamano
  2020-07-22 21:30             ` Jonathan Tan
  0 siblings, 1 reply; 13+ messages in thread
From: Junio C Hamano @ 2020-07-21 23:51 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, sluongng

Jonathan Tan <jonathantanmy@google.com> writes:

> My argument is that typically *no* objects are missing, so we should
> delay the prefetch as much as possible in the hope that we don't need it
> at all. I admit that if some objects are missing, I don't know where
> they will be in the to_pack list.

OK, if the common case we optimize for is where there is no object
missing, then of course "prefetch missing ones upfront" will spend
cycles to ensure all objects we will pack exist before going into
the main loop, so the "delay as much as possible" optimization makes
sense.

>> In any case, the design choice needs to be justified in the log
>> message.  I am not sure if the lazy decision to prefetch at the last
>> moment is really worth the code smell.  Perhaps it is, if there is a
>> reason to believe that it would save extra work compared to the more
>> naive "if we have promissor remote, prefetch what is missing", but I
>> do not think I've heard that reason yet.
>
> I still think that there is a reason (the extra existence check), but if
> we think that the extra existence check is fast enough (compared to the
> other operations in pack-objects) or that there is a way to avoid
> calling oid_object_info_extended() twice for the same object (even with
> moving the prefetch loop to the beginning), then I agree that we don't
> need the lazy decision. (Or if we want to write the simpler code now and
> only improve the performance if we need it later, that's fine with me
> too.)

Now I finally have, I think, heard some of the reasoning behind the
design decision made before this patch was written.  That should be
written in the proposed log message.

Thanks.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21 23:37           ` Jonathan Tan
@ 2020-07-21 23:56             ` Junio C Hamano
  0 siblings, 0 replies; 13+ messages in thread
From: Junio C Hamano @ 2020-07-21 23:56 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, sluongng

Jonathan Tan <jonathantanmy@google.com> writes:

>> Junio C Hamano <gitster@pobox.com> writes:
>> 
>> >>> The optimization makes sense to me if certain
>> >>> conditions are met, like...
>> >>> 
>> >>>  - Most of the time there is no missing object due to promisor, even
>> >>>    if has_promissor_to_remote() is true;
>> >>
>> >> I think that optimizing for this condition makes sense - most pushes (I
>> >> would think) are pushes of objects we create locally, and thus no
>> >> objects are missing.
>> 
>> Another simple thing I missed.  Why do we specifically refer to
>> "push" here?  Is this codepath in pack-objects not used or less
>> often used by upload-pack (which is what serves "fetch")?  
>> 
>> I just wanted to make sure that we are not optimizing for "push",
>> trading efficiency for "fetch" off.
>
> Ah...that's true. For "fetch" I think it's less important because the
> multiple roundtrips of the pack negotiation means that the packfiles are
> usually more specific, but this optimization will help "fetch" too. I
> don't think we're optimizing at the expense of "fetch" - any
> improvements should benefit both similarly.

The one who serves "fetch" (i.e. "upload-pack") may be advertising
the objects that it itself would have to lazily fetch from its
promisor remote, but that is sort of crazy.  As long as we have
something to discourage such an arrangement (i.e. "don't fetch from
a lazy clone"), we would be OK to assume that most of the time there
is no missing objects.

Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-21 23:51           ` Junio C Hamano
@ 2020-07-22 21:30             ` Jonathan Tan
  2020-07-22 21:45               ` Junio C Hamano
  0 siblings, 1 reply; 13+ messages in thread
From: Jonathan Tan @ 2020-07-22 21:30 UTC (permalink / raw)
  To: gitster; +Cc: jonathantanmy, git, sluongng

> Jonathan Tan <jonathantanmy@google.com> writes:
> 
> > My argument is that typically *no* objects are missing, so we should
> > delay the prefetch as much as possible in the hope that we don't need it
> > at all. I admit that if some objects are missing, I don't know where
> > they will be in the to_pack list.
> 
> OK, if the common case we optimize for is where there is no object
> missing, then of course "prefetch missing ones upfront" will spend
> cycles to ensure all objects we will pack exist before going into
> the main loop, so the "delay as much as possible" optimization makes
> sense.

OK.

> >> In any case, the design choice needs to be justified in the log
> >> message.  I am not sure if the lazy decision to prefetch at the last
> >> moment is really worth the code smell.  Perhaps it is, if there is a
> >> reason to believe that it would save extra work compared to the more
> >> naive "if we have promissor remote, prefetch what is missing", but I
> >> do not think I've heard that reason yet.
> >
> > I still think that there is a reason (the extra existence check), but if
> > we think that the extra existence check is fast enough (compared to the
> > other operations in pack-objects) or that there is a way to avoid
> > calling oid_object_info_extended() twice for the same object (even with
> > moving the prefetch loop to the beginning), then I agree that we don't
> > need the lazy decision. (Or if we want to write the simpler code now and
> > only improve the performance if we need it later, that's fine with me
> > too.)
> 
> Now I finally have, I think, heard some of the reasoning behind the
> design decision made before this patch was written.  That should be
> written in the proposed log message.
> 
> Thanks.

OK - how about:

[start]
When an object to be packed is noticed to be missing, prefetch all
to-be-packed objects in one batch.

Most of the time (typically, when serving a fetch or when pushing),
packs consist only of objects that the repo has. To maintain the
performance in this case, the existing object type read is made to also
serve as the object existence check: if the read fails, then we do the
prefetch.

An alternative design is to loop over all the entries in to_pack,
checking the existence of each object, and prefetching if we notice any
missing. This would result in clearer code, but would incur some
performance degradation in the aforementioned most common case due to
the additional object existence checks.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
[end]

The first paragraph is already in the original, and the rest are
additions.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] pack-objects: prefetch objects to be packed
  2020-07-22 21:30             ` Jonathan Tan
@ 2020-07-22 21:45               ` Junio C Hamano
  0 siblings, 0 replies; 13+ messages in thread
From: Junio C Hamano @ 2020-07-22 21:45 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, sluongng

Jonathan Tan <jonathantanmy@google.com> writes:

> OK - how about:
>
> [start]
> When an object to be packed is noticed to be missing, prefetch all
> to-be-packed objects in one batch.
>
> Most of the time (typically, when serving a fetch or when pushing),
> packs consist only of objects that the repo has. To maintain the
> performance in this case, the existing object type read is made to also
> serve as the object existence check: if the read fails, then we do the
> prefetch.
>
> An alternative design is to loop over all the entries in to_pack,
> checking the existence of each object, and prefetching if we notice any
> missing. This would result in clearer code, but would incur some
> performance degradation in the aforementioned most common case due to
> the additional object existence checks.
>
> Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> [end]
>
> The first paragraph is already in the original, and the rest are
> additions.

Sure.  

The first two paragraphs would be sufficient, although the last
paragraph wouldn't hurt.


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2020-07-22 21:45 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-21  0:21 [PATCH 0/2] Prefetch objects in pack-objects Jonathan Tan
2020-07-21  0:21 ` [PATCH 1/2] pack-objects: refactor to oid_object_info_extended Jonathan Tan
2020-07-21  0:21 ` [PATCH 2/2] pack-objects: prefetch objects to be packed Jonathan Tan
2020-07-21  1:00   ` Junio C Hamano
2020-07-21 16:37     ` Jonathan Tan
2020-07-21 19:23       ` Junio C Hamano
2020-07-21 21:27         ` Junio C Hamano
2020-07-21 23:37           ` Jonathan Tan
2020-07-21 23:56             ` Junio C Hamano
2020-07-21 23:20         ` Jonathan Tan
2020-07-21 23:51           ` Junio C Hamano
2020-07-22 21:30             ` Jonathan Tan
2020-07-22 21:45               ` Junio C Hamano

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).