git@vger.kernel.org mailing list mirror (one of many)
* [PATCH] pack-objects: use streaming interface for reading large loose blobs
@ 2012-05-12 10:26 Nguyễn Thái Ngọc Duy
  2012-05-12 16:51 ` Nicolas Pitre
  2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
  0 siblings, 2 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-12 10:26 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre,
	Nguyễn Thái Ngọc Duy

git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects or hash-object
over a pipe). Or they can come from other git implementations.
core.bigfilethreshold can also be lowered, introducing a new
wave of large loose blobs.

Use streaming interface to read these blobs and compress/write at the
same time.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 index-pack's streaming support is on the way. unpack-objects is
 another story because I'm thinking of merging it back to index-pack
 first, which may take more than one release cycle.

 builtin/pack-objects.c |   73 ++++++++++++++++++++++++++++++++++++++++++++----
 t/t1050-large.sh       |   16 ++++++++++
 2 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..98b51c1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,55 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static void write_large_blob_data(struct sha1file *f, const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	int zret;
+
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		die(_("failed to read %s"), sha1_to_hex(sha1));
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	if (type != OBJ_BLOB)
+		die("BUG: %s is not a blob", sha1_to_hex(sha1));
+
+	for (;;) {
+		ssize_t readlen;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("failed to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		zret = Z_OK;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	close_istream(st);
+	git_deflate_end(&stream);
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
 	if (!to_reuse) {
 		no_reuse:
 		if (!usable_delta) {
-			buf = read_sha1_file(entry->idx.sha1, &type, &size);
-			if (!buf)
-				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			type = sha1_object_info(entry->idx.sha1, &size);
+			if (type == OBJ_BLOB && size > big_file_threshold)
+				buf = NULL;
+			else {
+				buf = read_sha1_file(entry->idx.sha1, &type, &size);
+				if (!buf)
+					die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			}
 			/*
 			 * make sure no cached delta data remains from a
 			 * previous attempt before a pack split occurred.
@@ -284,8 +339,11 @@ static unsigned long write_object(struct sha1file *f,
 
 		if (entry->z_delta_size)
 			datalen = entry->z_delta_size;
-		else
+		else if (buf)
 			datalen = do_compress(&buf, size);
+		else
+			/* large blob case, just assume we don't compress well */
+			datalen = size;
 
 		/*
 		 * The object header is a byte of 'type' followed by zero or
@@ -330,8 +388,11 @@ static unsigned long write_object(struct sha1file *f,
 			}
 			sha1write(f, header, hdrlen);
 		}
-		sha1write(f, buf, datalen);
-		free(buf);
+		if (buf) {
+			sha1write(f, buf, datalen);
+			free(buf);
+		} else
+			write_large_blob_data(f, entry->idx.sha1);
 	}
 	else {
 		struct packed_git *p = entry->in_pack;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..7fbd2e1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,22 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	echo Z | dd of=large4 bs=1k seek=2000 &&
+	OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
+	P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
+	(
+	unset GIT_ALLOC_LIMIT &&
+	cat large4 | git hash-object -w --stdin &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+	) &&
+	echo $OBJ | git pack-objects .git/objects/pack/pack &&
+	rm $P &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.8.36.g69ee2


* Re: [PATCH] pack-objects: use streaming interface for reading large loose blobs
  2012-05-12 10:26 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
@ 2012-05-12 16:51 ` Nicolas Pitre
  2012-05-13  4:37   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
  1 sibling, 1 reply; 16+ messages in thread
From: Nicolas Pitre @ 2012-05-12 16:51 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano


On Sat, 12 May 2012, Nguyễn Thái Ngọc Duy wrote:

> git usually streams large blobs directly to packs. But there are cases
> where git can create large loose blobs (unpack-objects or hash-object
> over a pipe). Or they can come from other git implementations.
> core.bigfilethreshold can also be lowered, introducing a new
> wave of large loose blobs.
> 
> Use streaming interface to read these blobs and compress/write at the
> same time.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Comments below.

> ---
>  index-pack's streaming support is on the way. unpack-objects is
>  another story because I'm thinking of merging it back to index-pack
>  first, which may take more than one release cycle.
> 
>  builtin/pack-objects.c |   73 ++++++++++++++++++++++++++++++++++++++++++++----
>  t/t1050-large.sh       |   16 ++++++++++
>  2 files changed, 83 insertions(+), 6 deletions(-)
> 
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index 1861093..98b51c1 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
>  	if (!to_reuse) {
>  		no_reuse:
>  		if (!usable_delta) {
> -			buf = read_sha1_file(entry->idx.sha1, &type, &size);
> -			if (!buf)
> -				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
> +			type = sha1_object_info(entry->idx.sha1, &size);

Please don't use sha1_object_info() lightly.  This is a potentially 
expensive operation, and you really don't want to do it on each object.

And as a matter of fact, the information you are looking for has already 
been determined earlier.  See the code in check_object() which tries 
hard to avoid sha1_object_info() as much as possible.

Therefore you should have entry->type and entry->size already set for 
you to use.
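
(For reference, the reroll later in this thread does exactly that and drops
sha1_object_info() entirely; condensed from its hunk, the check becomes:)

        /* entry->type and entry->size were filled in by check_object() */
        if (entry->type == OBJ_BLOB && entry->size > big_file_threshold)
                buf = NULL;     /* read via the streaming interface instead */
        else {
                buf = read_sha1_file(entry->idx.sha1, &type, &size);
                if (!buf)
                        die(_("unable to read %s"), sha1_to_hex(entry->idx.sha1));
        }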


Nicolas


* [PATCH v2] pack-objects: use streaming interface for reading large loose blobs
  2012-05-12 16:51 ` Nicolas Pitre
@ 2012-05-13  4:37   ` Nguyễn Thái Ngọc Duy
  2012-05-14 15:56     ` Junio C Hamano
  2012-05-14 19:43     ` Junio C Hamano
  0 siblings, 2 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-13  4:37 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre,
	Nguyễn Thái Ngọc Duy

git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects or hash-object
over a pipe). Or they can come from other git implementations.
core.bigfilethreshold can also be lowered, introducing a new
wave of large loose blobs.

Use streaming interface to read/compress/write these blobs in one go.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 On Sat, May 12, 2012 at 11:51 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
 >> @@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
 >>       if (!to_reuse) {
 >>               no_reuse:
 >>               if (!usable_delta) {
 >> -                     buf = read_sha1_file(entry->idx.sha1, &type, &size);
 >> -                     if (!buf)
 >> -                             die("unable to read %s", sha1_to_hex(entry->idx.sha1));
 >> +                     type = sha1_object_info(entry->idx.sha1, &size);
 >
 > Please don't use sha1_object_info() lightly.  This is a potentially
 > expensive operation, and you really don't want to do it on each object.
 >
 > And as a matter of fact, the information you are looking for has already
 > been determined earlier.  See the code in check_object() which tries
 > hard to avoid sha1_object_info() as much as possible.
 >
 > Therefore you should have entry->type and entry->size already set for
 > you to use.

 Thanks. sha1_object_info() removed in favor of entry->type and entry->size.

 builtin/pack-objects.c |   72 ++++++++++++++++++++++++++++++++++++++++++++----
 t/t1050-large.sh       |   16 ++++++++++
 2 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..ab5438a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,55 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static void write_large_blob_data(struct sha1file *f, const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	int zret;
+
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		die(_("unable to read %s"), sha1_to_hex(sha1));
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	if (type != OBJ_BLOB)
+		die("BUG: %s is not a blob", sha1_to_hex(sha1));
+
+	for (;;) {
+		ssize_t readlen;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("unable to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		zret = Z_OK;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	close_istream(st);
+	git_deflate_end(&stream);
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -259,9 +309,13 @@ static unsigned long write_object(struct sha1file *f,
 	if (!to_reuse) {
 		no_reuse:
 		if (!usable_delta) {
-			buf = read_sha1_file(entry->idx.sha1, &type, &size);
-			if (!buf)
-				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			if (entry->type == OBJ_BLOB && entry->size > big_file_threshold)
+				buf = NULL;
+			else {
+				buf = read_sha1_file(entry->idx.sha1, &type, &size);
+				if (!buf)
+					die(_("unable to read %s"), sha1_to_hex(entry->idx.sha1));
+			}
 			/*
 			 * make sure no cached delta data remains from a
 			 * previous attempt before a pack split occurred.
@@ -284,8 +338,11 @@ static unsigned long write_object(struct sha1file *f,
 
 		if (entry->z_delta_size)
 			datalen = entry->z_delta_size;
-		else
+		else if (buf)
 			datalen = do_compress(&buf, size);
+		else
+			/* large blob case, just assume we don't compress well */
+			datalen = size;
 
 		/*
 		 * The object header is a byte of 'type' followed by zero or
@@ -330,8 +387,11 @@ static unsigned long write_object(struct sha1file *f,
 			}
 			sha1write(f, header, hdrlen);
 		}
-		sha1write(f, buf, datalen);
-		free(buf);
+		if (buf) {
+			sha1write(f, buf, datalen);
+			free(buf);
+		} else
+			write_large_blob_data(f, entry->idx.sha1);
 	}
 	else {
 		struct packed_git *p = entry->in_pack;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..7fbd2e1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,22 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	echo Z | dd of=large4 bs=1k seek=2000 &&
+	OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
+	P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
+	(
+	unset GIT_ALLOC_LIMIT &&
+	cat large4 | git hash-object -w --stdin &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+	) &&
+	echo $OBJ | git pack-objects .git/objects/pack/pack &&
+	rm $P &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.8.36.g69ee2


* Re: [PATCH v2] pack-objects: use streaming interface for reading large loose blobs
  2012-05-13  4:37   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
@ 2012-05-14 15:56     ` Junio C Hamano
  2012-05-14 19:43     ` Junio C Hamano
  1 sibling, 0 replies; 16+ messages in thread
From: Junio C Hamano @ 2012-05-14 15:56 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Nicolas Pitre

Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:

> +	st = open_istream(sha1, &type, &sz, NULL);
> +	if (!st)
> +		die(_("unable to read %s"), sha1_to_hex(sha1));
> +
> +	memset(&stream, 0, sizeof(stream));
> +	git_deflate_init(&stream, pack_compression_level);
> +
> +	if (type != OBJ_BLOB)
> +		die("BUG: %s is not a blob", sha1_to_hex(sha1));

Just a comment, not an expression of preference, but if we are treating
the istream interface as an add-on, I wonder if it might make sense not to
die in these places, and instead fall back to the usual "read and then
write" codepath.

To put it another way, I am wondering if it makes the logic more clear if
you restructure the calling side a bit more, perhaps like this:

        if (!to_reuse) {
                if (!usable_delta) {
                        /* we know we will write as base object */
                        ... do *NOT* do read_sha1_file() here !!!
                } else if ... delta cases {
                        decide delta_data and z_delta_size as before
                }

                if (entry->z_delta_size) {
                        datalen = entry->z_delta_size;
                        hdrlen = encode_in_pack_object_header(type, size, header);
                        ... write either OFS_DELTA or REF_DELTA here
                } else {
                        /* base object case */
                        if (write_object_in_base_representation(f, sha1)) {
                                /* stream interface punted */
                                read_sha1_file();
                                do_compress();
                                write it;
                        }
                }
        } else { /* reuse case ... */

> @@ -259,9 +309,13 @@ static unsigned long write_object(struct sha1file *f,
>  	if (!to_reuse) {
>  		no_reuse:
>  		if (!usable_delta) {
> -			buf = read_sha1_file(entry->idx.sha1, &type, &size);
> -			if (!buf)
> -				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
> +			if (entry->type == OBJ_BLOB && entry->size > big_file_threshold)
> +				buf = NULL;

Two comments:

 - In get_object_details(), we do this:

		if (big_file_threshold <= entry->size)
			entry->no_try_delta = 1;

   The new code is inconsistent with respect to the boundary conditions,
   when the size is exactly at the threshold.

 - The magic condition "a NULL buf means we will read via the streaming
   interface" feels somewhat hacky and tasteless.  I am borderline OK with
   such a hack whose scope is clearly limited to a short function like
   this one, but at least it needs to be documented.  I'd rather see the
   conditionals that look at !buf/buf rewritten to use a new bool
   variable similar to to_reuse and usable_delta whose name tells the
   reader more explicitly what is going on, though (sketched below).
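
(The reroll that follows addresses this by keying the decision off a
"struct git_istream *st" pointer rather than a bare NULL buf; condensed
from the 4/4 hunk, the size/compression decision becomes:)

        struct git_istream *st = NULL;  /* non-NULL only when streaming */
        ...
        if (entry->z_delta_size)
                datalen = entry->z_delta_size;
        else if (st)
                /* large blob case, just assume we don't compress well */
                datalen = size;
        else
                datalen = do_compress(&buf, size);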


* Re: [PATCH v2] pack-objects: use streaming interface for reading large loose blobs
  2012-05-13  4:37   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2012-05-14 15:56     ` Junio C Hamano
@ 2012-05-14 19:43     ` Junio C Hamano
  2012-05-15 11:18       ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 16+ messages in thread
From: Junio C Hamano @ 2012-05-14 19:43 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Nicolas Pitre

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> git usually streams large blobs directly to packs. But there are cases
> diff --git a/t/t1050-large.sh b/t/t1050-large.sh
> index 55ed955..7fbd2e1 100755
> --- a/t/t1050-large.sh
> +++ b/t/t1050-large.sh
> @@ -134,6 +134,22 @@ test_expect_success 'repack' '
>  	git repack -ad
>  '
>  
> +test_expect_success 'pack-objects with large loose object' '
> +	echo Z | dd of=large4 bs=1k seek=2000 &&
> +	OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
> +	P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&

I do not think you need these hardcoded constants; you will run
hash-object later, no?

Also, relying on $P to exist after hash-object -w returns is somewhat
flaky, no?

> +	(
> +	unset GIT_ALLOC_LIMIT &&

sane_unset?

> +	cat large4 | git hash-object -w --stdin &&
> +	git cat-file blob $OBJ >actual &&
> +	cmp large4 actual
> +	) &&
> +	echo $OBJ | git pack-objects .git/objects/pack/pack &&

If you do not write directly into this directory, but instead
create another _empty_ repository and deposit the packfile and its
associated .idx file there, then you do not have to care how the earlier
hash-object wrote its object.  You are interested only in pack-objects
producing a usable pack, and testing it like so will be the most direct
way to test it, no?

> +	rm $P &&
> +	git cat-file blob $OBJ >actual &&
> +	cmp large4 actual
> +'

In any case, the patch when applied on top of cd07cc5 (Update draft
release notes to 1.7.11 (11th batch), 2012-05-11) does not pass this part
of the test on my box.

>  test_expect_success 'tar achiving' '
>  	git archive --format=tar HEAD >/dev/null
>  '


* Re: [PATCH v2] pack-objects: use streaming interface for reading large loose blobs
  2012-05-14 19:43     ` Junio C Hamano
@ 2012-05-15 11:18       ` Nguyen Thai Ngoc Duy
  2012-05-15 15:27         ` Junio C Hamano
  0 siblings, 1 reply; 16+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-05-15 11:18 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

On Tue, May 15, 2012 at 2:43 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> git usually streams large blobs directly to packs. But there are cases
>> diff --git a/t/t1050-large.sh b/t/t1050-large.sh
>> index 55ed955..7fbd2e1 100755
>> --- a/t/t1050-large.sh
>> +++ b/t/t1050-large.sh
>> @@ -134,6 +134,22 @@ test_expect_success 'repack' '
>>       git repack -ad
>>  '
>>
>> +test_expect_success 'pack-objects with large loose object' '
>> +     echo Z | dd of=large4 bs=1k seek=2000 &&
>> +     OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
>> +     P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
>
> I do not think you need these hardcoded constants; you will run
> hash-object later, no?
>
> Also, relying on $P to exist after hash-object -w returns is somewhat
> flaky, no?

I need it to be a loose object to test this code path. Maybe this instead?

test_expect_success 'pack-objects with large loose object' '
	SHA1=`git hash-object huge` &&
	test_create_repo loose &&
	echo $SHA1 | git pack-objects --stdout |
		GIT_ALLOC_LIMIT=0 GIT_DIR=loose/.git git unpack-objects &&
	echo $SHA1 | GIT_DIR=loose/.git git pack-objects pack  &&
	test_create_repo packed &&
	mv pack-* packed/.git/objects/pack &&
	GIT_DIR=packed/.git git cat-file blob $SHA1 >actual &&
	cmp huge actual
'

>> +     rm $P &&
>> +     git cat-file blob $OBJ >actual &&
>> +     cmp large4 actual
>> +'
>
> In any case, the patch when applied on top of cd07cc5 (Update draft
> release notes to 1.7.11 (11th batch), 2012-05-11) does not pass this part
> of the test on my box.

Interesting. It passes for me (same base). I assume rm failed?
-- 
Duy


* Re: [PATCH v2] pack-objects: use streaming interface for reading large loose blobs
  2012-05-15 11:18       ` Nguyen Thai Ngoc Duy
@ 2012-05-15 15:27         ` Junio C Hamano
  2012-05-16  7:09           ` Nguyen Thai Ngoc Duy
  0 siblings, 1 reply; 16+ messages in thread
From: Junio C Hamano @ 2012-05-15 15:27 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git, Nicolas Pitre

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> On Tue, May 15, 2012 at 2:43 AM, Junio C Hamano <gitster@pobox.com> wrote:
>> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>>
>>> git usually streams large blobs directly to packs. But there are cases
>>> diff --git a/t/t1050-large.sh b/t/t1050-large.sh
>>> index 55ed955..7fbd2e1 100755
>>> --- a/t/t1050-large.sh
>>> +++ b/t/t1050-large.sh
>>> @@ -134,6 +134,22 @@ test_expect_success 'repack' '
>>>       git repack -ad
>>>  '
>>>
>>> +test_expect_success 'pack-objects with large loose object' '
>>> +     echo Z | dd of=large4 bs=1k seek=2000 &&
>>> +     OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
>>> +     P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
>>
>> I do not think you need these hardcoded constants; you will run
>> hash-object later, no?
>>
>> Also, relying on $P to exist after hash-object -w returns is somewhat
>> flaky, no?
>
> I need it to be a loose object to test this code path.

No, you don't.  You only need it to be something read_istream() will read
from; in other words, it could come from a base representation in a packfile.

In short, what I was hinting at was to do something like this instead:

	create a large file "large4"
        BLOB=$(git hash-object -w "large4")
        PACK=$(echo "$BLOB" | git pack-objects pack)
        create a new empty repository "check"
        move pack-$PACK.{pack,idx} to "check" repository
        in "check" repository, try to see if you can read $BLOB correctly

>> In any case, the patch when applied on top of cd07cc5 (Update draft
>> release notes to 1.7.11 (11th batch), 2012-05-11) does not pass this part
>> of the test on my box.
>
> Interesting. It passes for me (same base). I assume rm failed?

No, reading the resulting pack dies with an error message that says the
object could not be read at offset 12, implying that the pack writer wrote
something bogus.


* Re: [PATCH v2] pack-objects: use streaming interface for reading large loose blobs
  2012-05-15 15:27         ` Junio C Hamano
@ 2012-05-16  7:09           ` Nguyen Thai Ngoc Duy
  0 siblings, 0 replies; 16+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-05-16  7:09 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

On Tue, May 15, 2012 at 10:27 PM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:
>
>> On Tue, May 15, 2012 at 2:43 AM, Junio C Hamano <gitster@pobox.com> wrote:
>>> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>>>
>>>> git usually streams large blobs directly to packs. But there are cases
>>>> diff --git a/t/t1050-large.sh b/t/t1050-large.sh
>>>> index 55ed955..7fbd2e1 100755
>>>> --- a/t/t1050-large.sh
>>>> +++ b/t/t1050-large.sh
>>>> @@ -134,6 +134,22 @@ test_expect_success 'repack' '
>>>>       git repack -ad
>>>>  '
>>>>
>>>> +test_expect_success 'pack-objects with large loose object' '
>>>> +     echo Z | dd of=large4 bs=1k seek=2000 &&
>>>> +     OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
>>>> +     P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
>>>
>>> I do not think you need these hardcoded constants; you will run
>>> hash-object later, no?
>>>
>>> Also, relying on $P to exist after hash-object -w returns is somewhat
>>> flaky, no?
>>
>> I need it to be a loose object to test this code path.
>
> No you don't.  You only need it to be something istream_read() will read
> from, iow, it could come from a base representation in a packfile.

No, an in-pack object will set to_reuse to 1, which takes a completely
different code path. write_large_blob_data() is only called when
to_reuse == 0.
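
(For context, the to_reuse decision near the top of write_object(),
condensed from the patches in this thread, looks like this;
write_large_blob_data() is only reached on the !to_reuse branch:)

        if (!entry->in_pack)
                to_reuse = 0;   /* can't reuse what we don't have */
        else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
                to_reuse = usable_delta;        /* check_object() decided it for us */
        else if (entry->type != entry->in_pack_type)
                to_reuse = 0;   /* pack has delta which is unusable */
        else if (entry->delta)
                to_reuse = 0;   /* we want to pack afresh */
        else
                to_reuse = 1;   /* in-pack and undeltified; reuse as is */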

>>> In any case, the patch when applied on top of cd07cc5 (Update draft
>>> release notes to 1.7.11 (11th batch), 2012-05-11) does not pass this part
>>> of the test on my box.
>>
>> Interesting. It passes for me (same base). I assume rm failed?
>
> No, reading the resulting pack dies with an error message that says the
> object could not be read at offset 12, implying that the pack writer wrote
> something bogus.

I'm still unable to reproduce that. But I think I found the problem.
In the streaming code path, I set datalen = <uncompressed size> but
write_object() returns "hdrlen + (wrong) datalen". Patches will come a
couple of hours from now.
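
(The fix, as it appears in the v2 4/4 reroll later in the thread: the
streaming writer keeps a running total of the deflated bytes it emits and
returns it, and the caller uses that as datalen instead of the uncompressed
size. Condensed:)

        /* inside write_large_blob_data() */
        zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
        sha1write(f, obuf, stream.next_out - obuf);
        olen += stream.next_out - obuf; /* compressed bytes actually written */
        ...
        return olen;

        /* at the call site */
        if (st)
                datalen = write_large_blob_data(st, f, entry->idx.sha1);
        else
                sha1write(f, buf, datalen);
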
-- 
Duy


* [PATCH v2 1/4] streaming: allow to call close_istream(NULL);
  2012-05-12 10:26 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
  2012-05-12 16:51 ` Nicolas Pitre
@ 2012-05-16 12:02 ` Nguyễn Thái Ngọc Duy
  2012-05-16 12:02   ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
                     ` (3 more replies)
  1 sibling, 4 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-16 12:02 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre,
	Nguyễn Thái Ngọc Duy

This makes it more convenient in cleanup code, like free(NULL).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 streaming.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/streaming.c b/streaming.c
index 3a3cd12..38b39cd 100644
--- a/streaming.c
+++ b/streaming.c
@@ -94,7 +94,7 @@ struct git_istream {
 
 int close_istream(struct git_istream *st)
 {
-	int r = st->vtbl->close(st);
+	int r = st ? st->vtbl->close(st) : 0;
 	free(st);
 	return r;
 }
-- 
1.7.8.36.g69ee2


* [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .."
  2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
@ 2012-05-16 12:02   ` Nguyễn Thái Ngọc Duy
  2012-05-18 21:05     ` Junio C Hamano
  2012-05-16 12:02   ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-16 12:02 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre,
	Nguyễn Thái Ngọc Duy

This is because all other places do "xx > big_file_threshold"

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Without two comparisons I added recently in archive-*.c, it's still
 inconsistent (two xx >= big_file and one xx > big_file).  I have no
 preference between > and >=. If some of you vote >=, I'll update the
 patch.

 builtin/pack-objects.c |    2 +-
 streaming.c            |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..b2e0940 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1327,7 +1327,7 @@ static void get_object_details(void)
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (big_file_threshold <= entry->size)
+		if (big_file_threshold < entry->size)
 			entry->no_try_delta = 1;
 	}
 
diff --git a/streaming.c b/streaming.c
index 38b39cd..829ce7d 100644
--- a/streaming.c
+++ b/streaming.c
@@ -121,7 +121,7 @@ static enum input_source istream_source(const unsigned char *sha1,
 	case OI_LOOSE:
 		return loose;
 	case OI_PACKED:
-		if (!oi->u.packed.is_delta && big_file_threshold <= size)
+		if (!oi->u.packed.is_delta && big_file_threshold < size)
 			return pack_non_delta;
 		/* fallthru */
 	default:
-- 
1.7.8.36.g69ee2


* [PATCH v2 3/4] pack-objects: refactor write_object()
  2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
  2012-05-16 12:02   ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
@ 2012-05-16 12:02   ` Nguyễn Thái Ngọc Duy
  2012-05-18 21:16     ` Junio C Hamano
  2012-05-19  2:43     ` Nicolas Pitre
  2012-05-16 12:02   ` [PATCH v2 4/4] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
  2012-05-18 21:02   ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Junio C Hamano
  3 siblings, 2 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-16 12:02 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre,
	Nguyễn Thái Ngọc Duy

Move !to_reuse and to_reuse write code out into two separate functions
and remove "goto no_reuse;" hack

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  322 ++++++++++++++++++++++++++----------------------
 1 files changed, 172 insertions(+), 150 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b2e0940..ccfcbad 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -200,22 +200,178 @@ static void copy_pack_data(struct sha1file *f,
 }
 
 /* Return 0 if we will bust the pack-size limit */
-static unsigned long write_object(struct sha1file *f,
-				  struct object_entry *entry,
-				  off_t write_offset)
+static unsigned long write_no_reuse_object(struct sha1file *f, struct object_entry *entry,
+					   unsigned long limit, int usable_delta)
 {
-	unsigned long size, limit, datalen;
-	void *buf;
+	unsigned long size, datalen;
 	unsigned char header[10], dheader[10];
 	unsigned hdrlen;
 	enum object_type type;
+	void *buf;
+
+	if (!usable_delta) {
+		buf = read_sha1_file(entry->idx.sha1, &type, &size);
+		if (!buf)
+			die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+		/*
+		 * make sure no cached delta data remains from a
+		 * previous attempt before a pack split occurred.
+		 */
+		free(entry->delta_data);
+		entry->delta_data = NULL;
+		entry->z_delta_size = 0;
+	} else if (entry->delta_data) {
+		size = entry->delta_size;
+		buf = entry->delta_data;
+		entry->delta_data = NULL;
+		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+			OBJ_OFS_DELTA : OBJ_REF_DELTA;
+	} else {
+		buf = get_delta(entry);
+		size = entry->delta_size;
+		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+			OBJ_OFS_DELTA : OBJ_REF_DELTA;
+	}
+
+	if (entry->z_delta_size)
+		datalen = entry->z_delta_size;
+	else
+		datalen = do_compress(&buf, size);
+
+	/*
+	 * The object header is a byte of 'type' followed by zero or
+	 * more bytes of length.
+	 */
+	hdrlen = encode_in_pack_object_header(type, size, header);
+
+	if (type == OBJ_OFS_DELTA) {
+		/*
+		 * Deltas with relative base contain an additional
+		 * encoding of the relative offset for the delta
+		 * base from this object's position in the pack.
+		 */
+		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		unsigned pos = sizeof(dheader) - 1;
+		dheader[pos] = ofs & 127;
+		while (ofs >>= 7)
+			dheader[--pos] = 128 | (--ofs & 127);
+		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
+			free(buf);
+			return 0;
+		}
+		sha1write(f, header, hdrlen);
+		sha1write(f, dheader + pos, sizeof(dheader) - pos);
+		hdrlen += sizeof(dheader) - pos;
+	} else if (type == OBJ_REF_DELTA) {
+		/*
+		 * Deltas with a base reference contain
+		 * an additional 20 bytes for the base sha1.
+		 */
+		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
+			free(buf);
+			return 0;
+		}
+		sha1write(f, header, hdrlen);
+		sha1write(f, entry->delta->idx.sha1, 20);
+		hdrlen += 20;
+	} else {
+		if (limit && hdrlen + datalen + 20 >= limit) {
+			free(buf);
+			return 0;
+		}
+		sha1write(f, header, hdrlen);
+	}
+	sha1write(f, buf, datalen);
+	free(buf);
+
+	return hdrlen + datalen;
+}
+
+/* Return 0 if we will bust the pack-size limit */
+static unsigned long write_reuse_object(struct sha1file *f, struct object_entry *entry,
+					unsigned long limit, int usable_delta)
+{
+	struct packed_git *p = entry->in_pack;
+	struct pack_window *w_curs = NULL;
+	struct revindex_entry *revidx;
+	off_t offset;
+	enum object_type type = entry->type;
+	unsigned long datalen;
+	unsigned char header[10], dheader[10];
+	unsigned hdrlen;
+
+	if (entry->delta)
+		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+			OBJ_OFS_DELTA : OBJ_REF_DELTA;
+	hdrlen = encode_in_pack_object_header(type, entry->size, header);
+
+	offset = entry->in_pack_offset;
+	revidx = find_pack_revindex(p, offset);
+	datalen = revidx[1].offset - offset;
+	if (!pack_to_stdout && p->index_version > 1 &&
+	    check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) {
+		error("bad packed object CRC for %s", sha1_to_hex(entry->idx.sha1));
+		unuse_pack(&w_curs);
+		return write_no_reuse_object(f, entry, limit, usable_delta);
+	}
+
+	offset += entry->in_pack_header_size;
+	datalen -= entry->in_pack_header_size;
+
+	if (!pack_to_stdout && p->index_version == 1 &&
+	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
+		error("corrupt packed object for %s", sha1_to_hex(entry->idx.sha1));
+		unuse_pack(&w_curs);
+		return write_no_reuse_object(f, entry, limit, usable_delta);
+	}
+
+	if (type == OBJ_OFS_DELTA) {
+		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		unsigned pos = sizeof(dheader) - 1;
+		dheader[pos] = ofs & 127;
+		while (ofs >>= 7)
+			dheader[--pos] = 128 | (--ofs & 127);
+		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
+			unuse_pack(&w_curs);
+			return 0;
+		}
+		sha1write(f, header, hdrlen);
+		sha1write(f, dheader + pos, sizeof(dheader) - pos);
+		hdrlen += sizeof(dheader) - pos;
+		reused_delta++;
+	} else if (type == OBJ_REF_DELTA) {
+		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
+			unuse_pack(&w_curs);
+			return 0;
+		}
+		sha1write(f, header, hdrlen);
+		sha1write(f, entry->delta->idx.sha1, 20);
+		hdrlen += 20;
+		reused_delta++;
+	} else {
+		if (limit && hdrlen + datalen + 20 >= limit) {
+			unuse_pack(&w_curs);
+			return 0;
+		}
+		sha1write(f, header, hdrlen);
+	}
+	copy_pack_data(f, p, &w_curs, offset, datalen);
+	unuse_pack(&w_curs);
+	reused++;
+	return hdrlen + datalen;
+}
+
+/* Return 0 if we will bust the pack-size limit */
+static unsigned long write_object(struct sha1file *f,
+				  struct object_entry *entry,
+				  off_t write_offset)
+{
+	unsigned long limit, len;
 	int usable_delta, to_reuse;
 
 	if (!pack_to_stdout)
 		crc32_begin(f);
 
-	type = entry->type;
-
 	/* apply size limit if limited packsize and not first object */
 	if (!pack_size_limit || !nr_written)
 		limit = 0;
@@ -243,11 +399,11 @@ static unsigned long write_object(struct sha1file *f,
 		to_reuse = 0;	/* explicit */
 	else if (!entry->in_pack)
 		to_reuse = 0;	/* can't reuse what we don't have */
-	else if (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA)
+	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
 		to_reuse = usable_delta;
 				/* ... but pack split may override that */
-	else if (type != entry->in_pack_type)
+	else if (entry->type != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
 	else if (entry->delta)
 		to_reuse = 0;	/* we want to pack afresh */
@@ -256,153 +412,19 @@ static unsigned long write_object(struct sha1file *f,
 				 * and we do not need to deltify it.
 				 */
 
-	if (!to_reuse) {
-		no_reuse:
-		if (!usable_delta) {
-			buf = read_sha1_file(entry->idx.sha1, &type, &size);
-			if (!buf)
-				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
-			/*
-			 * make sure no cached delta data remains from a
-			 * previous attempt before a pack split occurred.
-			 */
-			free(entry->delta_data);
-			entry->delta_data = NULL;
-			entry->z_delta_size = 0;
-		} else if (entry->delta_data) {
-			size = entry->delta_size;
-			buf = entry->delta_data;
-			entry->delta_data = NULL;
-			type = (allow_ofs_delta && entry->delta->idx.offset) ?
-				OBJ_OFS_DELTA : OBJ_REF_DELTA;
-		} else {
-			buf = get_delta(entry);
-			size = entry->delta_size;
-			type = (allow_ofs_delta && entry->delta->idx.offset) ?
-				OBJ_OFS_DELTA : OBJ_REF_DELTA;
-		}
-
-		if (entry->z_delta_size)
-			datalen = entry->z_delta_size;
-		else
-			datalen = do_compress(&buf, size);
-
-		/*
-		 * The object header is a byte of 'type' followed by zero or
-		 * more bytes of length.
-		 */
-		hdrlen = encode_in_pack_object_header(type, size, header);
-
-		if (type == OBJ_OFS_DELTA) {
-			/*
-			 * Deltas with relative base contain an additional
-			 * encoding of the relative offset for the delta
-			 * base from this object's position in the pack.
-			 */
-			off_t ofs = entry->idx.offset - entry->delta->idx.offset;
-			unsigned pos = sizeof(dheader) - 1;
-			dheader[pos] = ofs & 127;
-			while (ofs >>= 7)
-				dheader[--pos] = 128 | (--ofs & 127);
-			if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
-				free(buf);
-				return 0;
-			}
-			sha1write(f, header, hdrlen);
-			sha1write(f, dheader + pos, sizeof(dheader) - pos);
-			hdrlen += sizeof(dheader) - pos;
-		} else if (type == OBJ_REF_DELTA) {
-			/*
-			 * Deltas with a base reference contain
-			 * an additional 20 bytes for the base sha1.
-			 */
-			if (limit && hdrlen + 20 + datalen + 20 >= limit) {
-				free(buf);
-				return 0;
-			}
-			sha1write(f, header, hdrlen);
-			sha1write(f, entry->delta->idx.sha1, 20);
-			hdrlen += 20;
-		} else {
-			if (limit && hdrlen + datalen + 20 >= limit) {
-				free(buf);
-				return 0;
-			}
-			sha1write(f, header, hdrlen);
-		}
-		sha1write(f, buf, datalen);
-		free(buf);
-	}
-	else {
-		struct packed_git *p = entry->in_pack;
-		struct pack_window *w_curs = NULL;
-		struct revindex_entry *revidx;
-		off_t offset;
-
-		if (entry->delta)
-			type = (allow_ofs_delta && entry->delta->idx.offset) ?
-				OBJ_OFS_DELTA : OBJ_REF_DELTA;
-		hdrlen = encode_in_pack_object_header(type, entry->size, header);
-
-		offset = entry->in_pack_offset;
-		revidx = find_pack_revindex(p, offset);
-		datalen = revidx[1].offset - offset;
-		if (!pack_to_stdout && p->index_version > 1 &&
-		    check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) {
-			error("bad packed object CRC for %s", sha1_to_hex(entry->idx.sha1));
-			unuse_pack(&w_curs);
-			goto no_reuse;
-		}
-
-		offset += entry->in_pack_header_size;
-		datalen -= entry->in_pack_header_size;
-		if (!pack_to_stdout && p->index_version == 1 &&
-		    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
-			error("corrupt packed object for %s", sha1_to_hex(entry->idx.sha1));
-			unuse_pack(&w_curs);
-			goto no_reuse;
-		}
+	if (!to_reuse)
+		len = write_no_reuse_object(f, entry, limit, usable_delta);
+	else
+		len = write_reuse_object(f, entry, limit, usable_delta);
+	if (!len)
+		return 0;
 
-		if (type == OBJ_OFS_DELTA) {
-			off_t ofs = entry->idx.offset - entry->delta->idx.offset;
-			unsigned pos = sizeof(dheader) - 1;
-			dheader[pos] = ofs & 127;
-			while (ofs >>= 7)
-				dheader[--pos] = 128 | (--ofs & 127);
-			if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
-				unuse_pack(&w_curs);
-				return 0;
-			}
-			sha1write(f, header, hdrlen);
-			sha1write(f, dheader + pos, sizeof(dheader) - pos);
-			hdrlen += sizeof(dheader) - pos;
-			reused_delta++;
-		} else if (type == OBJ_REF_DELTA) {
-			if (limit && hdrlen + 20 + datalen + 20 >= limit) {
-				unuse_pack(&w_curs);
-				return 0;
-			}
-			sha1write(f, header, hdrlen);
-			sha1write(f, entry->delta->idx.sha1, 20);
-			hdrlen += 20;
-			reused_delta++;
-		} else {
-			if (limit && hdrlen + datalen + 20 >= limit) {
-				unuse_pack(&w_curs);
-				return 0;
-			}
-			sha1write(f, header, hdrlen);
-		}
-		copy_pack_data(f, p, &w_curs, offset, datalen);
-		unuse_pack(&w_curs);
-		reused++;
-	}
 	if (usable_delta)
 		written_delta++;
 	written++;
 	if (!pack_to_stdout)
 		entry->idx.crc32 = crc32_end(f);
-	return hdrlen + datalen;
+	return len;
 }
 
 enum write_one_status {
-- 
1.7.8.36.g69ee2


* [PATCH v2 4/4] pack-objects: use streaming interface for reading large loose blobs
  2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
  2012-05-16 12:02   ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
  2012-05-16 12:02   ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
@ 2012-05-16 12:02   ` Nguyễn Thái Ngọc Duy
  2012-05-18 21:02   ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Junio C Hamano
  3 siblings, 0 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-16 12:02 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre,
	Nguyễn Thái Ngọc Duy

git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects or hash-object
over a pipe). Or they can come from other git implementations.
core.bigfilethreshold can also be lowered, introducing a new
wave of large loose blobs.

Use streaming interface to read/compress/write these blobs in one go.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Changes since the first version:

  - use "struct git_istream *" as streaming indicator, instead of "buf
    != NULL"
  - fall back to read_sha1_file() if open_istream() fails
  - write_object() returns the correct number of written bytes (previously
    datalen was miscalculated, leading to a wrong pack size calculation)

 builtin/pack-objects.c |   63 ++++++++++++++++++++++++++++++++++++++++++++---
 t/t1050-large.sh       |   12 +++++++++
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ccfcbad..b2679cb 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,46 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static unsigned long write_large_blob_data(struct git_istream *st, struct sha1file *f,
+					   const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	unsigned long olen = 0;
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	for (;;) {
+		ssize_t readlen;
+		int zret = Z_OK;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("unable to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+			olen += stream.next_out - obuf;
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	git_deflate_end(&stream);
+	return olen;
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -208,11 +249,18 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	unsigned hdrlen;
 	enum object_type type;
 	void *buf;
+	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		buf = read_sha1_file(entry->idx.sha1, &type, &size);
-		if (!buf)
-			die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+		if (entry->type == OBJ_BLOB &&
+		    entry->size > big_file_threshold &&
+		    (st = open_istream(entry->idx.sha1, &type, &size, NULL)) != NULL)
+			buf = NULL;
+		else {
+			buf = read_sha1_file(entry->idx.sha1, &type, &size);
+			if (!buf)
+				die(_("unable to read %s"), sha1_to_hex(entry->idx.sha1));
+		}
 		/*
 		 * make sure no cached delta data remains from a
 		 * previous attempt before a pack split occurred.
@@ -235,6 +283,9 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 
 	if (entry->z_delta_size)
 		datalen = entry->z_delta_size;
+	else if (st)
+		/* large blob case, just assume we don't compress well */
+		datalen = size;
 	else
 		datalen = do_compress(&buf, size);
 
@@ -281,7 +332,11 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		}
 		sha1write(f, header, hdrlen);
 	}
-	sha1write(f, buf, datalen);
+	if (st)
+		datalen = write_large_blob_data(st, f, entry->idx.sha1);
+	else
+		sha1write(f, buf, datalen);
+	close_istream(st);
 	free(buf);
 
 	return hdrlen + datalen;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..313889b 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,18 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	SHA1=`git hash-object huge` &&
+	test_create_repo loose &&
+	echo $SHA1 | git pack-objects --stdout |
+		GIT_ALLOC_LIMIT=0 GIT_DIR=loose/.git git unpack-objects &&
+	echo $SHA1 | GIT_DIR=loose/.git git pack-objects pack &&
+	test_create_repo packed &&
+	mv pack-* packed/.git/objects/pack &&
+	GIT_DIR=packed/.git git cat-file blob $SHA1 >actual &&
+	cmp huge actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.8.36.g69ee2


* Re: [PATCH v2 1/4] streaming: allow to call close_istream(NULL);
  2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
                     ` (2 preceding siblings ...)
  2012-05-16 12:02   ` [PATCH v2 4/4] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
@ 2012-05-18 21:02   ` Junio C Hamano
  3 siblings, 0 replies; 16+ messages in thread
From: Junio C Hamano @ 2012-05-18 21:02 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Nicolas Pitre

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> This makes it more convenient in cleanup code, like free(NULL).

If the justification were "like fclose(NULL)", it would have made more sense,
but fclose(NULL) does not silently turn itself into a successful no-op.
You are expected to check the value returned from the matching function
(i.e. fopen()) and not call it on failure.  So taking this patch alone, I
smell bad design taste.
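
(For context, this is the cleanup path in 4/4 that relies on the new
behaviour; st and buf are mutually exclusive, so both are released
unconditionally at the end of write_no_reuse_object():)

        close_istream(st);      /* no-op when st is NULL (non-streaming case) */
        free(buf);              /* buf is NULL in the streaming case */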

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  streaming.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/streaming.c b/streaming.c
> index 3a3cd12..38b39cd 100644
> --- a/streaming.c
> +++ b/streaming.c
> @@ -94,7 +94,7 @@ struct git_istream {
>  
>  int close_istream(struct git_istream *st)
>  {
> -	int r = st->vtbl->close(st);
> +	int r = st ? st->vtbl->close(st) : 0;
>  	free(st);
>  	return r;
>  }


* Re: [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .."
  2012-05-16 12:02   ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
@ 2012-05-18 21:05     ` Junio C Hamano
  0 siblings, 0 replies; 16+ messages in thread
From: Junio C Hamano @ 2012-05-18 21:05 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Nicolas Pitre

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> This is because all other places do "xx > big_file_threshold"
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  Without two comparisons I added recently in archive-*.c, it's still
>  inconsistent (two xx >= big_file and one xx > big_file).  I have no
>  preference between > and >=. If some of you vote >=, I'll update the
>  patch.

Thanks; I do not have a huge preference between the two, but "I want
special treatment for anything bigger than 2MiB" is probably more
natural than "I want special treatment for things that are exactly 2MiB
and also things bigger than that threshold".

>  builtin/pack-objects.c |    2 +-
>  streaming.c            |    2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index 1861093..b2e0940 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -1327,7 +1327,7 @@ static void get_object_details(void)
>  	for (i = 0; i < nr_objects; i++) {
>  		struct object_entry *entry = sorted_by_offset[i];
>  		check_object(entry);
> -		if (big_file_threshold <= entry->size)
> +		if (big_file_threshold < entry->size)
>  			entry->no_try_delta = 1;
>  	}
>  
> diff --git a/streaming.c b/streaming.c
> index 38b39cd..829ce7d 100644
> --- a/streaming.c
> +++ b/streaming.c
> @@ -121,7 +121,7 @@ static enum input_source istream_source(const unsigned char *sha1,
>  	case OI_LOOSE:
>  		return loose;
>  	case OI_PACKED:
> -		if (!oi->u.packed.is_delta && big_file_threshold <= size)
> +		if (!oi->u.packed.is_delta && big_file_threshold < size)
>  			return pack_non_delta;
>  		/* fallthru */
>  	default:


* Re: [PATCH v2 3/4] pack-objects: refactor write_object()
  2012-05-16 12:02   ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
@ 2012-05-18 21:16     ` Junio C Hamano
  2012-05-19  2:43     ` Nicolas Pitre
  1 sibling, 0 replies; 16+ messages in thread
From: Junio C Hamano @ 2012-05-18 21:16 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Nicolas Pitre

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Move !to_reuse and to_reuse write code out into two separate functions
> and remove "goto no_reuse;" hack

I do not necessarily think goto is a harmful hack, but the original code
is sufficiently large and its flow clearly ought to be separable into
two phases (phase I: decide if we want to copy from existing pack or
build data to be sent to the pack afresh; phase II: carry out the
decision, doing one of the two things we decided to do), so I think it
makes tons of sense to separate out the second phase into two helper
functions.
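
(After the split, the tail of write_object() indeed reduces to this
dispatch, quoting from the patch under review:)

        if (!to_reuse)
                len = write_no_reuse_object(f, entry, limit, usable_delta);
        else
                len = write_reuse_object(f, entry, limit, usable_delta);
        if (!len)
                return 0;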

Looks good from a cursory reading.

Thanks.

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  builtin/pack-objects.c |  322 ++++++++++++++++++++++++++----------------------
>  1 files changed, 172 insertions(+), 150 deletions(-)
>
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index b2e0940..ccfcbad 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -200,22 +200,178 @@ static void copy_pack_data(struct sha1file *f,
>  }
>  
>  /* Return 0 if we will bust the pack-size limit */
> +static unsigned long write_no_reuse_object(struct sha1file *f, struct object_entry *entry,
> +					   unsigned long limit, int usable_delta)
>  {
> +	unsigned long size, datalen;
>  	unsigned char header[10], dheader[10];
>  	unsigned hdrlen;
>  	enum object_type type;
> +	void *buf;
> +
> +	if (!usable_delta) {
> +		buf = read_sha1_file(entry->idx.sha1, &type, &size);
> +		if (!buf)
> +			die("unable to read %s", sha1_to_hex(entry->idx.sha1));
> +		/*
> +		 * make sure no cached delta data remains from a
> +		 * previous attempt before a pack split occurred.
> +		 */
> +		free(entry->delta_data);
> +		entry->delta_data = NULL;
> +		entry->z_delta_size = 0;
> +	} else if (entry->delta_data) {
> +		size = entry->delta_size;
> +		buf = entry->delta_data;
> +		entry->delta_data = NULL;
> +		type = (allow_ofs_delta && entry->delta->idx.offset) ?
> +			OBJ_OFS_DELTA : OBJ_REF_DELTA;
> +	} else {
> +		buf = get_delta(entry);
> +		size = entry->delta_size;
> +		type = (allow_ofs_delta && entry->delta->idx.offset) ?
> +			OBJ_OFS_DELTA : OBJ_REF_DELTA;
> +	}
> +
> +	if (entry->z_delta_size)
> +		datalen = entry->z_delta_size;
> +	else
> +		datalen = do_compress(&buf, size);
> +
> +	/*
> +	 * The object header is a byte of 'type' followed by zero or
> +	 * more bytes of length.
> +	 */
> +	hdrlen = encode_in_pack_object_header(type, size, header);
> +
> +	if (type == OBJ_OFS_DELTA) {
> +		/*
> +		 * Deltas with relative base contain an additional
> +		 * encoding of the relative offset for the delta
> +		 * base from this object's position in the pack.
> +		 */
> +		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
> +		unsigned pos = sizeof(dheader) - 1;
> +		dheader[pos] = ofs & 127;
> +		while (ofs >>= 7)
> +			dheader[--pos] = 128 | (--ofs & 127);
> +		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
> +			free(buf);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +		sha1write(f, dheader + pos, sizeof(dheader) - pos);
> +		hdrlen += sizeof(dheader) - pos;
> +	} else if (type == OBJ_REF_DELTA) {
> +		/*
> +		 * Deltas with a base reference contain
> +		 * an additional 20 bytes for the base sha1.
> +		 */
> +		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
> +			free(buf);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +		sha1write(f, entry->delta->idx.sha1, 20);
> +		hdrlen += 20;
> +	} else {
> +		if (limit && hdrlen + datalen + 20 >= limit) {
> +			free(buf);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +	}
> +	sha1write(f, buf, datalen);
> +	free(buf);
> +
> +	return hdrlen + datalen;
> +}
> +
> +/* Return 0 if we will bust the pack-size limit */
> +static unsigned long write_reuse_object(struct sha1file *f, struct object_entry *entry,
> +					unsigned long limit, int usable_delta)
> +{
> +	struct packed_git *p = entry->in_pack;
> +	struct pack_window *w_curs = NULL;
> +	struct revindex_entry *revidx;
> +	off_t offset;
> +	enum object_type type = entry->type;
> +	unsigned long datalen;
> +	unsigned char header[10], dheader[10];
> +	unsigned hdrlen;
> +
> +	if (entry->delta)
> +		type = (allow_ofs_delta && entry->delta->idx.offset) ?
> +			OBJ_OFS_DELTA : OBJ_REF_DELTA;
> +	hdrlen = encode_in_pack_object_header(type, entry->size, header);
> +
> +	offset = entry->in_pack_offset;
> +	revidx = find_pack_revindex(p, offset);
> +	datalen = revidx[1].offset - offset;
> +	if (!pack_to_stdout && p->index_version > 1 &&
> +	    check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) {
> +		error("bad packed object CRC for %s", sha1_to_hex(entry->idx.sha1));
> +		unuse_pack(&w_curs);
> +		return write_no_reuse_object(f, entry, limit, usable_delta);
> +	}
> +
> +	offset += entry->in_pack_header_size;
> +	datalen -= entry->in_pack_header_size;
> +
> +	if (!pack_to_stdout && p->index_version == 1 &&
> +	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
> +		error("corrupt packed object for %s", sha1_to_hex(entry->idx.sha1));
> +		unuse_pack(&w_curs);
> +		return write_no_reuse_object(f, entry, limit, usable_delta);
> +	}
> +
> +	if (type == OBJ_OFS_DELTA) {
> +		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
> +		unsigned pos = sizeof(dheader) - 1;
> +		dheader[pos] = ofs & 127;
> +		while (ofs >>= 7)
> +			dheader[--pos] = 128 | (--ofs & 127);
> +		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
> +			unuse_pack(&w_curs);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +		sha1write(f, dheader + pos, sizeof(dheader) - pos);
> +		hdrlen += sizeof(dheader) - pos;
> +		reused_delta++;
> +	} else if (type == OBJ_REF_DELTA) {
> +		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
> +			unuse_pack(&w_curs);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +		sha1write(f, entry->delta->idx.sha1, 20);
> +		hdrlen += 20;
> +		reused_delta++;
> +	} else {
> +		if (limit && hdrlen + datalen + 20 >= limit) {
> +			unuse_pack(&w_curs);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +	}
> +	copy_pack_data(f, p, &w_curs, offset, datalen);
> +	unuse_pack(&w_curs);
> +	reused++;
> +	return hdrlen + datalen;
> +}
> +
> +/* Return 0 if we will bust the pack-size limit */
> +static unsigned long write_object(struct sha1file *f,
> +				  struct object_entry *entry,
> +				  off_t write_offset)
> +{
> +	unsigned long limit, len;
>  	int usable_delta, to_reuse;
>  
>  	if (!pack_to_stdout)
>  		crc32_begin(f);
>  
>  	/* apply size limit if limited packsize and not first object */
>  	if (!pack_size_limit || !nr_written)
>  		limit = 0;
> @@ -243,11 +399,11 @@ static unsigned long write_object(struct sha1file *f,
>  		to_reuse = 0;	/* explicit */
>  	else if (!entry->in_pack)
>  		to_reuse = 0;	/* can't reuse what we don't have */
> +	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
>  				/* check_object() decided it for us ... */
>  		to_reuse = usable_delta;
>  				/* ... but pack split may override that */
> +	else if (entry->type != entry->in_pack_type)
>  		to_reuse = 0;	/* pack has delta which is unusable */
>  	else if (entry->delta)
>  		to_reuse = 0;	/* we want to pack afresh */
> @@ -256,153 +412,19 @@ static unsigned long write_object(struct sha1file *f,
>  				 * and we do not need to deltify it.
>  				 */
>  
> +	if (!to_reuse)
> +		len = write_no_reuse_object(f, entry, limit, usable_delta);
> +	else
> +		len = write_reuse_object(f, entry, limit, usable_delta);
> +	if (!len)
> +		return 0;
>  
>  	if (usable_delta)
>  		written_delta++;
>  	written++;
>  	if (!pack_to_stdout)
>  		entry->idx.crc32 = crc32_end(f);
> -	return hdrlen + datalen;
> +	return len;
>  }
>  
>  enum write_one_status {

* Re: [PATCH v2 3/4] pack-objects: refactor write_object()
  2012-05-16 12:02   ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
  2012-05-18 21:16     ` Junio C Hamano
@ 2012-05-19  2:43     ` Nicolas Pitre
  1 sibling, 0 replies; 16+ messages in thread
From: Nicolas Pitre @ 2012-05-19  2:43 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

On Wed, 16 May 2012, Nguyễn Thái Ngọc Duy wrote:

> Move the !to_reuse and to_reuse write code out into two separate
> functions and remove the "goto no_reuse;" hack.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

I like this very much.

Acked-by: Nicolas Pitre <nico@fluxnic.net>
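
For readers skimming the thread, here is a condensed sketch of what
write_object() boils down to after this refactoring (simplified from the
patch quoted below; the limit computation, CRC bookkeeping and error
handling are elided):

  /* Both helpers return 0 when writing the object would bust the
   * pack-size limit, so the caller no longer needs the old goto. */
  static unsigned long write_object(struct sha1file *f,
                                    struct object_entry *entry,
                                    off_t write_offset)
  {
          unsigned long limit, len;
          int usable_delta, to_reuse;

          /* ... compute limit, usable_delta and to_reuse as before ... */

          if (!to_reuse)
                  len = write_no_reuse_object(f, entry, limit, usable_delta);
          else
                  len = write_reuse_object(f, entry, limit, usable_delta);
          if (!len)
                  return 0;       /* pack split needed */

          /* ... update written/written_delta and the CRC as before ... */
          return len;
  }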


> ---
>  builtin/pack-objects.c |  322 ++++++++++++++++++++++++++----------------------
>  1 files changed, 172 insertions(+), 150 deletions(-)
> 
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index b2e0940..ccfcbad 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -200,22 +200,178 @@ static void copy_pack_data(struct sha1file *f,
>  }
>  
>  /* Return 0 if we will bust the pack-size limit */
> -static unsigned long write_object(struct sha1file *f,
> -				  struct object_entry *entry,
> -				  off_t write_offset)
> +static unsigned long write_no_reuse_object(struct sha1file *f, struct object_entry *entry,
> +					   unsigned long limit, int usable_delta)
>  {
> -	unsigned long size, limit, datalen;
> -	void *buf;
> +	unsigned long size, datalen;
>  	unsigned char header[10], dheader[10];
>  	unsigned hdrlen;
>  	enum object_type type;
> +	void *buf;
> +
> +	if (!usable_delta) {
> +		buf = read_sha1_file(entry->idx.sha1, &type, &size);
> +		if (!buf)
> +			die("unable to read %s", sha1_to_hex(entry->idx.sha1));
> +		/*
> +		 * make sure no cached delta data remains from a
> +		 * previous attempt before a pack split occurred.
> +		 */
> +		free(entry->delta_data);
> +		entry->delta_data = NULL;
> +		entry->z_delta_size = 0;
> +	} else if (entry->delta_data) {
> +		size = entry->delta_size;
> +		buf = entry->delta_data;
> +		entry->delta_data = NULL;
> +		type = (allow_ofs_delta && entry->delta->idx.offset) ?
> +			OBJ_OFS_DELTA : OBJ_REF_DELTA;
> +	} else {
> +		buf = get_delta(entry);
> +		size = entry->delta_size;
> +		type = (allow_ofs_delta && entry->delta->idx.offset) ?
> +			OBJ_OFS_DELTA : OBJ_REF_DELTA;
> +	}
> +
> +	if (entry->z_delta_size)
> +		datalen = entry->z_delta_size;
> +	else
> +		datalen = do_compress(&buf, size);
> +
> +	/*
> +	 * The object header is a byte of 'type' followed by zero or
> +	 * more bytes of length.
> +	 */
> +	hdrlen = encode_in_pack_object_header(type, size, header);
> +
> +	if (type == OBJ_OFS_DELTA) {
> +		/*
> +		 * Deltas with relative base contain an additional
> +		 * encoding of the relative offset for the delta
> +		 * base from this object's position in the pack.
> +		 */
> +		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
> +		unsigned pos = sizeof(dheader) - 1;
> +		dheader[pos] = ofs & 127;
> +		while (ofs >>= 7)
> +			dheader[--pos] = 128 | (--ofs & 127);
> +		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
> +			free(buf);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +		sha1write(f, dheader + pos, sizeof(dheader) - pos);
> +		hdrlen += sizeof(dheader) - pos;
> +	} else if (type == OBJ_REF_DELTA) {
> +		/*
> +		 * Deltas with a base reference contain
> +		 * an additional 20 bytes for the base sha1.
> +		 */
> +		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
> +			free(buf);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +		sha1write(f, entry->delta->idx.sha1, 20);
> +		hdrlen += 20;
> +	} else {
> +		if (limit && hdrlen + datalen + 20 >= limit) {
> +			free(buf);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +	}
> +	sha1write(f, buf, datalen);
> +	free(buf);
> +
> +	return hdrlen + datalen;
> +}
> +
> +/* Return 0 if we will bust the pack-size limit */
> +static unsigned long write_reuse_object(struct sha1file *f, struct object_entry *entry,
> +					unsigned long limit, int usable_delta)
> +{
> +	struct packed_git *p = entry->in_pack;
> +	struct pack_window *w_curs = NULL;
> +	struct revindex_entry *revidx;
> +	off_t offset;
> +	enum object_type type = entry->type;
> +	unsigned long datalen;
> +	unsigned char header[10], dheader[10];
> +	unsigned hdrlen;
> +
> +	if (entry->delta)
> +		type = (allow_ofs_delta && entry->delta->idx.offset) ?
> +			OBJ_OFS_DELTA : OBJ_REF_DELTA;
> +	hdrlen = encode_in_pack_object_header(type, entry->size, header);
> +
> +	offset = entry->in_pack_offset;
> +	revidx = find_pack_revindex(p, offset);
> +	datalen = revidx[1].offset - offset;
> +	if (!pack_to_stdout && p->index_version > 1 &&
> +	    check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) {
> +		error("bad packed object CRC for %s", sha1_to_hex(entry->idx.sha1));
> +		unuse_pack(&w_curs);
> +		return write_no_reuse_object(f, entry, limit, usable_delta);
> +	}
> +
> +	offset += entry->in_pack_header_size;
> +	datalen -= entry->in_pack_header_size;
> +
> +	if (!pack_to_stdout && p->index_version == 1 &&
> +	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
> +		error("corrupt packed object for %s", sha1_to_hex(entry->idx.sha1));
> +		unuse_pack(&w_curs);
> +		return write_no_reuse_object(f, entry, limit, usable_delta);
> +	}
> +
> +	if (type == OBJ_OFS_DELTA) {
> +		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
> +		unsigned pos = sizeof(dheader) - 1;
> +		dheader[pos] = ofs & 127;
> +		while (ofs >>= 7)
> +			dheader[--pos] = 128 | (--ofs & 127);
> +		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
> +			unuse_pack(&w_curs);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +		sha1write(f, dheader + pos, sizeof(dheader) - pos);
> +		hdrlen += sizeof(dheader) - pos;
> +		reused_delta++;
> +	} else if (type == OBJ_REF_DELTA) {
> +		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
> +			unuse_pack(&w_curs);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +		sha1write(f, entry->delta->idx.sha1, 20);
> +		hdrlen += 20;
> +		reused_delta++;
> +	} else {
> +		if (limit && hdrlen + datalen + 20 >= limit) {
> +			unuse_pack(&w_curs);
> +			return 0;
> +		}
> +		sha1write(f, header, hdrlen);
> +	}
> +	copy_pack_data(f, p, &w_curs, offset, datalen);
> +	unuse_pack(&w_curs);
> +	reused++;
> +	return hdrlen + datalen;
> +}
> +
> +/* Return 0 if we will bust the pack-size limit */
> +static unsigned long write_object(struct sha1file *f,
> +				  struct object_entry *entry,
> +				  off_t write_offset)
> +{
> +	unsigned long limit, len;
>  	int usable_delta, to_reuse;
>  
>  	if (!pack_to_stdout)
>  		crc32_begin(f);
>  
> -	type = entry->type;
> -
>  	/* apply size limit if limited packsize and not first object */
>  	if (!pack_size_limit || !nr_written)
>  		limit = 0;
> @@ -243,11 +399,11 @@ static unsigned long write_object(struct sha1file *f,
>  		to_reuse = 0;	/* explicit */
>  	else if (!entry->in_pack)
>  		to_reuse = 0;	/* can't reuse what we don't have */
> -	else if (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA)
> +	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
>  				/* check_object() decided it for us ... */
>  		to_reuse = usable_delta;
>  				/* ... but pack split may override that */
> -	else if (type != entry->in_pack_type)
> +	else if (entry->type != entry->in_pack_type)
>  		to_reuse = 0;	/* pack has delta which is unusable */
>  	else if (entry->delta)
>  		to_reuse = 0;	/* we want to pack afresh */
> @@ -256,153 +412,19 @@ static unsigned long write_object(struct sha1file *f,
>  				 * and we do not need to deltify it.
>  				 */
>  
> -	if (!to_reuse) {
> -		no_reuse:
> -		if (!usable_delta) {
> -			buf = read_sha1_file(entry->idx.sha1, &type, &size);
> -			if (!buf)
> -				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
> -			/*
> -			 * make sure no cached delta data remains from a
> -			 * previous attempt before a pack split occurred.
> -			 */
> -			free(entry->delta_data);
> -			entry->delta_data = NULL;
> -			entry->z_delta_size = 0;
> -		} else if (entry->delta_data) {
> -			size = entry->delta_size;
> -			buf = entry->delta_data;
> -			entry->delta_data = NULL;
> -			type = (allow_ofs_delta && entry->delta->idx.offset) ?
> -				OBJ_OFS_DELTA : OBJ_REF_DELTA;
> -		} else {
> -			buf = get_delta(entry);
> -			size = entry->delta_size;
> -			type = (allow_ofs_delta && entry->delta->idx.offset) ?
> -				OBJ_OFS_DELTA : OBJ_REF_DELTA;
> -		}
> -
> -		if (entry->z_delta_size)
> -			datalen = entry->z_delta_size;
> -		else
> -			datalen = do_compress(&buf, size);
> -
> -		/*
> -		 * The object header is a byte of 'type' followed by zero or
> -		 * more bytes of length.
> -		 */
> -		hdrlen = encode_in_pack_object_header(type, size, header);
> -
> -		if (type == OBJ_OFS_DELTA) {
> -			/*
> -			 * Deltas with relative base contain an additional
> -			 * encoding of the relative offset for the delta
> -			 * base from this object's position in the pack.
> -			 */
> -			off_t ofs = entry->idx.offset - entry->delta->idx.offset;
> -			unsigned pos = sizeof(dheader) - 1;
> -			dheader[pos] = ofs & 127;
> -			while (ofs >>= 7)
> -				dheader[--pos] = 128 | (--ofs & 127);
> -			if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
> -				free(buf);
> -				return 0;
> -			}
> -			sha1write(f, header, hdrlen);
> -			sha1write(f, dheader + pos, sizeof(dheader) - pos);
> -			hdrlen += sizeof(dheader) - pos;
> -		} else if (type == OBJ_REF_DELTA) {
> -			/*
> -			 * Deltas with a base reference contain
> -			 * an additional 20 bytes for the base sha1.
> -			 */
> -			if (limit && hdrlen + 20 + datalen + 20 >= limit) {
> -				free(buf);
> -				return 0;
> -			}
> -			sha1write(f, header, hdrlen);
> -			sha1write(f, entry->delta->idx.sha1, 20);
> -			hdrlen += 20;
> -		} else {
> -			if (limit && hdrlen + datalen + 20 >= limit) {
> -				free(buf);
> -				return 0;
> -			}
> -			sha1write(f, header, hdrlen);
> -		}
> -		sha1write(f, buf, datalen);
> -		free(buf);
> -	}
> -	else {
> -		struct packed_git *p = entry->in_pack;
> -		struct pack_window *w_curs = NULL;
> -		struct revindex_entry *revidx;
> -		off_t offset;
> -
> -		if (entry->delta)
> -			type = (allow_ofs_delta && entry->delta->idx.offset) ?
> -				OBJ_OFS_DELTA : OBJ_REF_DELTA;
> -		hdrlen = encode_in_pack_object_header(type, entry->size, header);
> -
> -		offset = entry->in_pack_offset;
> -		revidx = find_pack_revindex(p, offset);
> -		datalen = revidx[1].offset - offset;
> -		if (!pack_to_stdout && p->index_version > 1 &&
> -		    check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) {
> -			error("bad packed object CRC for %s", sha1_to_hex(entry->idx.sha1));
> -			unuse_pack(&w_curs);
> -			goto no_reuse;
> -		}
> -
> -		offset += entry->in_pack_header_size;
> -		datalen -= entry->in_pack_header_size;
> -		if (!pack_to_stdout && p->index_version == 1 &&
> -		    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
> -			error("corrupt packed object for %s", sha1_to_hex(entry->idx.sha1));
> -			unuse_pack(&w_curs);
> -			goto no_reuse;
> -		}
> +	if (!to_reuse)
> +		len = write_no_reuse_object(f, entry, limit, usable_delta);
> +	else
> +		len = write_reuse_object(f, entry, limit, usable_delta);
> +	if (!len)
> +		return 0;
>  
> -		if (type == OBJ_OFS_DELTA) {
> -			off_t ofs = entry->idx.offset - entry->delta->idx.offset;
> -			unsigned pos = sizeof(dheader) - 1;
> -			dheader[pos] = ofs & 127;
> -			while (ofs >>= 7)
> -				dheader[--pos] = 128 | (--ofs & 127);
> -			if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
> -				unuse_pack(&w_curs);
> -				return 0;
> -			}
> -			sha1write(f, header, hdrlen);
> -			sha1write(f, dheader + pos, sizeof(dheader) - pos);
> -			hdrlen += sizeof(dheader) - pos;
> -			reused_delta++;
> -		} else if (type == OBJ_REF_DELTA) {
> -			if (limit && hdrlen + 20 + datalen + 20 >= limit) {
> -				unuse_pack(&w_curs);
> -				return 0;
> -			}
> -			sha1write(f, header, hdrlen);
> -			sha1write(f, entry->delta->idx.sha1, 20);
> -			hdrlen += 20;
> -			reused_delta++;
> -		} else {
> -			if (limit && hdrlen + datalen + 20 >= limit) {
> -				unuse_pack(&w_curs);
> -				return 0;
> -			}
> -			sha1write(f, header, hdrlen);
> -		}
> -		copy_pack_data(f, p, &w_curs, offset, datalen);
> -		unuse_pack(&w_curs);
> -		reused++;
> -	}
>  	if (usable_delta)
>  		written_delta++;
>  	written++;
>  	if (!pack_to_stdout)
>  		entry->idx.crc32 = crc32_end(f);
> -	return hdrlen + datalen;
> +	return len;
>  }
>  
>  enum write_one_status {
> -- 
> 1.7.8.36.g69ee2
> 
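
A note for readers following the header arithmetic above: the pack object
header that encode_in_pack_object_header() produces stores the object type
in bits 4-6 of the first byte and the size as a base-128 quantity, least
significant group first, with the high bit marking "more length bytes
follow". A minimal sketch of an encoder for that format (illustrative
only -- not the in-tree function, whose signature may differ; 'type' is
the numeric value of git's enum object_type):

  static unsigned encode_pack_header_sketch(unsigned type,
                                            unsigned long size,
                                            unsigned char *hdr)
  {
          unsigned n = 0;
          unsigned char c = (type << 4) | (size & 15);

          size >>= 4;
          while (size) {
                  hdr[n++] = c | 0x80;  /* continuation: more size bytes follow */
                  c = size & 0x7f;
                  size >>= 7;
          }
          hdr[n++] = c;                 /* last byte, continuation bit clear */
          return n;                     /* header length, 1..10 bytes */
  }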

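Similarly, the OBJ_OFS_DELTA base offset written in both writers uses
big-endian base-128 groups with an off-by-one bias on each continuation
step (so, for example, 200 encodes as the two bytes 0x80 0x48). A small
sketch of the encoder from the patch together with a matching decoder
(hypothetical helper names, for illustration only):

  #include <sys/types.h>        /* off_t */

  /* Encode, as in the patch: most significant group first, MSB set on
   * all but the last byte; returns the index of the first used byte. */
  static unsigned encode_ofs_sketch(off_t ofs, unsigned char *dheader,
                                    unsigned len)
  {
          unsigned pos = len - 1;

          dheader[pos] = ofs & 127;
          while (ofs >>= 7)
                  dheader[--pos] = 128 | (--ofs & 127);
          return pos;            /* bytes dheader[pos..len-1] go to the pack */
  }

  /* Matching decode, reconstructing the distance back to the base. */
  static off_t decode_ofs_sketch(const unsigned char *buf)
  {
          unsigned i = 0;
          off_t ofs = buf[0] & 127;

          while (buf[i] & 128)
                  ofs = ((ofs + 1) << 7) | (buf[++i] & 127);
          return ofs;
  }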

Thread overview: 16+ messages
2012-05-12 10:26 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-12 16:51 ` Nicolas Pitre
2012-05-13  4:37   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
2012-05-14 15:56     ` Junio C Hamano
2012-05-14 19:43     ` Junio C Hamano
2012-05-15 11:18       ` Nguyen Thai Ngoc Duy
2012-05-15 15:27         ` Junio C Hamano
2012-05-16  7:09           ` Nguyen Thai Ngoc Duy
2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
2012-05-16 12:02   ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
2012-05-18 21:05     ` Junio C Hamano
2012-05-16 12:02   ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
2012-05-18 21:16     ` Junio C Hamano
2012-05-19  2:43     ` Nicolas Pitre
2012-05-16 12:02   ` [PATCH v2 4/4] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-18 21:02   ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Junio C Hamano
