git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH 1/2] index-pack: hash non-delta objects while reading from stream
@ 2012-05-16 12:50 Nguyễn Thái Ngọc Duy
  2012-05-16 12:50 ` [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-16 12:50 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Preparatory step for large blob support.

 builtin/index-pack.c |   43 ++++++++++++++++++++++++++++++++-----------
 1 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index dc2cfe6..ccb0214 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -384,30 +384,55 @@ static void unlink_base_data(struct base_data *c)
 	free_base_data(c);
 }
 
-static void *unpack_entry_data(unsigned long offset, unsigned long size)
+static int is_delta_type(enum object_type type)
+{
+	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
+}
+
+static void *unpack_entry_data(unsigned long offset, unsigned long size,
+			       enum object_type type, unsigned char *sha1)
 {
 	int status;
 	git_zstream stream;
 	void *buf = xmalloc(size);
+	git_SHA_CTX c;
+	char hdr[32];
+	int hdrlen;
+	unsigned char *last_out;
+
+	if (!is_delta_type(type)) {
+		hdrlen = sprintf(hdr, "%s %lu", typename(type), size)+1;
+		git_SHA1_Init(&c);
+		git_SHA1_Update(&c, hdr, hdrlen);
+	} else
+		sha1 = NULL;
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = buf;
 	stream.avail_out = size;
+	last_out = buf;
 
 	do {
 		stream.next_in = fill(1);
 		stream.avail_in = input_len;
 		status = git_inflate(&stream, 0);
 		use(input_len - stream.avail_in);
+		if (sha1)
+			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
+		last_out = stream.next_out;
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
 		bad_object(offset, _("inflate returned %d"), status);
 	git_inflate_end(&stream);
+	if (sha1)
+		git_SHA1_Final(sha1, &c);
 	return buf;
 }
 
-static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base)
+static void *unpack_raw_entry(struct object_entry *obj,
+			      union delta_base *delta_base,
+			      unsigned char *sha1)
 {
 	unsigned char *p;
 	unsigned long size, c;
@@ -467,7 +492,7 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 	}
 	obj->hdr_size = consumed_bytes - obj->idx.offset;
 
-	data = unpack_entry_data(obj->idx.offset, obj->size);
+	data = unpack_entry_data(obj->idx.offset, obj->size, obj->type, sha1);
 	obj->idx.crc32 = input_crc32;
 	return data;
 }
@@ -569,9 +594,8 @@ static void find_delta_children(const union delta_base *base,
 }
 
 static void sha1_object(const void *data, unsigned long size,
-			enum object_type type, unsigned char *sha1)
+			enum object_type type, const unsigned char *sha1)
 {
-	hash_sha1_file(data, size, typename(type), sha1);
 	read_lock();
 	if (has_sha1_file(sha1)) {
 		void *has_data;
@@ -627,11 +651,6 @@ static void sha1_object(const void *data, unsigned long size,
 	}
 }
 
-static int is_delta_type(enum object_type type)
-{
-	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
-}
-
 /*
  * This function is part of find_unresolved_deltas(). There are two
  * walkers going in the opposite ways.
@@ -711,6 +730,8 @@ static void resolve_delta(struct object_entry *delta_obj,
 	free(delta_data);
 	if (!result->data)
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
+	hash_sha1_file(result->data, result->size,
+		       typename(delta_obj->real_type), delta_obj->idx.sha1);
 	sha1_object(result->data, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
@@ -851,7 +872,7 @@ static void parse_pack_objects(unsigned char *sha1)
 				nr_objects);
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
-		void *data = unpack_raw_entry(obj, &delta->base);
+		void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1);
 		obj->real_type = obj->type;
 		if (is_delta_type(obj->type)) {
 			nr_deltas++;
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time)
  2012-05-16 12:50 [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Nguyễn Thái Ngọc Duy
@ 2012-05-16 12:50 ` Nguyễn Thái Ngọc Duy
  2012-05-18 22:20   ` Junio C Hamano
  2012-05-18 22:15 ` [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Junio C Hamano
  2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
  2 siblings, 1 reply; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-16 12:50 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

unpack_raw_entry() will not allocate and return decompressed blobs if
they are larger than core.bigFileThreshold.

The blob content is needed by sha1_object() in some cases. When we do
need the blob content, we put it back in core with
get_data_from_pack(). However we rarely need that in pratice.

The first case is when we find an in-repo blob with the same SHA-1. We
need to do collision test, byte-on-byte. Normally (e.g. in
fetch/pull/clone) this does not happen because git avoid to send
objects that client already has.

The other case is when --strict is specified and the object in
question is not a blob, which can't happen in reality becase we deal
with large _blobs_ here.

Note: --verify (or git-verify-pack) a pack from current repository
will trigger collision test on every object in the pack, which
effectively disables this patch. This could be easily worked around by
setting GIT_DIR to an imaginary place with no packs.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c |   52 ++++++++++++++++++++++++++++++++++++++++++-------
 t/t1050-large.sh     |    5 ++++
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index ccb0214..1b790df 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -392,9 +392,10 @@ static int is_delta_type(enum object_type type)
 static void *unpack_entry_data(unsigned long offset, unsigned long size,
 			       enum object_type type, unsigned char *sha1)
 {
+	static char fixed_buf[8192];
 	int status;
 	git_zstream stream;
-	void *buf = xmalloc(size);
+	void *buf;
 	git_SHA_CTX c;
 	char hdr[32];
 	int hdrlen;
@@ -406,11 +407,15 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		git_SHA1_Update(&c, hdr, hdrlen);
 	} else
 		sha1 = NULL;
+	if (type == OBJ_BLOB && size > big_file_threshold)
+		buf = fixed_buf;
+	else
+		buf = xmalloc(size);
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
 	last_out = buf;
 
 	do {
@@ -420,6 +425,10 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		use(input_len - stream.avail_in);
 		if (sha1)
 			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
+		if (buf == fixed_buf) {
+			stream.next_out = buf;
+			stream.avail_out = sizeof(fixed_buf);
+		}
 		last_out = stream.next_out;
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
@@ -427,7 +436,7 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 	git_inflate_end(&stream);
 	if (sha1)
 		git_SHA1_Final(sha1, &c);
-	return buf;
+	return buf == fixed_buf ? NULL : buf;
 }
 
 static void *unpack_raw_entry(struct object_entry *obj,
@@ -593,14 +602,21 @@ static void find_delta_children(const union delta_base *base,
 	*last_index = last;
 }
 
-static void sha1_object(const void *data, unsigned long size,
-			enum object_type type, const unsigned char *sha1)
+static void sha1_object(const void *data, struct object_entry *obj_entry,
+			unsigned long size, enum object_type type,
+			const unsigned char *sha1)
 {
+	void *new_data = NULL;
+
+	assert(data || obj_entry);
+
 	read_lock();
 	if (has_sha1_file(sha1)) {
 		void *has_data;
 		enum object_type has_type;
 		unsigned long has_size;
+		if (!data)
+			data = new_data = get_data_from_pack(obj_entry);
 		has_data = read_sha1_file(sha1, &has_type, &has_size);
 		read_unlock();
 		if (!has_data)
@@ -625,6 +641,9 @@ static void sha1_object(const void *data, unsigned long size,
 			int eaten;
 			void *buf = (void *) data;
 
+			if (!buf)
+				buf = new_data = get_data_from_pack(obj_entry);
+
 			/*
 			 * we do not need to free the memory here, as the
 			 * buf is deleted by the caller.
@@ -649,6 +668,8 @@ static void sha1_object(const void *data, unsigned long size,
 		}
 		read_unlock();
 	}
+
+	free(new_data);
 }
 
 /*
@@ -732,7 +753,7 @@ static void resolve_delta(struct object_entry *delta_obj,
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
 	hash_sha1_file(result->data, result->size,
 		       typename(delta_obj->real_type), delta_obj->idx.sha1);
-	sha1_object(result->data, result->size, delta_obj->real_type,
+	sha1_object(result->data, NULL, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
 	nr_resolved_deltas++;
@@ -862,7 +883,7 @@ static void *threaded_second_pass(void *data)
  */
 static void parse_pack_objects(unsigned char *sha1)
 {
-	int i;
+	int i, nr_delays = 0;
 	struct delta_entry *delta = deltas;
 	struct stat st;
 
@@ -878,8 +899,12 @@ static void parse_pack_objects(unsigned char *sha1)
 			nr_deltas++;
 			delta->obj_no = i;
 			delta++;
+		} else if (!data) {
+			/* large blobs, check later */
+			obj->real_type = OBJ_BAD;
+			nr_delays++;
 		} else
-			sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+			sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
 		free(data);
 		display_progress(progress, i+1);
 	}
@@ -899,6 +924,17 @@ static void parse_pack_objects(unsigned char *sha1)
 	if (S_ISREG(st.st_mode) &&
 			lseek(input_fd, 0, SEEK_CUR) - input_len != st.st_size)
 		die(_("pack has junk at the end"));
+
+	for (i = 0; i < nr_objects; i++) {
+		struct object_entry *obj = &objects[i];
+		if (obj->real_type != OBJ_BAD)
+			continue;
+		obj->real_type = obj->type;
+		sha1_object(NULL, obj, obj->size, obj->type, obj->idx.sha1);
+		nr_delays--;
+	}
+	if (nr_delays)
+		die(_("confusion beyond insanity in parse_pack_objects()"));
 }
 
 /*
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..3f80688 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -130,6 +130,11 @@ test_expect_success 'git-show a large file' '
 
 '
 
+test_expect_success 'index-pack' '
+	git clone file://"`pwd`"/.git foo &&
+	GIT_DIR=non-existent git index-pack --strict --verify foo/.git/objects/pack/*.pack
+'
+
 test_expect_success 'repack' '
 	git repack -ad
 '
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] index-pack: hash non-delta objects while reading from stream
  2012-05-16 12:50 [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Nguyễn Thái Ngọc Duy
  2012-05-16 12:50 ` [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
@ 2012-05-18 22:15 ` Junio C Hamano
  2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
  2 siblings, 0 replies; 16+ messages in thread
From: Junio C Hamano @ 2012-05-18 22:15 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> +static void *unpack_entry_data(unsigned long offset, unsigned long size,
> +			       enum object_type type, unsigned char *sha1)
>  {
>  	int status;
>  	git_zstream stream;
>  	void *buf = xmalloc(size);
> +	git_SHA_CTX c;
> +	char hdr[32];
> +	int hdrlen;
> +	unsigned char *last_out;
> +
> +	if (!is_delta_type(type)) {
> +		hdrlen = sprintf(hdr, "%s %lu", typename(type), size)+1;

s/+/ & /;

> +		git_SHA1_Init(&c);
> +		git_SHA1_Update(&c, hdr, hdrlen);
> +	} else
> +		sha1 = NULL;
> ...
>  	memset(&stream, 0, sizeof(stream));
>  	git_inflate_init(&stream);
>  	stream.next_out = buf;
>  	stream.avail_out = size;
> +	last_out = buf;
>  
>  	do {
>  		stream.next_in = fill(1);
>  		stream.avail_in = input_len;
>  		status = git_inflate(&stream, 0);
>  		use(input_len - stream.avail_in);
> +		if (sha1)
> +			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
> +		last_out = stream.next_out;
>  	} while (status == Z_OK);

Shouldn't "last_out" be scoped inside the loop, perhaps like this,

	do {
		unsigned char *boo = stream.next_out;
		stream.next_in = fill(1);
		stream.avail_in = input_len;
		status = git_inflate(&stream, 0);
		use(input_len - stream.avail_in);
		if (sha1)
			git_SHA1_Update(&c, boo, stream.next_out - boo);
	} while (status == Z_OK);

instead of leaking it to the whole function?

> @@ -711,6 +730,8 @@ static void resolve_delta(struct object_entry *delta_obj,
>  	free(delta_data);
>  	if (!result->data)
>  		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
> +	hash_sha1_file(result->data, result->size,
> +		       typename(delta_obj->real_type), delta_obj->idx.sha1);
>  	sha1_object(result->data, result->size, delta_obj->real_type,
>  		    delta_obj->idx.sha1);
>  	counter_lock();

The call-chain after your patch looks like this:

    parse_pack_objects()
    . for each incoming object 
      -> unpack_raw_entry()
	 . check type
         -> unpack_entry_data()
            . check type
            . unpack to obtain the data while hashing
	      if the type can be hashed.
      . check type
      . if the data is complete, let sha1_object() check it

The updated sha1_object() no longer hashes but the caller is responsible
for giving a correct hash value for the data, so you added a call to
hash_sha1_file() to the other caller (i.e resolve_delta()) of the
function.

The flow of the resulting code feels somewhat iffy that two functions
that are very far apart in the callchain have to coordinate among
themselves to ensure that the object data is always hashed once: plain
ones in unpack_entry_data() and deltified ones in resolve_delta().  It
smells like inviting future bugs when we introduce new in-pack types
that are neither delta or plain.

Overall it looks correct.  Thanks.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time)
  2012-05-16 12:50 ` [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
@ 2012-05-18 22:20   ` Junio C Hamano
  2012-05-19  5:31     ` Nguyen Thai Ngoc Duy
  0 siblings, 1 reply; 16+ messages in thread
From: Junio C Hamano @ 2012-05-18 22:20 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> unpack_raw_entry() will not allocate and return decompressed blobs if
> they are larger than core.bigFileThreshold.
>
> The blob content is needed by sha1_object() in some cases. When we do
> need the blob content, we put it back in core with
> get_data_from_pack(). However we rarely need that in pratice.
>
> The first case is when we find an in-repo blob with the same SHA-1. We
> need to do collision test, byte-on-byte. Normally (e.g. in
> fetch/pull/clone) this does not happen because git avoid to send
> objects that client already has.

Perhaps the codepath that performs the byte-for-byte comparison can be
taught to stream from the received pack data and whatever was already
in the repository, using the streaming interface?  That way you do not
have to hold all of the both objects at the same time in core, no?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time)
  2012-05-18 22:20   ` Junio C Hamano
@ 2012-05-19  5:31     ` Nguyen Thai Ngoc Duy
  0 siblings, 0 replies; 16+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-05-19  5:31 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sat, May 19, 2012 at 5:20 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> unpack_raw_entry() will not allocate and return decompressed blobs if
>> they are larger than core.bigFileThreshold.
>>
>> The blob content is needed by sha1_object() in some cases. When we do
>> need the blob content, we put it back in core with
>> get_data_from_pack(). However we rarely need that in pratice.
>>
>> The first case is when we find an in-repo blob with the same SHA-1. We
>> need to do collision test, byte-on-byte. Normally (e.g. in
>> fetch/pull/clone) this does not happen because git avoid to send
>> objects that client already has.
>
> Perhaps the codepath that performs the byte-for-byte comparison can be
> taught to stream from the received pack data and whatever was already
> in the repository, using the streaming interface?  That way you do not
> have to hold all of the both objects at the same time in core, no?

Sure. But that does not happen until you are attacked. Maybe later.

verify-pack'ing also runs into this, but that's another issue and
should be fixed separately (hopefully by verify-pack users).
-- 
Duy

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 1/6] index-pack: hash non-delta objects while reading from stream
  2012-05-16 12:50 [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Nguyễn Thái Ngọc Duy
  2012-05-16 12:50 ` [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
  2012-05-18 22:15 ` [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Junio C Hamano
@ 2012-05-23 14:09 ` Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 2/6] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
                     ` (4 more replies)
  2 siblings, 5 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-23 14:09 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 fix spaces around "+ 1" operator in unpack_entry_data and last_out's scope.

 builtin/index-pack.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index dc2cfe6..a744856 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -384,11 +384,27 @@ static void unlink_base_data(struct base_data *c)
 	free_base_data(c);
 }
 
-static void *unpack_entry_data(unsigned long offset, unsigned long size)
+static int is_delta_type(enum object_type type)
+{
+	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
+}
+
+static void *unpack_entry_data(unsigned long offset, unsigned long size,
+			       enum object_type type, unsigned char *sha1)
 {
 	int status;
 	git_zstream stream;
 	void *buf = xmalloc(size);
+	git_SHA_CTX c;
+	char hdr[32];
+	int hdrlen;
+
+	if (!is_delta_type(type)) {
+		hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1;
+		git_SHA1_Init(&c);
+		git_SHA1_Update(&c, hdr, hdrlen);
+	} else
+		sha1 = NULL;
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
@@ -396,18 +412,25 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size)
 	stream.avail_out = size;
 
 	do {
+		unsigned char *last_out = stream.next_out;
 		stream.next_in = fill(1);
 		stream.avail_in = input_len;
 		status = git_inflate(&stream, 0);
 		use(input_len - stream.avail_in);
+		if (sha1)
+			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
 		bad_object(offset, _("inflate returned %d"), status);
 	git_inflate_end(&stream);
+	if (sha1)
+		git_SHA1_Final(sha1, &c);
 	return buf;
 }
 
-static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base)
+static void *unpack_raw_entry(struct object_entry *obj,
+			      union delta_base *delta_base,
+			      unsigned char *sha1)
 {
 	unsigned char *p;
 	unsigned long size, c;
@@ -467,7 +490,7 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 	}
 	obj->hdr_size = consumed_bytes - obj->idx.offset;
 
-	data = unpack_entry_data(obj->idx.offset, obj->size);
+	data = unpack_entry_data(obj->idx.offset, obj->size, obj->type, sha1);
 	obj->idx.crc32 = input_crc32;
 	return data;
 }
@@ -569,9 +592,8 @@ static void find_delta_children(const union delta_base *base,
 }
 
 static void sha1_object(const void *data, unsigned long size,
-			enum object_type type, unsigned char *sha1)
+			enum object_type type, const unsigned char *sha1)
 {
-	hash_sha1_file(data, size, typename(type), sha1);
 	read_lock();
 	if (has_sha1_file(sha1)) {
 		void *has_data;
@@ -627,11 +649,6 @@ static void sha1_object(const void *data, unsigned long size,
 	}
 }
 
-static int is_delta_type(enum object_type type)
-{
-	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
-}
-
 /*
  * This function is part of find_unresolved_deltas(). There are two
  * walkers going in the opposite ways.
@@ -711,6 +728,8 @@ static void resolve_delta(struct object_entry *delta_obj,
 	free(delta_data);
 	if (!result->data)
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
+	hash_sha1_file(result->data, result->size,
+		       typename(delta_obj->real_type), delta_obj->idx.sha1);
 	sha1_object(result->data, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
@@ -851,7 +870,7 @@ static void parse_pack_objects(unsigned char *sha1)
 				nr_objects);
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
-		void *data = unpack_raw_entry(obj, &delta->base);
+		void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1);
 		obj->real_type = obj->type;
 		if (is_delta_type(obj->type)) {
 			nr_deltas++;
-- 
1.7.10.2.549.g9354186

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 2/6] index-pack: use streaming interface on large blobs (most of the time)
  2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
@ 2012-05-23 14:09   ` Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 3/6] index-pack: factor out unpack core from get_data_from_pack Nguyễn Thái Ngọc Duy
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-23 14:09 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

unpack_raw_entry() will not allocate and return decompressed blobs if
they are larger than core.bigFileThreshold. sha1_object() may not be
called on those objects because there's no actual content.

sha1_object() is called later on those objects, where we can safely
use get_data_from_pack() to retrieve blob content for checking.
However we always do that when we definitely need the blob
content. And we often don't.

There are two cases when we may need object content. The first case is
when we find an in-repo blob with the same SHA-1. We need to do
collision test, byte-on-byte. If this test is on, the blob must be
loaded on memory (i.e. no streaming). Normally (e.g. in
fetch/pull/clone) this does not happen because git avoid to send
objects that client already has.

The other case is when --strict is specified and the object in
question is not a blob, which can't happen in reality becase we deal
with large _blobs_ here.

Note: --verify (or git-verify-pack) a pack from current repository
will trigger collision test on every object in the pack, which
effectively disables this patch. This could be easily worked around by
setting GIT_DIR to an imaginary place with no packs.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 52 ++++++++++++++++++++++++++++++++++++++++++++--------
 t/t1050-large.sh     |  5 +++++
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index a744856..c7c2b88 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -392,9 +392,10 @@ static int is_delta_type(enum object_type type)
 static void *unpack_entry_data(unsigned long offset, unsigned long size,
 			       enum object_type type, unsigned char *sha1)
 {
+	static char fixed_buf[8192];
 	int status;
 	git_zstream stream;
-	void *buf = xmalloc(size);
+	void *buf;
 	git_SHA_CTX c;
 	char hdr[32];
 	int hdrlen;
@@ -405,11 +406,15 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		git_SHA1_Update(&c, hdr, hdrlen);
 	} else
 		sha1 = NULL;
+	if (type == OBJ_BLOB && size > big_file_threshold)
+		buf = fixed_buf;
+	else
+		buf = xmalloc(size);
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
 
 	do {
 		unsigned char *last_out = stream.next_out;
@@ -419,13 +424,17 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		use(input_len - stream.avail_in);
 		if (sha1)
 			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
+		if (buf == fixed_buf) {
+			stream.next_out = buf;
+			stream.avail_out = sizeof(fixed_buf);
+		}
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
 		bad_object(offset, _("inflate returned %d"), status);
 	git_inflate_end(&stream);
 	if (sha1)
 		git_SHA1_Final(sha1, &c);
-	return buf;
+	return buf == fixed_buf ? NULL : buf;
 }
 
 static void *unpack_raw_entry(struct object_entry *obj,
@@ -591,14 +600,21 @@ static void find_delta_children(const union delta_base *base,
 	*last_index = last;
 }
 
-static void sha1_object(const void *data, unsigned long size,
-			enum object_type type, const unsigned char *sha1)
+static void sha1_object(const void *data, struct object_entry *obj_entry,
+			unsigned long size, enum object_type type,
+			const unsigned char *sha1)
 {
+	void *new_data = NULL;
+
+	assert(data || obj_entry);
+
 	read_lock();
 	if (has_sha1_file(sha1)) {
 		void *has_data;
 		enum object_type has_type;
 		unsigned long has_size;
+		if (!data)
+			data = new_data = get_data_from_pack(obj_entry);
 		has_data = read_sha1_file(sha1, &has_type, &has_size);
 		read_unlock();
 		if (!has_data)
@@ -623,6 +639,9 @@ static void sha1_object(const void *data, unsigned long size,
 			int eaten;
 			void *buf = (void *) data;
 
+			if (!buf)
+				buf = new_data = get_data_from_pack(obj_entry);
+
 			/*
 			 * we do not need to free the memory here, as the
 			 * buf is deleted by the caller.
@@ -647,6 +666,8 @@ static void sha1_object(const void *data, unsigned long size,
 		}
 		read_unlock();
 	}
+
+	free(new_data);
 }
 
 /*
@@ -730,7 +751,7 @@ static void resolve_delta(struct object_entry *delta_obj,
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
 	hash_sha1_file(result->data, result->size,
 		       typename(delta_obj->real_type), delta_obj->idx.sha1);
-	sha1_object(result->data, result->size, delta_obj->real_type,
+	sha1_object(result->data, NULL, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
 	nr_resolved_deltas++;
@@ -860,7 +881,7 @@ static void *threaded_second_pass(void *data)
  */
 static void parse_pack_objects(unsigned char *sha1)
 {
-	int i;
+	int i, nr_delays = 0;
 	struct delta_entry *delta = deltas;
 	struct stat st;
 
@@ -876,8 +897,12 @@ static void parse_pack_objects(unsigned char *sha1)
 			nr_deltas++;
 			delta->obj_no = i;
 			delta++;
+		} else if (!data) {
+			/* large blobs, check later */
+			obj->real_type = OBJ_BAD;
+			nr_delays++;
 		} else
-			sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+			sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
 		free(data);
 		display_progress(progress, i+1);
 	}
@@ -897,6 +922,17 @@ static void parse_pack_objects(unsigned char *sha1)
 	if (S_ISREG(st.st_mode) &&
 			lseek(input_fd, 0, SEEK_CUR) - input_len != st.st_size)
 		die(_("pack has junk at the end"));
+
+	for (i = 0; i < nr_objects; i++) {
+		struct object_entry *obj = &objects[i];
+		if (obj->real_type != OBJ_BAD)
+			continue;
+		obj->real_type = obj->type;
+		sha1_object(NULL, obj, obj->size, obj->type, obj->idx.sha1);
+		nr_delays--;
+	}
+	if (nr_delays)
+		die(_("confusion beyond insanity in parse_pack_objects()"));
 }
 
 /*
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..3f80688 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -130,6 +130,11 @@ test_expect_success 'git-show a large file' '
 
 '
 
+test_expect_success 'index-pack' '
+	git clone file://"`pwd`"/.git foo &&
+	GIT_DIR=non-existent git index-pack --strict --verify foo/.git/objects/pack/*.pack
+'
+
 test_expect_success 'repack' '
 	git repack -ad
 '
-- 
1.7.10.2.549.g9354186

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 3/6] index-pack: factor out unpack core from get_data_from_pack
  2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 2/6] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
@ 2012-05-23 14:09   ` Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs Nguyễn Thái Ngọc Duy
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-23 14:09 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

This allows caller to consume large inflated object with a fixed
amount of memory.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 for streaming collision test..

 builtin/index-pack.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c7c2b88..9129299 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -504,7 +504,9 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	return data;
 }
 
-static void *get_data_from_pack(struct object_entry *obj)
+static void *unpack_data(struct object_entry *obj,
+			 int (*consume)(const unsigned char *, unsigned long, void *),
+			 void *cb_data)
 {
 	off_t from = obj[0].idx.offset + obj[0].hdr_size;
 	unsigned long len = obj[1].idx.offset - from;
@@ -512,15 +514,16 @@ static void *get_data_from_pack(struct object_entry *obj)
 	git_zstream stream;
 	int status;
 
-	data = xmalloc(obj->size);
+	data = xmalloc(consume ? 64*1024 : obj->size);
 	inbuf = xmalloc((len < 64*1024) ? len : 64*1024);
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = data;
-	stream.avail_out = obj->size;
+	stream.avail_out = consume ? 64*1024 : obj->size;
 
 	do {
+		unsigned char *last_out = stream.next_out;
 		ssize_t n = (len < 64*1024) ? len : 64*1024;
 		n = pread(pack_fd, inbuf, n, from);
 		if (n < 0)
@@ -535,6 +538,15 @@ static void *get_data_from_pack(struct object_entry *obj)
 		stream.next_in = inbuf;
 		stream.avail_in = n;
 		status = git_inflate(&stream, 0);
+		if (consume) {
+			if (consume(last_out, stream.next_out - last_out, cb_data)) {
+				free(inbuf);
+				free(data);
+				return NULL;
+			}
+			stream.next_out = data;
+			stream.avail_out = 64*1024;
+		}
 	} while (len && status == Z_OK && !stream.avail_in);
 
 	/* This has been inflated OK when first encountered, so... */
@@ -543,9 +555,18 @@ static void *get_data_from_pack(struct object_entry *obj)
 
 	git_inflate_end(&stream);
 	free(inbuf);
+	if (consume) {
+		free(data);
+		data = NULL;
+	}
 	return data;
 }
 
+static void *get_data_from_pack(struct object_entry *obj)
+{
+	return unpack_data(obj, NULL, NULL);
+}
+
 static int compare_delta_bases(const union delta_base *base1,
 			       const union delta_base *base2,
 			       enum object_type type1,
-- 
1.7.10.2.549.g9354186

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs
  2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 2/6] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 3/6] index-pack: factor out unpack core from get_data_from_pack Nguyễn Thái Ngọc Duy
@ 2012-05-23 14:09   ` Nguyễn Thái Ngọc Duy
  2012-05-23 16:03     ` Junio C Hamano
  2012-05-24 13:55     ` [PATCH v2.1 " Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 5/6] index-pack: avoid collision test when verifying in-repo pack Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects Nguyễn Thái Ngọc Duy
  4 siblings, 2 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-23 14:09 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 the use of sha1_object_info() instead of has_sha1_file() is the cause
 of 6/6 as it is called on non-existent objects too.

 builtin/index-pack.c   | 83 +++++++++++++++++++++++++++++++++++++++++++++++---
 t/t5300-pack-object.sh |  5 +++
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 9129299..05b1d35 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -9,6 +9,7 @@
 #include "progress.h"
 #include "fsck.h"
 #include "exec_cmd.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char index_pack_usage[] =
@@ -621,21 +622,94 @@ static void find_delta_children(const union delta_base *base,
 	*last_index = last;
 }
 
+struct compare_data {
+	struct object_entry *entry;
+	struct git_istream *st;
+	unsigned char *buf;
+	unsigned long buf_size;
+};
+
+static int compare_objects(const unsigned char *buf, unsigned long size,
+			   void *cb_data)
+{
+	struct compare_data *data = cb_data;
+
+	if (data->buf_size < size) {
+		free(data->buf);
+		data->buf = xmalloc(size);
+		data->buf_size = size;
+	}
+
+	while (size) {
+		ssize_t len = read_istream(data->st, data->buf, size);
+		if (len == 0)
+			die(_("SHA1 COLLISION FOUND WITH %s !"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		if (len < 0)
+			die(_("unable to read %s"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		if (memcmp(buf, data->buf, len))
+			die(_("SHA1 COLLISION FOUND WITH %s !"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		size -= len;
+		buf += len;
+	}
+	return 0;
+}
+
+static int check_collison(struct object_entry *entry)
+{
+	struct compare_data data;
+	enum object_type type;
+	unsigned long size;
+
+	if (entry->size <= big_file_threshold || entry->type != OBJ_BLOB)
+		return -1;
+
+	memset(&data, 0, sizeof(data));
+	data.entry = entry;
+	data.st = open_istream(entry->idx.sha1, &type, &size, NULL);
+	if (!data.st)
+		return -1;
+	if (size != entry->size || type != entry->type)
+		die(_("SHA1 COLLISION FOUND WITH %s !"),
+		    sha1_to_hex(entry->idx.sha1));
+	unpack_data(entry, compare_objects, &data);
+	close_istream(data.st);
+	free(data.buf);
+	return 0;
+}
+
 static void sha1_object(const void *data, struct object_entry *obj_entry,
 			unsigned long size, enum object_type type,
 			const unsigned char *sha1)
 {
 	void *new_data = NULL;
+	int collision_test_needed = 1;
+	enum object_type has_type;
+	unsigned long has_size;
 
 	assert(data || obj_entry);
 
 	read_lock();
-	if (has_sha1_file(sha1)) {
+	has_type = sha1_object_info(sha1, &has_size);
+	read_unlock();
+
+	if (has_type < 0)
+		collision_test_needed = 0;
+	if (collision_test_needed && !data) {
+		read_lock();
+		if (!check_collison(obj_entry))
+			collision_test_needed = 0;
+		read_unlock();
+	}
+	if (collision_test_needed) {
 		void *has_data;
-		enum object_type has_type;
-		unsigned long has_size;
+		if (has_type != type || has_size != size)
+			die(_("SHA1 COLLISION FOUND WITH %s !"), sha1_to_hex(sha1));
 		if (!data)
 			data = new_data = get_data_from_pack(obj_entry);
+		read_lock();
 		has_data = read_sha1_file(sha1, &has_type, &has_size);
 		read_unlock();
 		if (!has_data)
@@ -644,8 +718,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
 		    memcmp(data, has_data, size) != 0)
 			die(_("SHA1 COLLISION FOUND WITH %s !"), sha1_to_hex(sha1));
 		free(has_data);
-	} else
-		read_unlock();
+	}
 
 	if (strict) {
 		read_lock();
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index d9d856b..300ed91 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -418,4 +418,9 @@ test_expect_success \
     'test_must_fail git index-pack -o bad.idx test-3.pack 2>msg &&
      grep "SHA1 COLLISION FOUND" msg'
 
+test_expect_success \
+    'make sure index-pack detects the SHA1 collision (large blobs)' \
+    'test_must_fail git -c core.bigfilethreshold=1 index-pack -o bad.idx test-3.pack 2>msg &&
+     grep "SHA1 COLLISION FOUND" msg'
+
 test_done
-- 
1.7.10.2.549.g9354186

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 5/6] index-pack: avoid collision test when verifying in-repo pack
  2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
                     ` (2 preceding siblings ...)
  2012-05-23 14:09   ` [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs Nguyễn Thái Ngọc Duy
@ 2012-05-23 14:09   ` Nguyễn Thái Ngọc Duy
  2012-05-23 14:09   ` [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects Nguyễn Thái Ngọc Duy
  4 siblings, 0 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-23 14:09 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 not part of the original goal, but while i'm touching this code..

 builtin/index-pack.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 05b1d35..641edb5 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -78,6 +78,7 @@ static int nr_threads;
 static int from_stdin;
 static int strict;
 static int verbose;
+static int verify;
 
 static struct progress *progress;
 
@@ -688,11 +689,13 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
 	int collision_test_needed = 1;
 	enum object_type has_type;
 	unsigned long has_size;
+	struct object_info oi;
 
 	assert(data || obj_entry);
 
+	oi.sizep = &has_size;
 	read_lock();
-	has_type = sha1_object_info(sha1, &has_size);
+	has_type = sha1_object_info_extended(sha1, &oi);
 	read_unlock();
 
 	if (has_type < 0)
@@ -703,6 +706,30 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
 			collision_test_needed = 0;
 		read_unlock();
 	}
+	read_lock();
+	if (collision_test_needed && oi.whence == OI_PACKED &&
+	    verify && !from_stdin &&
+	    is_pack_valid(oi.u.packed.pack)) {
+		static int good_stats = -1;
+		static struct stat st1, st2;
+
+		good_stats = good_stats == -1 &&
+			fstat_is_reliable() &&
+			!fstat(pack_fd, &st2) &&
+			!stat(oi.u.packed.pack->pack_name, &st1);
+		read_unlock();
+
+		/*
+		 * Windows builds do not support inode info. Inode
+		 * comparison would be wrong. Luckily in all Windows
+		 * builds set UNRELIABLE_FSTAT and good_stats would
+		 * always be 0, so we never get to inode comparison
+		 * test.
+		 */
+		if (good_stats && st1.st_ino == st2.st_ino)
+			collision_test_needed = 0;
+	} else
+		read_unlock();
 	if (collision_test_needed) {
 		void *has_data;
 		if (has_type != type || has_size != size)
@@ -1459,7 +1486,7 @@ static void show_pack_info(int stat_only)
 
 int cmd_index_pack(int argc, const char **argv, const char *prefix)
 {
-	int i, fix_thin_pack = 0, verify = 0, stat_only = 0, stat = 0;
+	int i, fix_thin_pack = 0, stat_only = 0, stat = 0;
 	const char *curr_pack, *curr_index;
 	const char *index_name = NULL, *pack_name = NULL;
 	const char *keep_name = NULL, *keep_msg = NULL;
-- 
1.7.10.2.549.g9354186

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects
  2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
                     ` (3 preceding siblings ...)
  2012-05-23 14:09   ` [PATCH v2 5/6] index-pack: avoid collision test when verifying in-repo pack Nguyễn Thái Ngọc Duy
@ 2012-05-23 14:09   ` Nguyễn Thái Ngọc Duy
  2012-05-23 14:24     ` Nguyen Thai Ngoc Duy
  4 siblings, 1 reply; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-23 14:09 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 makes sense to me, but I might have overlooked something

 sha1_file.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sha1_file.c b/sha1_file.c
index 3c4f165..520b41e 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -2133,6 +2133,8 @@ static int sha1_loose_object_info(const unsigned char *sha1, unsigned long *size
 	git_zstream stream;
 	char hdr[32];
 
+	if (!has_loose_object(sha1))
+		return -1;
 	map = map_sha1_file(sha1, &mapsize);
 	if (!map)
 		return error("unable to find %s", sha1_to_hex(sha1));
-- 
1.7.10.2.549.g9354186

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects
  2012-05-23 14:09   ` [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects Nguyễn Thái Ngọc Duy
@ 2012-05-23 14:24     ` Nguyen Thai Ngoc Duy
  2012-05-23 16:01       ` Junio C Hamano
  0 siblings, 1 reply; 16+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-05-23 14:24 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

On Wed, May 23, 2012 at 9:09 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  makes sense to me, but I might have overlooked something

while it's still making sense for me, i think it's more logical  to
move the check to the caller, where "entry in pack?" check is also
done.

>
>  sha1_file.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/sha1_file.c b/sha1_file.c
> index 3c4f165..520b41e 100644
> --- a/sha1_file.c
> +++ b/sha1_file.c
> @@ -2133,6 +2133,8 @@ static int sha1_loose_object_info(const unsigned char *sha1, unsigned long *size
>        git_zstream stream;
>        char hdr[32];
>
> +       if (!has_loose_object(sha1))
> +               return -1;
>        map = map_sha1_file(sha1, &mapsize);
>        if (!map)
>                return error("unable to find %s", sha1_to_hex(sha1));
> --
> 1.7.10.2.549.g9354186
>



-- 
Duy

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects
  2012-05-23 14:24     ` Nguyen Thai Ngoc Duy
@ 2012-05-23 16:01       ` Junio C Hamano
  2012-05-24 13:12         ` Nguyen Thai Ngoc Duy
  0 siblings, 1 reply; 16+ messages in thread
From: Junio C Hamano @ 2012-05-23 16:01 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git, Junio C Hamano

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> On Wed, May 23, 2012 at 9:09 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
>>
>> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
>> ---
>> makes sense to me, but I might have overlooked something
>
> while it's still making sense for me, i think it's more logical  to
> move the check to the caller, where "entry in pack?" check is also
> done.

I think most of the callers of sha1_object_info_extended() are using this
function, saying "We expect this object to exist somewhere, perhaps in
pack or perhaps in a loose form, and trying to see what it is", and they
rely on the first error message "unable to find" to be issued.

So in that sense, I do not see how this patch makes any sense at all.
Care to point out a codepath where we throw a random 20 bytes at it in
order to see if an object with the given object name exists?  That would
be the only case where "unable to find" might be an unwanted error
message.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs
  2012-05-23 14:09   ` [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs Nguyễn Thái Ngọc Duy
@ 2012-05-23 16:03     ` Junio C Hamano
  2012-05-24 13:55     ` [PATCH v2.1 " Nguyễn Thái Ngọc Duy
  1 sibling, 0 replies; 16+ messages in thread
From: Junio C Hamano @ 2012-05-23 16:03 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  the use of sha1_object_info() instead of has_sha1_file() is the cause
>  of 6/6 as it is called on non-existent objects too.

I think it is wrong to call object-info before knowing if it even exists.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects
  2012-05-23 16:01       ` Junio C Hamano
@ 2012-05-24 13:12         ` Nguyen Thai Ngoc Duy
  0 siblings, 0 replies; 16+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-05-24 13:12 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Wed, May 23, 2012 at 11:01 PM, Junio C Hamano <gitster@pobox.com> wrote:
>> while it's still making sense for me, i think it's more logical  to
>> move the check to the caller, where "entry in pack?" check is also
>> done.
>
> I think most of the callers of sha1_object_info_extended() are using this
> function, saying "We expect this object to exist somewhere, perhaps in
> pack or perhaps in a loose form, and trying to see what it is", and they
> rely on the first error message "unable to find" to be issued.

hmm.. if you see it from that angle, yes it makes sense

> So in that sense, I do not see how this patch makes any sense at all.
> Care to point out a codepath where we throw a random 20 bytes at it in
> order to see if an object with the given object name exists?  That would
> be the only case where "unable to find" might be an unwanted error
> message.

packed_delta_info(), fast-import (I think) and cat-file do not check
for object existence before calling this.
-- 
Duy

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2.1 4/6] index-pack: use streaming interface for collision test on large blobs
  2012-05-23 14:09   ` [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs Nguyễn Thái Ngọc Duy
  2012-05-23 16:03     ` Junio C Hamano
@ 2012-05-24 13:55     ` Nguyễn Thái Ngọc Duy
  1 sibling, 0 replies; 16+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-24 13:55 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

When putting whole objects in core is unavoidable, try match object
type and size first before actually inflating.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 The bypassing has_sha1_object() and doing sha1_object_info() directly mainly to
 help avoid unnecessary collision tesst when user run this command on
 an in-repo pack. But I realized there's no sure way to check if the
 given pack (especiall with --stdin) is in repo. So I'll drop 5/6 and
 6/6. People just have to set GIT_DIR=/nowhere.

 builtin/index-pack.c   | 82 +++++++++++++++++++++++++++++++++++++++++++++++---
 t/t5300-pack-object.sh |  5 +++
 2 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 9129299..8b5c1eb 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -9,6 +9,7 @@
 #include "progress.h"
 #include "fsck.h"
 #include "exec_cmd.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char index_pack_usage[] =
@@ -621,31 +622,102 @@ static void find_delta_children(const union delta_base *base,
 	*last_index = last;
 }
 
+struct compare_data {
+	struct object_entry *entry;
+	struct git_istream *st;
+	unsigned char *buf;
+	unsigned long buf_size;
+};
+
+static int compare_objects(const unsigned char *buf, unsigned long size,
+			   void *cb_data)
+{
+	struct compare_data *data = cb_data;
+
+	if (data->buf_size < size) {
+		free(data->buf);
+		data->buf = xmalloc(size);
+		data->buf_size = size;
+	}
+
+	while (size) {
+		ssize_t len = read_istream(data->st, data->buf, size);
+		if (len == 0)
+			die(_("SHA1 COLLISION FOUND WITH %s !"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		if (len < 0)
+			die(_("unable to read %s"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		if (memcmp(buf, data->buf, len))
+			die(_("SHA1 COLLISION FOUND WITH %s !"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		size -= len;
+		buf += len;
+	}
+	return 0;
+}
+
+static int check_collison(struct object_entry *entry)
+{
+	struct compare_data data;
+	enum object_type type;
+	unsigned long size;
+
+	if (entry->size <= big_file_threshold || entry->type != OBJ_BLOB)
+		return -1;
+
+	memset(&data, 0, sizeof(data));
+	data.entry = entry;
+	data.st = open_istream(entry->idx.sha1, &type, &size, NULL);
+	if (!data.st)
+		return -1;
+	if (size != entry->size || type != entry->type)
+		die(_("SHA1 COLLISION FOUND WITH %s !"),
+		    sha1_to_hex(entry->idx.sha1));
+	unpack_data(entry, compare_objects, &data);
+	close_istream(data.st);
+	free(data.buf);
+	return 0;
+}
+
 static void sha1_object(const void *data, struct object_entry *obj_entry,
 			unsigned long size, enum object_type type,
 			const unsigned char *sha1)
 {
 	void *new_data = NULL;
+	int collision_test_needed;
 
 	assert(data || obj_entry);
 
 	read_lock();
-	if (has_sha1_file(sha1)) {
+	collision_test_needed = has_sha1_file(sha1);
+	read_unlock();
+
+	if (collision_test_needed && !data) {
+		read_lock();
+		if (!check_collison(obj_entry))
+			collision_test_needed = 0;
+		read_unlock();
+	}
+	if (collision_test_needed) {
 		void *has_data;
 		enum object_type has_type;
 		unsigned long has_size;
-		if (!data)
-			data = new_data = get_data_from_pack(obj_entry);
+		read_lock();
+		has_type = sha1_object_info(sha1, &has_size);
+		if (has_type != type || has_size != size)
+			die(_("SHA1 COLLISION FOUND WITH %s !"), sha1_to_hex(sha1));
 		has_data = read_sha1_file(sha1, &has_type, &has_size);
 		read_unlock();
+		if (!data)
+			data = new_data = get_data_from_pack(obj_entry);
 		if (!has_data)
 			die(_("cannot read existing object %s"), sha1_to_hex(sha1));
 		if (size != has_size || type != has_type ||
 		    memcmp(data, has_data, size) != 0)
 			die(_("SHA1 COLLISION FOUND WITH %s !"), sha1_to_hex(sha1));
 		free(has_data);
-	} else
-		read_unlock();
+	}
 
 	if (strict) {
 		read_lock();
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index d9d856b..300ed91 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -418,4 +418,9 @@ test_expect_success \
     'test_must_fail git index-pack -o bad.idx test-3.pack 2>msg &&
      grep "SHA1 COLLISION FOUND" msg'
 
+test_expect_success \
+    'make sure index-pack detects the SHA1 collision (large blobs)' \
+    'test_must_fail git -c core.bigfilethreshold=1 index-pack -o bad.idx test-3.pack 2>msg &&
+     grep "SHA1 COLLISION FOUND" msg'
+
 test_done
-- 
1.7.10.2.549.g9354186

^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2012-05-24 13:59 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-16 12:50 [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Nguyễn Thái Ngọc Duy
2012-05-16 12:50 ` [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
2012-05-18 22:20   ` Junio C Hamano
2012-05-19  5:31     ` Nguyen Thai Ngoc Duy
2012-05-18 22:15 ` [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Junio C Hamano
2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 2/6] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 3/6] index-pack: factor out unpack core from get_data_from_pack Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs Nguyễn Thái Ngọc Duy
2012-05-23 16:03     ` Junio C Hamano
2012-05-24 13:55     ` [PATCH v2.1 " Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 5/6] index-pack: avoid collision test when verifying in-repo pack Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects Nguyễn Thái Ngọc Duy
2012-05-23 14:24     ` Nguyen Thai Ngoc Duy
2012-05-23 16:01       ` Junio C Hamano
2012-05-24 13:12         ` Nguyen Thai Ngoc Duy

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).