git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Linus Torvalds <torvalds@osdl.org>
To: Johannes Schindelin <Johannes.Schindelin@gmx.de>
Cc: Carl Baldwin <cnb@fc.hp.com>, Junio C Hamano <junkio@cox.net>,
	Git Mailing List <git@vger.kernel.org>
Subject: Re: [PATCH 2/3] sha1_file: add the ability to parse objects in "pack file format"
Date: Tue, 11 Jul 2006 12:48:08 -0700 (PDT)	[thread overview]
Message-ID: <Pine.LNX.4.64.0607111241460.5623@g5.osdl.org> (raw)
In-Reply-To: <Pine.LNX.4.63.0607112116270.29667@wbgn013.biozentrum.uni-wuerzburg.de>



On Tue, 11 Jul 2006, Johannes Schindelin wrote:
> 
> > Or, more likely, the parse_sha1_header() function should just be changed 
> > to check the binary format first (and then add your comment about why that 
> > is safe).
> 
> Yes, exactly.

Here's a newer verson of [2/3], with these issues fixed. It actually fixes 
things twice: (a) by parsing the binary version first (which makes sense 
for a totally independent reason - if that is going to be the "default" 
version in the long run, we should just test it first anyway) and (b) by 
making the ASCII version parser stricter too.

This did, btw, also fix the test failure, so the fact that the ASCII 
header parser wasn't careful enough was actually a problem in real life.

So please throw away the old version.

		Linus

---
From: Linus Torvalds <torvalds@osdl.org>
Subject: [PATCH 2/3] sha1_file: add the ability to parse objects in "pack file format"

The pack-file format is slightly different from the traditional git
object format, in that it has a much denser binary header encoding.

The traditional format uses an ASCII string with type and length
information, which is somewhat wasteful.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 sha1_file.c |   93 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/sha1_file.c b/sha1_file.c
index 8734d50..15ccf5e 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -697,9 +697,9 @@ static int unpack_sha1_header(z_stream *
 	return inflate(stream, 0);
 }
 
-static void *unpack_sha1_rest(z_stream *stream, void *buffer, unsigned long size)
+static void *unpack_sha1_rest(z_stream *stream, void *buffer, unsigned long size, unsigned int hdrlen)
 {
-	int bytes = strlen(buffer) + 1;
+	int bytes = hdrlen;
 	unsigned char *buf = xmalloc(1+size);
 
 	memcpy(buf, (char *) buffer + bytes, stream->total_out - bytes);
@@ -720,25 +720,40 @@ static void *unpack_sha1_rest(z_stream *
  * too permissive for what we want to check. So do an anal
  * object header parse by hand.
  */
-static int parse_sha1_header(char *hdr, char *type, unsigned long *sizep)
+static int parse_ascii_sha1_header(char *hdr, char *type, unsigned long *sizep)
 {
-	int i;
+	int bytes = 0;
 	unsigned long size;
 
 	/*
 	 * The type can be at most ten bytes (including the 
 	 * terminating '\0' that we add), and is followed by
 	 * a space. 
+	 *
+	 * We want at least three characters, and they should
+	 * all be normal lower-case letters.
 	 */
-	i = 10;
 	for (;;) {
-		char c = *hdr++;
+		unsigned char c = *hdr++;
+		bytes++;
 		if (c == ' ')
 			break;
-		if (!--i)
+		if (bytes >= 10)
 			return -1;
 		*type++ = c;
+
+		/*
+		 * The high nybble must be 6 of 7, see
+		 * parse_binary_header(). This covers
+		 * all ASCII lowercase characters.
+		 */
+		if (c < 0x60 || c > 0x7f)
+			return -1;
 	}
+
+	/* Minimum three letters and the space */
+	if (bytes < 4)
+		return -1;
 	*type = 0;
 
 	/*
@@ -746,6 +761,7 @@ static int parse_sha1_header(char *hdr, 
 	 * decimal format (ie "010" is not valid).
 	 */
 	size = *hdr++ - '0';
+	bytes++;
 	if (size > 9)
 		return -1;
 	if (size) {
@@ -754,6 +770,7 @@ static int parse_sha1_header(char *hdr, 
 			if (c > 9)
 				break;
 			hdr++;
+			bytes++;
 			size = size * 10 + c;
 		}
 	}
@@ -762,20 +779,74 @@ static int parse_sha1_header(char *hdr, 
 	/*
 	 * The length must be followed by a zero byte
 	 */
-	return *hdr ? -1 : 0;
+	bytes++;
+	return *hdr ? -1 : bytes;
+}
+
+/*
+ * We never confuse a binary header with an old ASCII one,
+ * because the ASCII one will always start with a lower-case
+ * letter, meaning that the first byte will be of the form
+ * 0x6? or 0x7?.
+ *
+ * That in turn would be parsed as object type 6 or 7, neither
+ * of which is valid for a unpacked object (object type 7 is
+ * a delta, and can only exist in a pack-file, while object type
+ * 6 is invalid).
+ */
+static int parse_binary_sha1_header(char *hdr, char *type, unsigned long *sizep)
+{
+	unsigned char c;
+	int bytes = 1;
+	unsigned long size;
+	unsigned object_type, bits;
+	static const char *typename[8] = {
+		NULL,	/* OBJ_EXT */
+		"commit", "tree", "blob", "tag",
+		NULL, NULL, NULL
+	};
+
+	c = *hdr++;
+	object_type = (c >> 4) & 7;
+	if (!typename[object_type])
+		return -1;
+	strcpy(type, typename[object_type]);
+	size = c & 15;
+	bits = 4;
+	while (!(c & 0x80)) {
+		if (bits >= 8*sizeof(unsigned long))
+			return -1;
+		c = *hdr++;
+		size += (unsigned long) (c & 0x7f) << bits;
+		bytes++;
+		bits += 7;
+	}
+	*sizep = size;
+	return bytes;
+}
+
+static int parse_sha1_header(char *hdr, char *type, unsigned long *sizep)
+{
+	int retval = parse_binary_sha1_header(hdr, type, sizep);
+	if (retval < 0)
+		retval = parse_ascii_sha1_header(hdr, type, sizep);
+	return retval;
 }
 
 void * unpack_sha1_file(void *map, unsigned long mapsize, char *type, unsigned long *size)
 {
-	int ret;
+	int ret, hdrlen;
 	z_stream stream;
 	char hdr[8192];
 
 	ret = unpack_sha1_header(&stream, map, mapsize, hdr, sizeof(hdr));
-	if (ret < Z_OK || parse_sha1_header(hdr, type, size) < 0)
+	if (ret < Z_OK)
+		return NULL;
+	hdrlen = parse_sha1_header(hdr, type, size);
+	if (hdrlen < 0)
 		return NULL;
 
-	return unpack_sha1_rest(&stream, hdr, *size);
+	return unpack_sha1_rest(&stream, hdr, *size, hdrlen);
 }
 
 /* forward declaration for a mutually recursive function */

  reply	other threads:[~2006-07-11 19:54 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-07-10 23:01 Revisiting large binary files issue Carl Baldwin
2006-07-10 23:14 ` Junio C Hamano
2006-07-11  6:20   ` Peter Baumann
2006-07-10 23:28 ` Linus Torvalds
2006-07-11  9:40   ` [RFC]: Pack-file object format for individual objects (Was: Revisiting large binary files issue.) sf
2006-07-11 18:00     ` Linus Torvalds
2006-07-11 21:45       ` sf
2006-07-11 22:17         ` Linus Torvalds
2006-07-11 22:26           ` Linus Torvalds
2006-07-11 14:55   ` Revisiting large binary files issue Carl Baldwin
2006-07-11 17:09     ` Linus Torvalds
2006-07-11 17:10       ` [PATCH 1/3] Make the unpacked object header functions static to sha1_file.c Linus Torvalds
2006-07-11 17:12       ` [PATCH 2/3] sha1_file: add the ability to parse objects in "pack file format" Linus Torvalds
2006-07-11 18:40         ` Johannes Schindelin
2006-07-11 18:58           ` Linus Torvalds
2006-07-11 19:20             ` Johannes Schindelin
2006-07-11 19:48               ` Linus Torvalds [this message]
2006-07-11 21:25                 ` Johannes Schindelin
2006-07-11 21:47                 ` Junio C Hamano
2006-07-11 21:24         ` sf
2006-07-11 22:09           ` Linus Torvalds
2006-07-11 22:25             ` sf
2006-07-11 23:03             ` Junio C Hamano
2006-07-12  0:03               ` Linus Torvalds
2006-07-12  0:39                 ` Johannes Schindelin
2006-07-12  3:45                   ` Linus Torvalds
2006-07-12  4:31                     ` Linus Torvalds
2006-07-12  6:35                     ` Junio C Hamano
2006-07-12 16:29                       ` Linus Torvalds
2006-07-12  0:46                 ` Junio C Hamano
2006-07-12  3:42                   ` Linus Torvalds
2006-07-12  6:49                 ` Peter Baumann
2006-07-12  7:16                   ` Junio C Hamano
2006-07-12  8:28                     ` Peter Baumann
2006-07-12 15:13                   ` Linus Torvalds
2006-07-12 15:27                     ` Junio C Hamano
2006-07-11 17:16       ` [PATCH 3/3] Enable the new binary header format for unpacked objects Linus Torvalds

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Pine.LNX.4.64.0607111241460.5623@g5.osdl.org \
    --to=torvalds@osdl.org \
    --cc=Johannes.Schindelin@gmx.de \
    --cc=cnb@fc.hp.com \
    --cc=git@vger.kernel.org \
    --cc=junkio@cox.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).