git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Jeff King <peff@peff.net>
To: Nguyen Thai Ngoc Duy <pclouds@gmail.com>
Cc: David Michael Barr <b@rr-dav.id.au>,
	Git Mailing List <git@vger.kernel.org>
Subject: Re: [RFC] pack-objects: compression level for non-blobs
Date: Sat, 29 Dec 2012 04:48:17 -0500	[thread overview]
Message-ID: <20121229094816.GA23396@sigill.intra.peff.net> (raw)
In-Reply-To: <20121229090558.GA31291@sigill.intra.peff.net>

On Sat, Dec 29, 2012 at 04:05:58AM -0500, Jeff King wrote:

> On Sat, Dec 29, 2012 at 12:27:47AM -0500, Jeff King wrote:
> 
> > > I think I tried the partial decompression for commit header and it did
> > > not help much (or I misremember it, not so sure).
> > 
> > I'll see if I can dig up the reference, as it was something I was going
> > to look at next.
> 
> I tried the simple patch below, but it actually made things slower!  I'm
> assuming it is because the streaming setup is not micro-optimized very
> well. A custom read_sha1_until_blank_line() could probably do better.

Something like the patch below, which does speed things up. But not
nearly as much as I'd hoped:

  [before]
  $ best-of-five git rev-list --count --all
  real    0m4.197s
  user    0m4.112s
  sys     0m0.072s

  [after]
  $ best-of-five git rev-list --count --all
  real    0m3.782s
  user    0m3.708s
  sys     0m0.064s

Only about a 10% speedup (versus ~75% with uncompressed commits).

---
diff --git a/cache.h b/cache.h
index 18fdd18..a494d3b 100644
--- a/cache.h
+++ b/cache.h
@@ -724,6 +724,7 @@ int offset_1st_component(const char *path);
 
 /* object replacement */
 #define READ_SHA1_FILE_REPLACE 1
+#define READ_SHA1_FILE_HEADER 2
 extern void *read_sha1_file_extended(const unsigned char *sha1, enum object_type *type, unsigned long *size, unsigned flag);
 static inline void *read_sha1_file(const unsigned char *sha1, enum object_type *type, unsigned long *size)
 {
@@ -1059,7 +1060,7 @@ extern int is_pack_valid(struct packed_git *);
 extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t);
 extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
 extern int is_pack_valid(struct packed_git *);
-extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
+extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *, int);
 extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
 extern unsigned long get_size_from_delta(struct packed_git *, struct pack_window **, off_t);
 extern int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, unsigned long *);
diff --git a/commit.c b/commit.c
index e8eb0ae..0a088dc 100644
--- a/commit.c
+++ b/commit.c
@@ -312,12 +312,13 @@ int parse_commit(struct commit *item)
 	void *buffer;
 	unsigned long size;
 	int ret;
+	int flags = save_commit_buffer ? 0 : READ_SHA1_FILE_HEADER;
 
 	if (!item)
 		return -1;
 	if (item->object.parsed)
 		return 0;
-	buffer = read_sha1_file(item->object.sha1, &type, &size);
+	buffer = read_sha1_file_extended(item->object.sha1, &type, &size, flags);
 	if (!buffer)
 		return error("Could not read %s",
 			     sha1_to_hex(item->object.sha1));
diff --git a/fast-import.c b/fast-import.c
index c2a814e..a140d57 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -1303,7 +1303,7 @@ static void *gfi_unpack_entry(
 		 */
 		p->pack_size = pack_size + 20;
 	}
-	return unpack_entry(p, oe->idx.offset, &type, sizep);
+	return unpack_entry(p, oe->idx.offset, &type, sizep, 0);
 }
 
 static const char *get_mode(const char *str, uint16_t *modep)
diff --git a/pack-check.c b/pack-check.c
index 63a595c..e4a43c0 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -116,7 +116,7 @@ static int verify_packfile(struct packed_git *p,
 					    sha1_to_hex(entries[i].sha1),
 					    p->pack_name, (uintmax_t)offset);
 		}
-		data = unpack_entry(p, entries[i].offset, &type, &size);
+		data = unpack_entry(p, entries[i].offset, &type, &size, 0);
 		if (!data)
 			err = error("cannot unpack %s from %s at offset %"PRIuMAX"",
 				    sha1_to_hex(entries[i].sha1), p->pack_name,
diff --git a/sha1_file.c b/sha1_file.c
index 40b2329..1f1f31a 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1469,16 +1469,19 @@ static void *unpack_sha1_file(void *map, unsigned long mapsize, enum object_type
 	return *hdr ? -1 : type_from_string(type);
 }
 
-static void *unpack_sha1_file(void *map, unsigned long mapsize, enum object_type *type, unsigned long *size, const unsigned char *sha1)
+static void *unpack_sha1_file(void *map, unsigned long mapsize, enum object_type *type, unsigned long *size, const unsigned char *sha1, int stop_at_blank)
 {
 	int ret;
 	git_zstream stream;
-	char hdr[8192];
+	char hdr[512];
 
 	ret = unpack_sha1_header(&stream, map, mapsize, hdr, sizeof(hdr));
 	if (ret < Z_OK || (*type = parse_sha1_header(hdr, size)) < 0)
 		return NULL;
-
+	if (stop_at_blank && strstr(hdr, "\n\n")) {
+		*size = strlen(hdr);
+		return xstrdup(hdr);
+	}
 	return unpack_sha1_rest(&stream, hdr, *size, sha1);
 }
 
@@ -1667,8 +1670,11 @@ static void *unpack_compressed_entry(struct packed_git *p,
 static void *unpack_compressed_entry(struct packed_git *p,
 				    struct pack_window **w_curs,
 				    off_t curpos,
-				    unsigned long size)
+				    unsigned long *sizep,
+				    int stop_at_blank)
 {
+	static const int chunk_size = 256;
+	unsigned long size = *sizep;
 	int st;
 	git_zstream stream;
 	unsigned char *buffer, *in;
@@ -1676,15 +1682,27 @@ static void *unpack_compressed_entry(struct packed_git *p,
 	buffer = xmallocz(size);
 	memset(&stream, 0, sizeof(stream));
 	stream.next_out = buffer;
-	stream.avail_out = size + 1;
+
+	if (stop_at_blank)
+		stream.avail_out = chunk_size;
+	else
+		stream.avail_out = size + 1;
 
 	git_inflate_init(&stream);
 	do {
 		in = use_pack(p, w_curs, curpos, &stream.avail_in);
 		stream.next_in = in;
 		st = git_inflate(&stream, Z_FINISH);
-		if (!stream.avail_out)
-			break; /* the payload is larger than it should be */
+		if (!stream.avail_out) {
+			if (!stop_at_blank)
+				break; /* the payload is larger than it should be */
+			if (memmem(buffer, chunk_size, "\n\n", 2)) {
+				git_inflate_end(&stream);
+				*sizep = chunk_size;
+				return buffer;
+			}
+			stream.avail_out = size + 1 - chunk_size;
+		}
 		curpos += stream.next_in - in;
 	} while (st == Z_OK || st == Z_BUF_ERROR);
 	git_inflate_end(&stream);
@@ -1731,7 +1749,8 @@ static void *cache_or_unpack_entry(struct packed_git *p, off_t base_offset,
 }
 
 static void *cache_or_unpack_entry(struct packed_git *p, off_t base_offset,
-	unsigned long *base_size, enum object_type *type, int keep_cache)
+	unsigned long *base_size, enum object_type *type, int keep_cache,
+	int stop_at_blank)
 {
 	void *ret;
 	unsigned long hash = pack_entry_hash(p, base_offset);
@@ -1739,9 +1758,9 @@ static void *cache_or_unpack_entry(struct packed_git *p, off_t base_offset,
 
 	ret = ent->data;
 	if (!ret || ent->p != p || ent->base_offset != base_offset)
-		return unpack_entry(p, base_offset, type, base_size);
+		return unpack_entry(p, base_offset, type, base_size, stop_at_blank);
 
-	if (!keep_cache) {
+	if (!stop_at_blank && !keep_cache) {
 		ent->data = NULL;
 		ent->lru.next->prev = ent->lru.prev;
 		ent->lru.prev->next = ent->lru.next;
@@ -1810,7 +1829,7 @@ static void *read_object(const unsigned char *sha1, enum object_type *type,
 }
 
 static void *read_object(const unsigned char *sha1, enum object_type *type,
-			 unsigned long *size);
+			 unsigned long *size, int stop_at_blank);
 
 static void *unpack_delta_entry(struct packed_git *p,
 				struct pack_window **w_curs,
@@ -1832,7 +1851,7 @@ static void *unpack_delta_entry(struct packed_git *p,
 		return NULL;
 	}
 	unuse_pack(w_curs);
-	base = cache_or_unpack_entry(p, base_offset, &base_size, type, 0);
+	base = cache_or_unpack_entry(p, base_offset, &base_size, type, 0, 0);
 	if (!base) {
 		/*
 		 * We're probably in deep shit, but let's try to fetch
@@ -1851,12 +1870,12 @@ static void *unpack_delta_entry(struct packed_git *p,
 		      sha1_to_hex(base_sha1), (uintmax_t)base_offset,
 		      p->pack_name);
 		mark_bad_packed_object(p, base_sha1);
-		base = read_object(base_sha1, type, &base_size);
+		base = read_object(base_sha1, type, &base_size, 0);
 		if (!base)
 			return NULL;
 	}
 
-	delta_data = unpack_compressed_entry(p, w_curs, curpos, delta_size);
+	delta_data = unpack_compressed_entry(p, w_curs, curpos, &delta_size, 0);
 	if (!delta_data) {
 		error("failed to unpack compressed delta "
 		      "at offset %"PRIuMAX" from %s",
@@ -1895,7 +1914,8 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 int do_check_packed_object_crc;
 
 void *unpack_entry(struct packed_git *p, off_t obj_offset,
-		   enum object_type *type, unsigned long *sizep)
+		   enum object_type *type, unsigned long *sizep,
+		   int stop_at_blank)
 {
 	struct pack_window *w_curs = NULL;
 	off_t curpos = obj_offset;
@@ -1929,7 +1949,8 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 	case OBJ_TREE:
 	case OBJ_BLOB:
 	case OBJ_TAG:
-		data = unpack_compressed_entry(p, &w_curs, curpos, *sizep);
+		data = unpack_compressed_entry(p, &w_curs, curpos, sizep,
+					       stop_at_blank);
 		break;
 	default:
 		data = NULL;
@@ -2208,14 +2229,15 @@ static void *read_packed_sha1(const unsigned char *sha1,
 }
 
 static void *read_packed_sha1(const unsigned char *sha1,
-			      enum object_type *type, unsigned long *size)
+			      enum object_type *type, unsigned long *size,
+			      int stop_at_blank)
 {
 	struct pack_entry e;
 	void *data;
 
 	if (!find_pack_entry(sha1, &e))
 		return NULL;
-	data = cache_or_unpack_entry(e.p, e.offset, size, type, 1);
+	data = cache_or_unpack_entry(e.p, e.offset, size, type, 1, stop_at_blank);
 	if (!data) {
 		/*
 		 * We're probably in deep shit, but let's try to fetch
@@ -2226,7 +2248,7 @@ static void *read_packed_sha1(const unsigned char *sha1,
 		error("failed to read object %s at offset %"PRIuMAX" from %s",
 		      sha1_to_hex(sha1), (uintmax_t)e.offset, e.p->pack_name);
 		mark_bad_packed_object(e.p, sha1);
-		data = read_object(sha1, type, size);
+		data = read_object(sha1, type, size, stop_at_blank);
 	}
 	return data;
 }
@@ -2255,7 +2277,7 @@ static void *read_object(const unsigned char *sha1, enum object_type *type,
 }
 
 static void *read_object(const unsigned char *sha1, enum object_type *type,
-			 unsigned long *size)
+			 unsigned long *size, int stop_at_blank)
 {
 	unsigned long mapsize;
 	void *map, *buf;
@@ -2268,17 +2290,18 @@ static void *read_object(const unsigned char *sha1, enum object_type *type,
 		return xmemdupz(co->buf, co->size);
 	}
 
-	buf = read_packed_sha1(sha1, type, size);
+	buf = read_packed_sha1(sha1, type, size, stop_at_blank);
 	if (buf)
 		return buf;
 	map = map_sha1_file(sha1, &mapsize);
 	if (map) {
-		buf = unpack_sha1_file(map, mapsize, type, size, sha1);
+		buf = unpack_sha1_file(map, mapsize, type, size, sha1,
+				       stop_at_blank);
 		munmap(map, mapsize);
 		return buf;
 	}
 	reprepare_packed_git();
-	return read_packed_sha1(sha1, type, size);
+	return read_packed_sha1(sha1, type, size, stop_at_blank);
 }
 
 /*
@@ -2296,9 +2319,10 @@ void *read_sha1_file_extended(const unsigned char *sha1,
 	const struct packed_git *p;
 	const unsigned char *repl = (flag & READ_SHA1_FILE_REPLACE)
 		? lookup_replace_object(sha1) : sha1;
+	int stop_at_blank = !!(flag & READ_SHA1_FILE_HEADER);
 
 	errno = 0;
-	data = read_object(repl, type, size);
+	data = read_object(repl, type, size, stop_at_blank);
 	if (data)
 		return data;
 
@@ -2597,7 +2621,7 @@ int force_object_loose(const unsigned char *sha1, time_t mtime)
 
 	if (has_loose_object(sha1))
 		return 0;
-	buf = read_packed_sha1(sha1, &type, &len);
+	buf = read_packed_sha1(sha1, &type, &len, 0);
 	if (!buf)
 		return error("cannot read sha1_file for %s", sha1_to_hex(sha1));
 	hdrlen = sprintf(hdr, "%s %lu", typename(type), len) + 1;

  reply	other threads:[~2012-12-29 10:10 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-11-26  6:25 [RFC] pack-objects: compression level for non-blobs David Michael Barr
2012-11-26 12:35 ` David Michael Barr
2012-12-29  0:41 ` Jeff King
2012-12-29  4:34   ` Nguyen Thai Ngoc Duy
2012-12-29  5:07     ` Jeff King
2012-12-29  5:25       ` Nguyen Thai Ngoc Duy
2012-12-29  5:27         ` Jeff King
2012-12-29  9:05           ` Jeff King
2012-12-29  9:48             ` Jeff King [this message]
2012-12-30 12:05           ` Jeff King
2012-12-30 12:53             ` Nguyen Thai Ngoc Duy
2012-12-30 21:31               ` Jeff King
2012-12-31 18:06                 ` Shawn Pearce
2013-01-01  4:15                   ` Duy Nguyen
2013-01-01 12:10                     ` Duy Nguyen
2013-01-01 17:17                       ` Shawn Pearce
2013-01-01 23:47                         ` Junio C Hamano
2013-01-02  2:23                         ` Duy Nguyen
2013-01-01 20:02                       ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20121229094816.GA23396@sigill.intra.peff.net \
    --to=peff@peff.net \
    --cc=b@rr-dav.id.au \
    --cc=git@vger.kernel.org \
    --cc=pclouds@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).