* [RFC] pack-objects: compression level for non-blobs
@ 2012-11-26  6:25 David Michael Barr
  2012-11-26 12:35 ` David Michael Barr
  2012-12-29  0:41 ` Jeff King
  0 siblings, 2 replies; 19+ messages in thread
From: David Michael Barr @ 2012-11-26  6:25 UTC (permalink / raw)
  To: Git Mailing List; +Cc: David Michael Barr

Add config pack.graphcompression similar to pack.compression.
Applies to non-blob objects and, if unspecified, falls back to pack.compression.

We may identify objects compressed with level 0 by their leading bytes.
Use this to force recompression when the source and target levels mismatch.
Do this only when the config pack.graphcompression is set.

Signed-off-by: David Michael Barr <b@rr-dav.id.au>
---
 builtin/pack-objects.c | 49 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

 I started working on this just before taking a vacation,
 so it's been a little while coming.

 The intent is to allow selective recompression of pack data.
 For small objects/deltas the overhead of deflate is significant.
 This may improve read performance for the object graph.

 I ran some unscientific experiments with the chromium repository.
 With pack.graphcompression = 0, there was a 2.7% increase in pack size.
 I saw a 35% improvement with cold caches and 43% otherwise on git log --raw.
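
 For context on the "leading bytes" claim above: a zlib stream (git
 uses no preset dictionary) is a two-byte header (CMF, FLG) followed
 by deflate blocks, and the first deflate block byte carries BFINAL
 in bit 0 and BTYPE in bits 1-2, where BTYPE 00 is a stored block,
 the only kind deflate emits at level 0. A minimal sketch of such a
 check (illustrative only; check_pack_compressed in the patch below
 is the real version):

 	/* Sketch: does the zlib stream at z look deflate-compressed? */
 	static int looks_compressed(const unsigned char *z, unsigned long len)
 	{
 		if (len < 3)
 			return 0; /* too short to tell */
 		return (z[2] & 0x06) != 0; /* BTYPE of first block != 00 */
 	}

 Note this is only a heuristic: deflate at higher levels may still
 emit a stored first block for incompressible data.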

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f069462..9518daf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -40,6 +40,7 @@ struct object_entry {
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	unsigned int hash;	/* name hint hash */
 	enum object_type type;
+	enum object_type actual_type;
 	enum object_type in_pack_type;	/* could be delta */
 	unsigned char in_pack_header_size;
 	unsigned char preferred_base; /* we do not pack this, but is available
@@ -81,6 +82,8 @@ static int num_preferred_base;
 static struct progress *progress_state;
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
 static int pack_compression_seen;
+static int pack_graph_compression_level = Z_DEFAULT_COMPRESSION;
+static int pack_graph_compression_seen;
 
 static unsigned long delta_cache_size = 0;
 static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
@@ -125,14 +128,14 @@ static void *get_delta(struct object_entry *entry)
 	return delta_buf;
 }
 
-static unsigned long do_compress(void **pptr, unsigned long size)
+static unsigned long do_compress(void **pptr, unsigned long size, int level)
 {
 	git_zstream stream;
 	void *in, *out;
 	unsigned long maxsize;
 
 	memset(&stream, 0, sizeof(stream));
-	git_deflate_init(&stream, pack_compression_level);
+	git_deflate_init(&stream, level);
 	maxsize = git_deflate_bound(&stream, size);
 
 	in = *pptr;
@@ -191,6 +194,18 @@ static unsigned long write_large_blob_data(struct git_istream *st, struct sha1fi
 	return olen;
 }
 
+static int check_pack_compressed(struct packed_git *p,
+		struct pack_window **w_curs,
+		off_t offset)
+{
+	unsigned long avail;
+	int compressed = 0;
+	unsigned char *in = use_pack(p, w_curs, offset, &avail);
+	if (avail >= 3)
+		compressed = !!(in[2] & 0x6);
+	return compressed;
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -240,6 +255,8 @@ static void copy_pack_data(struct sha1file *f,
 	}
 }
 
+#define compression_level(type) ((type) && (type) != OBJ_BLOB ? pack_graph_compression_level : pack_compression_level)
+
 /* Return 0 if we will bust the pack-size limit */
 static unsigned long write_no_reuse_object(struct sha1file *f, struct object_entry *entry,
 					   unsigned long limit, int usable_delta)
@@ -286,7 +303,7 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	else if (entry->z_delta_size)
 		datalen = entry->z_delta_size;
 	else
-		datalen = do_compress(&buf, size);
+		datalen = do_compress(&buf, size, compression_level(entry->actual_type));
 
 	/*
 	 * The object header is a byte of 'type' followed by zero or
@@ -379,6 +396,13 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
 	offset += entry->in_pack_header_size;
 	datalen -= entry->in_pack_header_size;
 
+	if (!pack_to_stdout &&
+	    pack_graph_compression_seen &&
+	    check_pack_compressed(p, &w_curs, offset) != !!compression_level(entry->actual_type)) {
+		unuse_pack(&w_curs);
+		return write_no_reuse_object(f, entry, limit, usable_delta);
+	}
+
 	if (!pack_to_stdout && p->index_version == 1 &&
 	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
 		error("corrupt packed object for %s", sha1_to_hex(entry->idx.sha1));
@@ -955,6 +979,8 @@ static int add_object_entry(const unsigned char *sha1, enum object_type type,
 	memset(entry, 0, sizeof(*entry));
 	hashcpy(entry->idx.sha1, sha1);
 	entry->hash = hash;
+	if (pack_graph_compression_seen)
+		entry->actual_type = sha1_object_info(sha1, NULL);
 	if (type)
 		entry->type = type;
 	if (exclude)
@@ -1758,7 +1784,8 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 */
 		if (entry->delta_data && !pack_to_stdout) {
 			entry->z_delta_size = do_compress(&entry->delta_data,
-							  entry->delta_size);
+							  entry->delta_size,
+							  compression_level(entry->actual_type));
 			cache_lock();
 			delta_cache_size -= entry->delta_size;
 			delta_cache_size += entry->z_delta_size;
@@ -2159,6 +2186,16 @@ static int git_pack_config(const char *k, const char *v, void *cb)
 			    pack_idx_opts.version);
 		return 0;
 	}
+	if (!strcmp(k, "pack.graphcompression")) {
+		int level = git_config_int(k, v);
+		if (level == -1)
+			level = Z_DEFAULT_COMPRESSION;
+		else if (level < 0 || level > Z_BEST_COMPRESSION)
+			die("bad pack graph compression level %d", level);
+		pack_graph_compression_level = level;
+		pack_graph_compression_seen = 1;
+		return 0;
+	}
 	return git_default_config(k, v, cb);
 }
 
@@ -2519,6 +2556,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	argc = parse_options(argc, argv, prefix, pack_objects_options,
 			     pack_usage, 0);
 
+	/* Fall back after option parsing to catch --compression */
+	if (!pack_graph_compression_seen)
+		pack_graph_compression_level = pack_compression_level;
+
 	if (argc) {
 		base_name = argv[0];
 		argc--;
-- 
1.8.0


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-11-26  6:25 [RFC] pack-objects: compression level for non-blobs David Michael Barr
@ 2012-11-26 12:35 ` David Michael Barr
  2012-12-29  0:41 ` Jeff King
  1 sibling, 0 replies; 19+ messages in thread
From: David Michael Barr @ 2012-11-26 12:35 UTC (permalink / raw)
  To: Git Mailing List; +Cc: David Michael Barr

> Add config pack.graphcompression similar to pack.compression.
> Applies to non-blob objects and, if unspecified, falls back to pack.compression.
> 
> We may identify objects compressed with level 0 by their leading bytes.
> Use this to force recompression when the source and target levels mismatch.
> Do this only when the config pack.graphcompression is set.
> 
> Signed-off-by: David Michael Barr <b@rr-dav.id.au>
> ---
> builtin/pack-objects.c | 49 +++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 45 insertions(+), 4 deletions(-)
> 
> I started working on this just before taking a vacation,
> so it's been a little while coming.
> 
> The intent is to allow selective recompression of pack data.
> For small objects/deltas the overhead of deflate is significant.
> This may improve read performance for the object graph.
> 
> I ran some unscientific experiments with the chromium repository.
> With pack.graphcompression = 0, there was a 2.7% increase in pack size.
> I saw a 35% improvement with cold caches and 43% otherwise on git log --raw.

I neglected to mention that this is a WIP. I get failures with certain repositories: 

    fatal: delta size changed

--
David Michael Barr


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-11-26  6:25 [RFC] pack-objects: compression level for non-blobs David Michael Barr
  2012-11-26 12:35 ` David Michael Barr
@ 2012-12-29  0:41 ` Jeff King
  2012-12-29  4:34   ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 19+ messages in thread
From: Jeff King @ 2012-12-29  0:41 UTC (permalink / raw)
  To: David Michael Barr; +Cc: Git Mailing List

On Mon, Nov 26, 2012 at 05:25:54PM +1100, David Michael Barr wrote:

>  The intent is to allow selective recompression of pack data.
>  For small objects/deltas the overhead of deflate is significant.
>  This may improve read performance for the object graph.
> 
>  I ran some unscientific experiments with the chromium repository.
>  With pack.graphcompression = 0, there was a 2.7% increase in pack size.
>  I saw a 35% improvement with cold caches and 43% otherwise on git log --raw.

There wasn't much response to this, but those numbers are encouraging. I
was curious to replicate them, as well as to break it down by trees and
commits. I also wanted to test on more repositories, as well as on both
SSDs and spinning disks (for cold-cache numbers). Maybe that will catch
more people's interest.

As you mentioned in your follow-up, I ran into the "delta size changed"
problem. Not sure if it is related, but I noticed here:

> @@ -379,6 +396,13 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
>  	offset += entry->in_pack_header_size;
>  	datalen -= entry->in_pack_header_size;
>  
> +	if (!pack_to_stdout &&
> +	    pack_graph_compression_seen &&
> +	    check_pack_compressed(p, &w_curs, offset) != !!compression_level(entry->actual_type)) {
> +		unuse_pack(&w_curs);
> +		return write_no_reuse_object(f, entry, limit, usable_delta);
> +	}
> +

...that we seem to re-compress more than necessary. If I instrument that
block with a message to stderr and run "git repack -ad" repeatedly
without changing the config in between, runs after the first should
never re-compress, right? But they seem to. I'm not sure if your
check_pack_compressed heuristic is off or something else. It may or may
not be related to the "delta size changed" failure.

But we can leave this side a bit for a moment. Conceptually there are
two interesting things going on in your patch:

  1. Per-object-type compression levels

  2. Auto-recompression when levels change.

We can figure out (2) later. The meat of the idea is (1), and the patch
for that is much simpler. In fact, we can test it out with entirely
stock git by creating separate tree, commit, and blob packs, each with
different compression. So that's what I did for my timing, just to keep
things simple.

I timed git.git, linux-2.6.git, and WebKit.git. For each repo, I tested
it with four pack compression scenarios:

  1. all objects at -1 (zlib default)

  2. commits at 0, everything else at -1

  3. trees at 0, everything else at -1

  4. commits and trees at 0, everything else at -1

For each scenario, I timed "git rev-list --count --all" to traverse all
commits (which roughly models things like merge-base, ahead/behind
counts, etc), and then the same thing with "--objects" to traverse all
objects (which would roughly match what "git prune" or the "counting
objects" phase of packing would do). For each command, I timed both warm
and cold disk cache (the latter via "echo 3 >/proc/sys/vm/drop_caches").
Each timing is a best-of-five.  The timings were done on a machine with
an SSD (which probably matters for cold-cache; I have some spinning disk
numbers later).

Here are the git.git numbers:

 Pack  | Size          |  Cold Revs  |  Warm Revs  | Cold Objects | Warm Objects
-------+---------------+-------------+-------------+--------------+--------------
  none |  41.48        | 0.78        | 0.33        |  2.35        |  1.94       
commit |  49.34 (+18%) | 0.57 (-26%) | 0.09 (-74%) |  2.48  (+5%) |  1.70 (-12%)
  tree |  45.43  (+9%) | 0.80  (+3%) | 0.33   (0%) |  2.11  (-9%) |  1.74 (-10%)
  both |  53.31 (+28%) | 0.79  (+1%) | 0.08 (-75%) |  2.27  (-3%) |  1.49 (-23%)

The pack column specifies which scenario (i.e., what was left
uncompressed).  The size column is the size of the object-dir (in
megabytes). The other columns are times to run each command in
wall-clock seconds. Percentages are comparisons to the baseline "none"
case (i.e., the status quo).

So you can see that it's a big win for warm-cache pure-commit
traversals. As a sanity check, we can see that the tree-only case is not
helped at all there (because we do not look at trees at all). The
cold-cache case is helped, too, but that benefit goes away (and even
hurts slightly, but that is probably within the noise) when we also
leave the trees uncompressed.

The full-objects traversal doesn't fare quite as well, though there's
still some improvement. I think it argues for leaving both uncompressed,
as the warm case really benefits when both are uncompressed. You lose
the cold-cache benefits in the revs-only case, though.

Here are the numbers for linux-2.6.git:

 Pack  | Size          |  Cold Revs  |  Warm Revs  | Cold Objects | Warm Objects
-------+---------------+-------------+-------------+--------------+--------------
  none | 864.61        | 8.66        | 4.07        | 42.76        | 36.32       
commit | 970.46 (+12%) | 8.87  (+2%) | 1.02 (-74%) | 42.94   (0%) | 33.43  (-7%)
  tree | 895.37  (+3%) | 9.08  (+4%) | 4.07   (0%) | 36.01 (-15%) | 29.62 (-18%)
  both |1001.25 (+15%) | 8.90  (+2%) | 1.03 (-74%) | 35.57 (-16%) | 26.25 (-27%)

Similar warm-cache numbers, but the cold cache for the revs-only case is
hurt a little bit more.

And here's WebKit.git (sizes are in gigabytes this time):

 Pack  | Size          |  Cold Revs  |  Warm Revs  | Cold Objects | Warm Objects
-------+---------------+-------------+-------------+--------------+--------------
  none |   3.46        | 1.61        | 1.38        | 20.46        | 18.72       
commit |   3.54  (+2%) | 1.42 (-11%) | 0.34 (-75%) | 20.42   (0%) | 17.57  (-6%)
  tree |   3.59  (+3%) | 1.61   (0%) | 1.39   (0%) | 16.01 (-21%) | 14.00 (-25%)
  both |   3.67  (+6%) | 1.45 (-10%) | 0.34 (-75%) | 15.94 (-22%) | 12.91 (-31%)

Pretty similar again (slightly better on the full object traversal).

And finally, for comparison, here are the numbers from a (much slower)
machine that has spinning disks (albeit in a mirrored raid, which should
improve read times) on git.git:

 Pack  | Size          |  Cold Revs  |  Warm Revs  | Cold Objects | Warm Objects
-------+---------------+-------------+-------------+--------------+--------------
  none |  41.35        | 1.85        | 0.64        |  5.58        |  3.91       
commit |  49.23 (+19%) | 1.94  (+4%) | 0.14 (-77%) |  5.51  (-1%) |  3.40 (-12%)
  tree |  45.27  (+9%) | 1.78  (-3%) | 0.64   (0%) |  5.13  (-8%) |  3.53  (-9%)
  both |  53.16 (+28%) | 1.83  (-1%) | 0.14 (-77%) |  4.96 (-11%) |  3.32 (-14%)

Surprisingly, not all that different from the SSD times. Which may mean I
screwed something up. I'm happy to make my test harness available if
anybody else feels like timing on their repos or machines. But it does
point to potentially leaving commits uncompressed, and possibly trees.

I wonder if we could do even better, though. For a traversal, we only
need to look at the commit header. We could potentially do a progressive
inflate and stop before getting to the commit message (which is the bulk
of the data, and the part that is most likely to benefit from
compression).

-Peff


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-29  0:41 ` Jeff King
@ 2012-12-29  4:34   ` Nguyen Thai Ngoc Duy
  2012-12-29  5:07     ` Jeff King
  0 siblings, 1 reply; 19+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-12-29  4:34 UTC (permalink / raw)
  To: Jeff King; +Cc: David Michael Barr, Git Mailing List

On Sat, Dec 29, 2012 at 7:41 AM, Jeff King <peff@peff.net> wrote:
> I wonder if we could do even better, though. For a traversal, we only
> need to look at the commit header. We could potentially do a progressive
> inflate and stop before getting to the commit message (which is the bulk
> of the data, and the part that is most likely to benefit from
> compression).

A commit cache should solve this efficiently, as it also eliminates
parsing cost. We discussed this last time as a side topic of the
reachability bitmap feature.
-- 
Duy


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-29  4:34   ` Nguyen Thai Ngoc Duy
@ 2012-12-29  5:07     ` Jeff King
  2012-12-29  5:25       ` Nguyen Thai Ngoc Duy
  0 siblings, 1 reply; 19+ messages in thread
From: Jeff King @ 2012-12-29  5:07 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: David Michael Barr, Git Mailing List

On Sat, Dec 29, 2012 at 11:34:09AM +0700, Nguyen Thai Ngoc Duy wrote:

> On Sat, Dec 29, 2012 at 7:41 AM, Jeff King <peff@peff.net> wrote:
> > I wonder if we could do even better, though. For a traversal, we only
> > need to look at the commit header. We could potentially do a progressive
> > inflate and stop before getting to the commit message (which is the bulk
> > of the data, and the part that is most likely to benefit from
> > compression).
> 
> A commit cache should solve this efficiently, as it also eliminates
> parsing cost. We discussed this last time as a side topic of the
> reachability bitmap feature.

I agree that a commit cache would solve this (though it cannot help the
tree traversal). But just dropping the compression (or doing partial
decompression when we only care about the beginning part) is way less
code and complexity. There's no cache to manage.

-Peff


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-29  5:07     ` Jeff King
@ 2012-12-29  5:25       ` Nguyen Thai Ngoc Duy
  2012-12-29  5:27         ` Jeff King
  0 siblings, 1 reply; 19+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-12-29  5:25 UTC (permalink / raw)
  To: Jeff King; +Cc: David Michael Barr, Git Mailing List

On Sat, Dec 29, 2012 at 12:07 PM, Jeff King <peff@peff.net> wrote:
> On Sat, Dec 29, 2012 at 11:34:09AM +0700, Nguyen Thai Ngoc Duy wrote:
>
>> On Sat, Dec 29, 2012 at 7:41 AM, Jeff King <peff@peff.net> wrote:
>> > I wonder if we could do even better, though. For a traversal, we only
>> > need to look at the commit header. We could potentially do a progressive
>> > inflate and stop before getting to the commit message (which is the bulk
>> > of the data, and the part that is most likely to benefit from
>> > compression).
>>
>> A commit cache should solve this efficiently, as it also eliminates
>> parsing cost. We discussed this last time as a side topic of the
>> reachability bitmap feature.
>
> I agree that a commit cache would solve this (though it cannot help the
> tree traversal).

Yeah, caching trees efficiently is not easy.

> But just dropping the compression (or doing partial
> decompression when we only care about the beginning part) is way less
> code and complexity.

I think I tried the partial decompression for the commit header and it did
not help much (or I misremember it, not so sure).

> There's no cache to manage.

If the reachability bitmap is implemented, we'll have per-pack cache
infrastructure ready, so less work there for a commit cache.
-- 
Duy


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-29  5:25       ` Nguyen Thai Ngoc Duy
@ 2012-12-29  5:27         ` Jeff King
  2012-12-29  9:05           ` Jeff King
  2012-12-30 12:05           ` Jeff King
  0 siblings, 2 replies; 19+ messages in thread
From: Jeff King @ 2012-12-29  5:27 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: David Michael Barr, Git Mailing List

On Sat, Dec 29, 2012 at 12:25:04PM +0700, Nguyen Thai Ngoc Duy wrote:

> > But just dropping the compression (or doing partial
> > decompression when we only care about the beginning part) is way less
> > code and complexity.
> 
> I think I tried the partial decompression for the commit header and it did
> not help much (or I misremember it, not so sure).

I'll see if I can dig up the reference, as it was something I was going
to look at next.

> > There's no cache to manage.
> 
> If the reachability bitmap is implemented, we'll have per-pack cache
> infrastructure ready, so less work there for a commit cache.

True. I don't want to dissuade you from doing any commit cache work. I
only wanted to point out that this alternative may have merit because of
its simplicity (so we can use it until a caching solution exists, or
even after, if managing the cache has downsides).

-Peff


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-29  5:27         ` Jeff King
@ 2012-12-29  9:05           ` Jeff King
  2012-12-29  9:48             ` Jeff King
  2012-12-30 12:05           ` Jeff King
  1 sibling, 1 reply; 19+ messages in thread
From: Jeff King @ 2012-12-29  9:05 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: David Michael Barr, Git Mailing List

On Sat, Dec 29, 2012 at 12:27:47AM -0500, Jeff King wrote:

> > I think I tried the partial decompression for the commit header and it did
> > not help much (or I misremember it, not so sure).
> 
> I'll see if I can dig up the reference, as it was something I was going
> to look at next.

I tried the simple patch below, but it actually made things slower!  I'm
assuming it is because the streaming setup is not micro-optimized very
well. A custom read_sha1_until_blank_line() could probably do better.

diff --git a/commit.c b/commit.c
index e8eb0ae..efd6c06 100644
--- a/commit.c
+++ b/commit.c
@@ -8,6 +8,7 @@
 #include "notes.h"
 #include "gpg-interface.h"
 #include "mergesort.h"
+#include "streaming.h"
 
 static struct commit_extra_header *read_commit_extra_header_lines(const char *buf, size_t len, const char **);
 
@@ -306,6 +307,39 @@ int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long s
 	return 0;
 }
 
+static void *read_commit_header(const unsigned char *sha1,
+				enum object_type *type,
+				unsigned long *size)
+{
+	static const int chunk_size = 256;
+	struct strbuf buf = STRBUF_INIT;
+	struct git_istream *st;
+
+	st = open_istream(sha1, type, size, NULL);
+	if (!st)
+		return NULL;
+	while (1) {
+		size_t start = buf.len;
+		ssize_t readlen;
+
+		strbuf_grow(&buf, chunk_size);
+		readlen = read_istream(st, buf.buf + start, chunk_size);
+		if (readlen < 0) {
+			close_istream(st);
+			strbuf_release(&buf);
+			return NULL;
+		}
+		buf.buf[start + readlen] = '\0';
+		buf.len += readlen;
+
+		if (!readlen || strstr(buf.buf + start, "\n\n"))
+			break;
+	}
+
+	close_istream(st);
+	return strbuf_detach(&buf, size);
+}
+
 int parse_commit(struct commit *item)
 {
 	enum object_type type;
@@ -317,7 +351,11 @@ int parse_commit(struct commit *item)
 		return -1;
 	if (item->object.parsed)
 		return 0;
-	buffer = read_sha1_file(item->object.sha1, &type, &size);
+
+	if (!save_commit_buffer)
+		buffer = read_commit_header(item->object.sha1, &type, &size);
+	else
+		buffer = read_sha1_file(item->object.sha1, &type, &size);
 	if (!buffer)
 		return error("Could not read %s",
 			     sha1_to_hex(item->object.sha1));


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-29  9:05           ` Jeff King
@ 2012-12-29  9:48             ` Jeff King
  0 siblings, 0 replies; 19+ messages in thread
From: Jeff King @ 2012-12-29  9:48 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: David Michael Barr, Git Mailing List

On Sat, Dec 29, 2012 at 04:05:58AM -0500, Jeff King wrote:

> On Sat, Dec 29, 2012 at 12:27:47AM -0500, Jeff King wrote:
> 
> > > I think I tried the partial decompression for the commit header and it did
> > > not help much (or I misremember it, not so sure).
> > 
> > I'll see if I can dig up the reference, as it was something I was going
> > to look at next.
> 
> I tried the simple patch below, but it actually made things slower!  I'm
> assuming it is because the streaming setup is not micro-optimized very
> well. A custom read_sha1_until_blank_line() could probably do better.

Something like the patch below, which does speed things up. But not
nearly as much as I'd hoped:

  [before]
  $ best-of-five git rev-list --count --all
  real    0m4.197s
  user    0m4.112s
  sys     0m0.072s

  [after]
  $ best-of-five git rev-list --count --all
  real    0m3.782s
  user    0m3.708s
  sys     0m0.064s

Only about a 10% speedup (versus ~75% with uncompressed commits).

---
diff --git a/cache.h b/cache.h
index 18fdd18..a494d3b 100644
--- a/cache.h
+++ b/cache.h
@@ -724,6 +724,7 @@ int offset_1st_component(const char *path);
 
 /* object replacement */
 #define READ_SHA1_FILE_REPLACE 1
+#define READ_SHA1_FILE_HEADER 2
 extern void *read_sha1_file_extended(const unsigned char *sha1, enum object_type *type, unsigned long *size, unsigned flag);
 static inline void *read_sha1_file(const unsigned char *sha1, enum object_type *type, unsigned long *size)
 {
@@ -1059,7 +1060,7 @@ extern int is_pack_valid(struct packed_git *);
 extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t);
 extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
 extern int is_pack_valid(struct packed_git *);
-extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
+extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *, int);
 extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
 extern unsigned long get_size_from_delta(struct packed_git *, struct pack_window **, off_t);
 extern int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, unsigned long *);
diff --git a/commit.c b/commit.c
index e8eb0ae..0a088dc 100644
--- a/commit.c
+++ b/commit.c
@@ -312,12 +312,13 @@ int parse_commit(struct commit *item)
 	void *buffer;
 	unsigned long size;
 	int ret;
+	int flags = save_commit_buffer ? 0 : READ_SHA1_FILE_HEADER;
 
 	if (!item)
 		return -1;
 	if (item->object.parsed)
 		return 0;
-	buffer = read_sha1_file(item->object.sha1, &type, &size);
+	buffer = read_sha1_file_extended(item->object.sha1, &type, &size, flags);
 	if (!buffer)
 		return error("Could not read %s",
 			     sha1_to_hex(item->object.sha1));
diff --git a/fast-import.c b/fast-import.c
index c2a814e..a140d57 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -1303,7 +1303,7 @@ static void *gfi_unpack_entry(
 		 */
 		p->pack_size = pack_size + 20;
 	}
-	return unpack_entry(p, oe->idx.offset, &type, sizep);
+	return unpack_entry(p, oe->idx.offset, &type, sizep, 0);
 }
 
 static const char *get_mode(const char *str, uint16_t *modep)
diff --git a/pack-check.c b/pack-check.c
index 63a595c..e4a43c0 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -116,7 +116,7 @@ static int verify_packfile(struct packed_git *p,
 					    sha1_to_hex(entries[i].sha1),
 					    p->pack_name, (uintmax_t)offset);
 		}
-		data = unpack_entry(p, entries[i].offset, &type, &size);
+		data = unpack_entry(p, entries[i].offset, &type, &size, 0);
 		if (!data)
 			err = error("cannot unpack %s from %s at offset %"PRIuMAX"",
 				    sha1_to_hex(entries[i].sha1), p->pack_name,
diff --git a/sha1_file.c b/sha1_file.c
index 40b2329..1f1f31a 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1469,16 +1469,19 @@ static void *unpack_sha1_file(void *map, unsigned long mapsize, enum object_type
 	return *hdr ? -1 : type_from_string(type);
 }
 
-static void *unpack_sha1_file(void *map, unsigned long mapsize, enum object_type *type, unsigned long *size, const unsigned char *sha1)
+static void *unpack_sha1_file(void *map, unsigned long mapsize, enum object_type *type, unsigned long *size, const unsigned char *sha1, int stop_at_blank)
 {
 	int ret;
 	git_zstream stream;
-	char hdr[8192];
+	char hdr[512];
 
 	ret = unpack_sha1_header(&stream, map, mapsize, hdr, sizeof(hdr));
 	if (ret < Z_OK || (*type = parse_sha1_header(hdr, size)) < 0)
 		return NULL;
-
+	if (stop_at_blank && strstr(hdr, "\n\n")) {
+		*size = strlen(hdr);
+		return xstrdup(hdr);
+	}
 	return unpack_sha1_rest(&stream, hdr, *size, sha1);
 }
 
@@ -1667,8 +1670,11 @@ static void *unpack_compressed_entry(struct packed_git *p,
 static void *unpack_compressed_entry(struct packed_git *p,
 				    struct pack_window **w_curs,
 				    off_t curpos,
-				    unsigned long size)
+				    unsigned long *sizep,
+				    int stop_at_blank)
 {
+	static const int chunk_size = 256;
+	unsigned long size = *sizep;
 	int st;
 	git_zstream stream;
 	unsigned char *buffer, *in;
@@ -1676,15 +1682,27 @@ static void *unpack_compressed_entry(struct packed_git *p,
 	buffer = xmallocz(size);
 	memset(&stream, 0, sizeof(stream));
 	stream.next_out = buffer;
-	stream.avail_out = size + 1;
+
+	if (stop_at_blank)
+		stream.avail_out = chunk_size;
+	else
+		stream.avail_out = size + 1;
 
 	git_inflate_init(&stream);
 	do {
 		in = use_pack(p, w_curs, curpos, &stream.avail_in);
 		stream.next_in = in;
 		st = git_inflate(&stream, Z_FINISH);
-		if (!stream.avail_out)
-			break; /* the payload is larger than it should be */
+		if (!stream.avail_out) {
+			if (!stop_at_blank)
+				break; /* the payload is larger than it should be */
+			if (memmem(buffer, chunk_size, "\n\n", 2)) {
+				git_inflate_end(&stream);
+				*sizep = chunk_size;
+				return buffer;
+			}
+			stream.avail_out = size + 1 - chunk_size;
+		}
 		curpos += stream.next_in - in;
 	} while (st == Z_OK || st == Z_BUF_ERROR);
 	git_inflate_end(&stream);
@@ -1731,7 +1749,8 @@ static void *cache_or_unpack_entry(struct packed_git *p, off_t base_offset,
 }
 
 static void *cache_or_unpack_entry(struct packed_git *p, off_t base_offset,
-	unsigned long *base_size, enum object_type *type, int keep_cache)
+	unsigned long *base_size, enum object_type *type, int keep_cache,
+	int stop_at_blank)
 {
 	void *ret;
 	unsigned long hash = pack_entry_hash(p, base_offset);
@@ -1739,9 +1758,9 @@ static void *cache_or_unpack_entry(struct packed_git *p, off_t base_offset,
 
 	ret = ent->data;
 	if (!ret || ent->p != p || ent->base_offset != base_offset)
-		return unpack_entry(p, base_offset, type, base_size);
+		return unpack_entry(p, base_offset, type, base_size, stop_at_blank);
 
-	if (!keep_cache) {
+	if (!stop_at_blank && !keep_cache) {
 		ent->data = NULL;
 		ent->lru.next->prev = ent->lru.prev;
 		ent->lru.prev->next = ent->lru.next;
@@ -1810,7 +1829,7 @@ static void *read_object(const unsigned char *sha1, enum object_type *type,
 }
 
 static void *read_object(const unsigned char *sha1, enum object_type *type,
-			 unsigned long *size);
+			 unsigned long *size, int stop_at_blank);
 
 static void *unpack_delta_entry(struct packed_git *p,
 				struct pack_window **w_curs,
@@ -1832,7 +1851,7 @@ static void *unpack_delta_entry(struct packed_git *p,
 		return NULL;
 	}
 	unuse_pack(w_curs);
-	base = cache_or_unpack_entry(p, base_offset, &base_size, type, 0);
+	base = cache_or_unpack_entry(p, base_offset, &base_size, type, 0, 0);
 	if (!base) {
 		/*
 		 * We're probably in deep shit, but let's try to fetch
@@ -1851,12 +1870,12 @@ static void *unpack_delta_entry(struct packed_git *p,
 		      sha1_to_hex(base_sha1), (uintmax_t)base_offset,
 		      p->pack_name);
 		mark_bad_packed_object(p, base_sha1);
-		base = read_object(base_sha1, type, &base_size);
+		base = read_object(base_sha1, type, &base_size, 0);
 		if (!base)
 			return NULL;
 	}
 
-	delta_data = unpack_compressed_entry(p, w_curs, curpos, delta_size);
+	delta_data = unpack_compressed_entry(p, w_curs, curpos, &delta_size, 0);
 	if (!delta_data) {
 		error("failed to unpack compressed delta "
 		      "at offset %"PRIuMAX" from %s",
@@ -1895,7 +1914,8 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 int do_check_packed_object_crc;
 
 void *unpack_entry(struct packed_git *p, off_t obj_offset,
-		   enum object_type *type, unsigned long *sizep)
+		   enum object_type *type, unsigned long *sizep,
+		   int stop_at_blank)
 {
 	struct pack_window *w_curs = NULL;
 	off_t curpos = obj_offset;
@@ -1929,7 +1949,8 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 	case OBJ_TREE:
 	case OBJ_BLOB:
 	case OBJ_TAG:
-		data = unpack_compressed_entry(p, &w_curs, curpos, *sizep);
+		data = unpack_compressed_entry(p, &w_curs, curpos, sizep,
+					       stop_at_blank);
 		break;
 	default:
 		data = NULL;
@@ -2208,14 +2229,15 @@ static void *read_packed_sha1(const unsigned char *sha1,
 }
 
 static void *read_packed_sha1(const unsigned char *sha1,
-			      enum object_type *type, unsigned long *size)
+			      enum object_type *type, unsigned long *size,
+			      int stop_at_blank)
 {
 	struct pack_entry e;
 	void *data;
 
 	if (!find_pack_entry(sha1, &e))
 		return NULL;
-	data = cache_or_unpack_entry(e.p, e.offset, size, type, 1);
+	data = cache_or_unpack_entry(e.p, e.offset, size, type, 1, stop_at_blank);
 	if (!data) {
 		/*
 		 * We're probably in deep shit, but let's try to fetch
@@ -2226,7 +2248,7 @@ static void *read_packed_sha1(const unsigned char *sha1,
 		error("failed to read object %s at offset %"PRIuMAX" from %s",
 		      sha1_to_hex(sha1), (uintmax_t)e.offset, e.p->pack_name);
 		mark_bad_packed_object(e.p, sha1);
-		data = read_object(sha1, type, size);
+		data = read_object(sha1, type, size, stop_at_blank);
 	}
 	return data;
 }
@@ -2255,7 +2277,7 @@ static void *read_object(const unsigned char *sha1, enum object_type *type,
 }
 
 static void *read_object(const unsigned char *sha1, enum object_type *type,
-			 unsigned long *size)
+			 unsigned long *size, int stop_at_blank)
 {
 	unsigned long mapsize;
 	void *map, *buf;
@@ -2268,17 +2290,18 @@ static void *read_object(const unsigned char *sha1, enum object_type *type,
 		return xmemdupz(co->buf, co->size);
 	}
 
-	buf = read_packed_sha1(sha1, type, size);
+	buf = read_packed_sha1(sha1, type, size, stop_at_blank);
 	if (buf)
 		return buf;
 	map = map_sha1_file(sha1, &mapsize);
 	if (map) {
-		buf = unpack_sha1_file(map, mapsize, type, size, sha1);
+		buf = unpack_sha1_file(map, mapsize, type, size, sha1,
+				       stop_at_blank);
 		munmap(map, mapsize);
 		return buf;
 	}
 	reprepare_packed_git();
-	return read_packed_sha1(sha1, type, size);
+	return read_packed_sha1(sha1, type, size, stop_at_blank);
 }
 
 /*
@@ -2296,9 +2319,10 @@ void *read_sha1_file_extended(const unsigned char *sha1,
 	const struct packed_git *p;
 	const unsigned char *repl = (flag & READ_SHA1_FILE_REPLACE)
 		? lookup_replace_object(sha1) : sha1;
+	int stop_at_blank = !!(flag & READ_SHA1_FILE_HEADER);
 
 	errno = 0;
-	data = read_object(repl, type, size);
+	data = read_object(repl, type, size, stop_at_blank);
 	if (data)
 		return data;
 
@@ -2597,7 +2621,7 @@ int force_object_loose(const unsigned char *sha1, time_t mtime)
 
 	if (has_loose_object(sha1))
 		return 0;
-	buf = read_packed_sha1(sha1, &type, &len);
+	buf = read_packed_sha1(sha1, &type, &len, 0);
 	if (!buf)
 		return error("cannot read sha1_file for %s", sha1_to_hex(sha1));
 	hdrlen = sprintf(hdr, "%s %lu", typename(type), len) + 1;


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-29  5:27         ` Jeff King
  2012-12-29  9:05           ` Jeff King
@ 2012-12-30 12:05           ` Jeff King
  2012-12-30 12:53             ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 19+ messages in thread
From: Jeff King @ 2012-12-30 12:05 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: David Michael Barr, Git Mailing List

On Sat, Dec 29, 2012 at 12:27:47AM -0500, Jeff King wrote:

> > If the reachability bitmap is implemented, we'll have per-pack cache
> > infrastructure ready, so less work there for a commit cache.
> 
> True. I don't want to dissuade you from doing any commit cache work. I
> only wanted to point out that this alternative may have merit because of
> its simplicity (so we can use it until a caching solution exists, or
> even after, if managing the cache has downsides).

So I was thinking about this, which led to some coding, which led to
some benchmarking.

I want to clean up a few things in the code before I post it, but the
general idea is to have arbitrary per-pack cache files in the
objects/pack directory. Like this:

  $ cd objects/pack && ls
  pack-a3e262f40d95fc0cc97d92797ff9988551367b75.commits
  pack-a3e262f40d95fc0cc97d92797ff9988551367b75.idx
  pack-a3e262f40d95fc0cc97d92797ff9988551367b75.pack
  pack-a3e262f40d95fc0cc97d92797ff9988551367b75.parents
  pack-a3e262f40d95fc0cc97d92797ff9988551367b75.timestamps
  pack-a3e262f40d95fc0cc97d92797ff9988551367b75.trees

Each file describes the objects in the matching pack. If a new pack is
generated, you'd throw away the old cache files along with the old pack,
and generate new ones. Or not. These are totally optional, and an older
version of git will just ignore them. A newer version will use them if
they're available, and otherwise fall back to the existing code (i.e.,
reading the whole object from the pack). So you can generate them at
repack time, later on, or not at all. For now I have a separate command
that generates them based on the pack index; if this turns out to be a
good idea, it would probably get called as part of "repack".

Each file is a set of fixed-length records. The "commits" file contains
the sha1 of every commit in the pack (sorted). A binary search of the
mmap'd file gives the position of a particular commit within the list,
and that position is used to index the parents, timestamps, and trees
files (obviously if it is missing, then the other files are useless, but
we already have to be able to fall back to just reading the objects
anyway).
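
To make that lookup concrete, here is a minimal sketch (hypothetical
names, not the actual code; it assumes 20-byte raw sha1 records in
"commits" and, for the sake of example, 4-byte big-endian timestamps):

	#include <stdint.h>
	#include <string.h>

	/*
	 * Binary search the mmap'd, sorted "commits" file; the returned
	 * position indexes the parents/timestamps/trees files.
	 */
	static int commit_cache_pos(const unsigned char *commits,
				    uint32_t nr, const unsigned char *sha1)
	{
		uint32_t lo = 0, hi = nr;

		while (lo < hi) {
			uint32_t mi = lo + (hi - lo) / 2;
			int cmp = memcmp(sha1, commits + (size_t)mi * 20, 20);

			if (!cmp)
				return mi;
			if (cmp < 0)
				hi = mi;
			else
				lo = mi + 1;
		}
		return -1; /* miss: fall back to reading the object */
	}

	/* e.g. the timestamp for a commit found at position pos: */
	static uint32_t cached_timestamp(const unsigned char *ts, int pos)
	{
		const unsigned char *p = ts + (size_t)pos * 4;
		return (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
	}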

I split it out into multiple files because you can actually operate with
a subset (though in my initial attempt, I transparently plug in at the
parse_commit layer, which means we need all items to consider the commit
"parsed", whether the caller actually cares or not. But in theory a
reader could only want to ask for one item).  Making a "generation"
cache file is an obvious next step (and because we already have
"commits", it is only 4 bytes per commit on top of it). Reachability
bitmaps would be another one (though due to the compression, I am not
sure they will work with a fixed-size record design, so this may need
some modification).

Anyway, here are the numbers I came up with (appended to my earlier
compression numbers):

git.git:
 Pack  | Size          |  Cold Revs  |  Warm Revs  | Cold Objects | Warm Objects
-------+---------------+-------------+-------------+--------------+--------------
  none |  56.72        | 0.68        | 0.33        |  2.45        |  1.94       
commit |  64.61 (+13%) | 0.50 (-26%) | 0.09 (-74%) |  2.42  (-1%) |  1.69 (-13%)
  tree |  60.68  (+6%) | 0.79 (+16%) | 0.33   (0%) |  2.23  (-8%) |  1.75  (-9%)
  both |  68.54 (+20%) | 0.48 (-29%) | 0.08 (-75%) |  2.24  (-8%) |  1.48 (-23%)
 cache |  59.29  (+4%) | 0.57 (-16%) | 0.05 (-84%) |  2.23  (-8%) |  1.66 (-14%)

linux.git:
 Pack  | Size          |  Cold Revs  |  Warm Revs  | Cold Objects | Warm Objects
-------+---------------+-------------+-------------+--------------+--------------
  none | 864.61        | 8.66        | 4.07        | 42.76        | 36.32       
commit | 970.46 (+12%) | 8.87  (+2%) | 1.02 (-74%) | 42.94   (0%) | 33.43  (-7%)
  tree | 895.37  (+3%) | 9.08  (+4%) | 4.07   (0%) | 36.01 (-15%) | 29.62 (-18%)
  both |1001.25 (+15%) | 8.90  (+2%) | 1.03 (-74%) | 35.57 (-16%) | 26.25 (-27%)
 cache | 894.78  (+3%) | 4.88 (-43%) | 0.69 (-83%) | 38.80  (-9%) | 32.79  (-9%)

webkit.git:
 Pack  | Size          |  Cold Revs  |  Warm Revs  | Cold Objects | Warm Objects
-------+---------------+-------------+-------------+--------------+--------------
  none |   3.46        | 1.61        | 1.38        | 20.46        | 18.72       
commit |   3.54  (+2%) | 1.42 (-11%) | 0.34 (-75%) | 20.42   (0%) | 17.57  (-6%)
  tree |   3.59  (+3%) | 1.61   (0%) | 1.39   (0%) | 16.01 (-21%) | 14.00 (-25%)
  both |   3.67  (+6%) | 1.45 (-10%) | 0.34 (-75%) | 15.94 (-22%) | 12.91 (-31%)
 cache |   3.47   (0%) | 0.49 (-69%) | 0.14 (-90%) | 19.53  (-4%) | 17.86  (-4%)


So you can see that it performs even better than no-compression on the
warm-revs case. Which makes sense, since we do not even have to touch
the object data at all, and can do the whole traversal straight out of
the cache. So we do not even have to memcpy the bytes around. And it
takes up even less space (3-4% versus 12-13% on the first two repos).
Which makes sense, because even though we are duplicating some
information that is in the packfile, we are leaving all of the commit
message bodies compressed.

The other interesting thing is that the cold cache performance also
improves by a lot. Again, this makes sense; we are doing the traversal
completely out of cache, and our data is even more tightly packed in the
cache than it is in the packfile.

Of course, it does very little for the full --objects listing, where we
spend most of our time inflating trees. We could couple this with
uncompressed trees (which are not all that much bigger, since the sha1s
do not compress anyway). Or we could have an external tree cache, but
I'm not sure exactly what it would look like (this is basically
reinventing bits of packv4, but doing so in a way that is redundant with
the existing packfile, rather than replacing it). Or since the point of
--objects is usually reachability, it may make more sense to pursue the
bitmap, which should be even faster still.

-Peff


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-30 12:05           ` Jeff King
@ 2012-12-30 12:53             ` Nguyen Thai Ngoc Duy
  2012-12-30 21:31               ` Jeff King
  0 siblings, 1 reply; 19+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-12-30 12:53 UTC (permalink / raw)
  To: Jeff King; +Cc: David Michael Barr, Git Mailing List

On Sun, Dec 30, 2012 at 7:05 PM, Jeff King <peff@peff.net> wrote:
> So I was thinking about this, which led to some coding, which led to
> some benchmarking.

I like your way of thinking! May I suggest you take a new year break
first, then "think" about reachability bitmaps ;-) 2013 will be an
exciting year.

> I want to clean up a few things in the code before I post it, but the
> general idea is to have arbitrary per-pack cache files in the
> objects/pack directory. Like this:
>
>   $ cd objects/pack && ls
>   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.commits
>   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.idx
>   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.pack
>   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.parents
>   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.timestamps
>   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.trees
>
> Each file describes the objects in the matching pack. If a new pack is
> generated, you'd throw away the old cache files along with the old pack,
> and generate new ones. Or not. These are totally optional, and an older
> version of git will just ignore them. A newer version will use them if
> they're available, and otherwise fall back to the existing code (i.e.,
> reading the whole object from the pack). So you can generate them at

You have probably thought about this (and I don't have the source to
check first), but we may need to version these extra files so we can
change the format later if needed. Git versions that do not recognize
new versions simply ignore the cache.

> repack time, later on, or not at all. For now I have a separate command
> that generates them based on the pack index; if this turns out to be a
> good idea, it would probably get called as part of "repack".

I'd like to make it part of index-pack, where we have nearly
everything in memory. But let's leave it as a separate command first.

> Each file is a set of fixed-length records. The "commits" file contains
> the sha1 of every commit in the pack (sorted). A binary search of the
> mmap'd file gives the position of a particular commit within the list,

I think we could avoid storing sha-1 in the cache with Shawn's idea
[1]. But now that I read it again I fail to see it :(

[1] http://article.gmane.org/gmane.comp.version-control.git/206485

> Of course, it does very little for the full --objects listing, where we
> spend most of our time inflating trees. We could couple this with
> uncompressed trees (which are not all that much bigger, since the sha1s
> do not compress anyway). Or we could have an external tree cache, but
> I'm not sure exactly what it would look like (this is basically
> reinventing bits of packv4, but doing so in a way that is redundant with
> the existing packfile, rather than replacing it).

Depending on the use case, we could just generate a packv4-like cache
for recently-used trees only. I'm not sure how a tree cache impacts a
merge operation on a very large worktree (iow, a lot of trees
referenced from HEAD to be inflated). This is something a cache can
do, but a new pack version cannot.

> Or since the point of
> --objects is usually reachability, it may make more sense to pursue the
> bitmap, which should be even faster still.

Yes. And if narrow clone ever comes, which needs --objects limited by
pathspec, we could just produce extra bitmaps for frequently-used
pathspecs and only allow narrow clone with those pathspecs.
-- 
Duy


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-30 12:53             ` Nguyen Thai Ngoc Duy
@ 2012-12-30 21:31               ` Jeff King
  2012-12-31 18:06                 ` Shawn Pearce
  0 siblings, 1 reply; 19+ messages in thread
From: Jeff King @ 2012-12-30 21:31 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: David Michael Barr, Git Mailing List

On Sun, Dec 30, 2012 at 07:53:48PM +0700, Nguyen Thai Ngoc Duy wrote:

> >   $ cd objects/pack && ls
> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.commits
> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.idx
> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.pack
> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.parents
> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.timestamps
> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.trees
> >
> > Each file describes the objects in the matching pack. If a new pack is
> > generated, you'd throw away the old cache files along with the old pack,
> > and generate new ones. Or not. These are totally optional, and an older
> > version of git will just ignore them. A newer version will use them if
> > they're available, and otherwise fall back to the existing code (i.e.,
> > reading the whole object from the pack). So you can generate them at
> 
> You have probably thought about this (and I don't have the source to
> check first), but we may need to version these extra files so we can
> change the format later if needed. Git versions that do not recognize
> new versions simply ignore the cache.

Agreed. The current code has a 4-byte magic, followed by a 4-byte
version number, followed by a 4-byte record size[1]. Then the data,
followed by the pack sha1, followed by a sha1 of all of the preceding
data.  So you can verify the validity of any cache file (both its
checksum, and that it matches the right packfile), just as you can with
a ".idx" file.

[1] Probably the magic and version should be per-file-type, and the
    record size should be implicit from that; right now I make
    assumptions about what is in the files based on their names, but
    that is not part of the checksum.
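
In struct form, that layout would be something like this (a sketch of
the description above, not the posted code; field names are made up):

	struct cache_file_header {
		uint32_t magic;       /* per-file-type magic */
		uint32_t version;     /* bump to change the format */
		uint32_t record_size; /* bytes per fixed-length record */
	};
	/*
	 * ...followed by the fixed-length records themselves, then a
	 * 20-byte sha1 naming the pack this cache describes, then a
	 * 20-byte sha1 over everything preceding it, idx-style.
	 */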

> > repack time, later on, or not at all. For now I have a separate command
> > that generates them based on the pack index; if this turns out to be a
> > good idea, it would probably get called as part of "repack".
> 
> I'd like to make it part of index-pack, where we have nearly
> everything in memory. But let's leave it as a separate command first.

Yeah, in the long run that may work. The steps I figured were:

  1. Optional, external command. Let people experiment.

  2. Once it has proven itself, run the command from index-pack by
     default (or with a config option).

  3. If it turns out too slow, move the generation directly into the
     index-pack process.

The current iteration does not seem all that slow, but that is because I
am mostly picking static data out of the commits. So I have to load the
commits, and that's it. But something like reachability might be more
expensive (OTOH, it will always be more expensive, whether we have the
objects in memory or not).

> > Each file is a set of fixed-length records. The "commits" file contains
> > the sha1 of every commit in the pack (sorted). A binary search of the
> > mmap'd file gives the position of a particular commit within the list,
> 
> I think we could avoid storing sha-1 in the cache with Shawn's idea
> [1]. But now that I read it again I fail to see it :(
> 
> [1] http://article.gmane.org/gmane.comp.version-control.git/206485

Right. My implementation is very similar to what Shawn said there. I.e.,
the timestamps file is literally 4 bytes times the number of commits.
The parents file is 40 bytes per commit (2 parents, with a marker to
indicate "more or less than 2"), though a lot of it is zero bytes.
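
In other words, something like this per commit (a sketch; the exact
marker convention is an assumption):

	struct parents_rec {
		/*
		 * Raw parent sha1s; the null sha1 pads a root or
		 * single-parent commit, and a reserved marker value
		 * means "not 1 or 2 parents, parse the commit instead".
		 */
		unsigned char parent[2][20];
	}; /* 40 bytes, indexed by position in the "commits" file */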

Some alternatives I'm thinking about are:

  1. Using non-fixed-size records, which would allow trivial compression
     of entries like null sha1s. This would mean adding a separate
     lookup table, though, mapping sha1s to offsets. Still, even a
     32-bit offset is only 4 bytes per commit. If it meant dropping 40
     bytes of zeroes from the 2nd parent field out of half of all
     commits, that would be a win space-wise. It would be a
     double-indirect lookup, but it's constant effort, and only two page
     hits (which would be warm after the first lookup anyway).

  2. Storing offsets to objects in the packfile rather than their sha1s.
     This would save a lot of space, but would mean we couldn't refer to
     parents outside of the pack, but that may be OK. This is an
     optimization, and the case we want to target is a fully (or mostly)
     packed repo. It's OK to have the lookup fail and fallback to
     accessing the object.

  3. Dropping the "commits" file and just using the pack-*.idx as the
     index. The problem is that it is sparse in the commit space. So
     just naively storing 40 bytes per entry is going to waste a lot of
     space. If we had a separate index as in (1) above, that could be
     dropped to (say) 4 bytes of offset per object. But still, right now
     the commits file for linux-2.6 is about 7.2M (20 bytes times ~376K
     commits). There are almost 3 million total objects, so even storing
     4 bytes per object is going to be worse.

  4. Making a new index version that stores the sha1s separated by type.
     This means we can piggy-back on the regular index to get a packed
     list of just commits. But it also means that regular sha1 lookups
     of the objects have to look in several places (unless the caller
     annotates the call to read_sha1_object with "I am expecting this
     sha1 to be a commit"). And of course it means bumping the index
     version, which is a pain. The external index means it can be
     completely optional on top of the current index/pack.

> Depending on the use case, we could just generate packv4-like cache
> for recently-used trees only. I'm not sure how tree cache impact a
> merge operation on a very large worktree (iow, a lot of trees
> referenced from HEAD to be inflated). This is something a cache can
> do, but a new pack version cannot.

I do not care too much about the cost of running merge on a large
working tree. Of course it's better to make our optimizations as
generally applicable as possible, but there is a lot of other work going
on in a merge. The really painful, noticeable, repetitive bits right now
are:

  1. Running git-prune.

  2. Creating a pack from git-upload-pack.

Which are both just reachability problems. Something like "git log --
<pathspec>" would also be helped by packv4-ish tree access patterns,
though, but not by reachability bitmaps. And that may be something
worth caring about.

> Yes. And if narrow clone ever comes, which needs --objects limited by
> pathspec, we could just produce extra bitmaps for frequently-used
> pathspecs and only allow narrow clone with those pathspecs.

I hadn't thought about that. But yeah, because of the optional, external
nature, there's no reason you couldn't have extra bitmap sets for
specialized situations.

-Peff


* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-30 21:31               ` Jeff King
@ 2012-12-31 18:06                 ` Shawn Pearce
  2013-01-01  4:15                   ` Duy Nguyen
  0 siblings, 1 reply; 19+ messages in thread
From: Shawn Pearce @ 2012-12-31 18:06 UTC (permalink / raw)
  To: Jeff King; +Cc: Nguyen Thai Ngoc Duy, David Michael Barr, Git Mailing List

This thread is pretty interesting. Unfortunately the holidays have
kept me busy. But I am excited by the work David and Peff are doing.
:-)

On Sun, Dec 30, 2012 at 1:31 PM, Jeff King <peff@peff.net> wrote:
> On Sun, Dec 30, 2012 at 07:53:48PM +0700, Nguyen Thai Ngoc Duy wrote:
>
>> >   $ cd objects/pack && ls
>> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.commits
>> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.idx
>> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.pack
>> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.parents
>> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.timestamps
>> >   pack-a3e262f40d95fc0cc97d92797ff9988551367b75.trees
>> >
>> > Each file describes the objects in the matching pack. If a new pack is
>> > generated, you'd throw away the old cache files along with the old pack,
>> > and generate new ones. Or not. These are totally optional, and an older
>> > version of git will just ignore them. A newer version will use them if
>> > they're available, and otherwise fall back to the existing code (i.e.,
>> > reading the whole object from the pack). So you can generate them at
>>
>> You have probably thought about this (and I don't have the source to
>> check first), but we may need to version these extra files so we can
>> change the format later if needed. Git versions that do not recognize
>> new versions simply ignore the cache.
>
> Agreed. The current code has a 4-byte magic, followed by a 4-byte
> version number, followed by a 4-byte record size[1]. Then the data,
> followed by the pack sha1, followed by a sha1 of all of the preceding
> data.  So you can verify the validity of any cache file (both its
> checksum, and that it matches the right packfile), just as you can with
> a ".idx" file.

Put the pack sha1 into the header, rather than the trailer. It's really
annoying that you read the header, determine you probably understand
this file, and then have to seek to END-40 to read the pack sha1 and
verify it matches the pack. In an ideal world the pack sha1 would have
been the file name, making this less of an issue, but someone didn't
anticipate repacking the same object set with possibly different
results. :-(

The idx format is kind of wrong here; I wish we had put the pack sha1
into the header. Given that we mmap the files the 20 bytes in front
vs. 20 bytes in the trailer wouldn't have made any difference on
access cost.

>> > Each file is a set of fixed-length records. The "commits" file contains
>> > the sha1 of every commit in the pack (sorted). A binary search of the
>> > mmap'd file gives the position of a particular commit within the list,
>>
>> I think we could avoid storing sha-1 in the cache with Shawn's idea
>> [1]. But now that I read it again I fail to see it :(
>>
>> [1] http://article.gmane.org/gmane.comp.version-control.git/206485
>
> Right. My implementation is very similar to what Shawn said there. I.e.,
> the timestamps file is literally 4 bytes times the number of commits.
> The parents file is 40 bytes per commit (2 parents, with a marker to
> indicate "more or less than 2"), though a lot of it is zero bytes.

Hmm, after re-reading [1] I still like my idea better. But I won't
find the time to code it myself, so I'll have to go with whatever
someone else writes. :-)

Since tree pointers are also required when parsing a commit (even if
they might not get used, e.g. by `git log master`), maybe this should
be 16 bytes per commit, storing the commit time, tree pointer, and 2
parents, with the last 3 fields encoded as the N-th object in the
sorted sha1 list in the idx. Sorting the file by pack stream ordering
gives you good locality during rev-list operations and makes it
compact if pack-objects adheres to writing commits before other
objects.
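
In struct form, that proposal would be roughly (names invented):

	/*
	 * Hypothetical 16-byte record; the last three fields are
	 * N-th-object positions in the idx's sorted sha1 table,
	 * not sha1s or byte offsets.
	 */
	struct commit_cache_rec {
		uint32_t commit_time; /* committer timestamp */
		uint32_t tree_idx;    /* position of the tree in the idx */
		uint32_t parent1_idx; /* first parent, or a reserved marker */
		uint32_t parent2_idx; /* second parent, or a reserved marker */
	};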

Unfortunately this ordering requires the pack reverse index in memory
to translate from sha1 to position in the cache file. Making the
reverse index is a non-trivial cost that may dominate the running time
for smaller traversals, or the startup time for `git log` outputting
to the pager.

> Some alternatives I'm thinking about are:
>
>   1. Using non-fixed-size records, which would allow trivial compression
>      of entries like null sha1s. This would mean adding a separate
>      lookup table, though, mapping sha1s to offsets. Still, even a
>      32-bit offset is only 4 bytes per commit. If it meant dropping 40
>      bytes of zeroes from the 2nd parent field out of half of all
>      commits, that would be a win space-wise. It would be a
>      double-indirect lookup, but it's constant effort, and only two page
>      hits (which would be warm after the first lookup anyway).

Or use a 16-byte fixed-width record (see above).
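
For comparison, the double-indirect lookup in (1) would read roughly
like this, reusing the binary search helper sketched earlier
(hypothetical; ntohl because the offset table would live on disk):

	#include <arpa/inet.h>
	#include <stdint.h>

	/*
	 * Variable-length records: binary-search the sha1 table for
	 * a slot, then follow a 32-bit offset into the data section.
	 * Two page hits, constant effort, as noted above.
	 */
	static const unsigned char *cache_record(const unsigned char *sha1_table,
						 const uint32_t *offsets,
						 const unsigned char *data,
						 long nr, const unsigned char *sha1)
	{
		long pos = commit_cache_pos(sha1_table, nr, sha1);
		if (pos < 0)
			return NULL;
		return data + ntohl(offsets[pos]);
	}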

>   2. Storing offsets to objects in the packfile rather than their sha1s.
>      This would save a lot of space, but would mean we couldn't refer to
>      parents outside of the pack, but that may be OK. This is an
>      optimization, and the case we want to target is a fully (or mostly)
>      packed repo. It's OK to have the lookup fail and fallback to
>      accessing the object.

I glossed over this in both [1] and this message. I think it's
perfectly reasonable to require parsing the commit when the commit's
parents are outside of the pack. These edge commits are infrequent
compared to the number of commits within the pack. Just mark them the
same way you mark an octopus merge, so the reader knows the parent
data is not available in the cache. For most repositories the bulk of
the commits will be in a single giant pack that contains history to
the root.

I wouldn't store the byte offsets here; those are possibly 8 bytes
wide on bigger packs. Instead store the Nth position in the pack
stream. Even if you store byte offsets you need to use the pack
reverse index to recover the SHA-1 in log N time. If you store the
Nth position you also use the pack reverse index, but you can refer
to every object in that pack with 4 bytes rather than 8 per
reference.
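
Concretely, the "mark them" part could be a few reserved values at
the top of the 4-byte range, along these lines (values invented; real
packs hold far fewer objects than this):

	#define CACHE_PARENT_NONE     0xffffffffu /* fewer parents than slots */
	#define CACHE_PARENT_OUTSIDE  0xfffffffeu /* parent not in this pack */
	#define CACHE_PARENT_OCTOPUS  0xfffffffdu /* >2 parents: parse the commit */

A reader that hits any of these falls back to inflating and parsing
the commit object itself.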

>   3. Dropping the "commits" file and just using the pack-*.idx as the
>      index. The problem is that it is sparse in the commit space. So
>      just naively storing 40 bytes per entry is going to waste a lot of
>      space. If we had a separate index as in (1) above, that could be
>      dropped to (say) 4 bytes of offset per object. But still, right now
>      the commits file for linux-2.6 is about 7.2M (20 bytes times ~376K
>      commits). There are almost 3 million total objects, so even storing
>      4 bytes per object is going to be worse.

Fix pack-objects to behave the way JGit does: cluster commits first in
the pack stream. Now you have a dense space of commits. If I remember
right, this has a tiny positive improvement for most rev-list
operations with very little downside.

>   4. Making a new index version that stores the sha1s separated by type.
>      This means we can piggy-back on the regular index to get a packed
>      list of just commits. But it also means that regular sha1 lookups
>      of the objects have to look in several places (unless the caller
>      annotates the call to read_sha1_object with "I am expecting this
>      sha1 to be a commit"). And of course it means bumping the index
>      version, which is a pain. The external index means it can be
>      completely optional on top of the current index/pack.

I don't think this is worthwhile.

>> Depending on the use case, we could just generate a packv4-like cache
>> for recently-used trees only. I'm not sure how a tree cache impacts a
>> merge operation on a very large worktree (iow, a lot of trees
>> referenced from HEAD to be inflated). This is something a cache can
>> do, but a new pack version cannot.
>
> I do not care too much about the cost of running merge on a large
> working tree. Of course it's better to make our optimizations as
> generally applicable as possible, but there is a lot of other work going
> on in a merge. The really painful, noticeable, repetitive bits right now
> are:
>
>   1. Running git-prune.
>
>   2. Creating a pack from git-upload-pack.
>
> Which are both just reachability problems. Something like "git log --
> <pathspec>" would also be helped by packv4-ish tree access patterns,
> though not by reachability bitmaps. And that may be something
> worth caring about.

blame would also benefit from a packv4-ish tree.

But upload-pack and prune can make massive improvements through
bitmaps, while a packv4-ish tree would be only a marginal incremental
improvement. In the case of upload-pack, having a bitmap gives you
much more knowledge of the remote's have set, and allows making a
smaller pack, in a lot less time, with a smaller server memory
footprint. Now that we have implemented bitmaps in our servers, I can
say you really don't want to ignore the gains we can get from them. A
packv4-ish tree might help some other workloads, but bitmaps provide
a better solution to these reachability problems than anything else
we know of.

>> Yes. And if narrow clone ever comes, which needs --objects limited by
>> pathspec, we could just produce extra bitmaps for frequently-used
>> pathspecs and only allow narrow clone with those pathspecs.
>
> I hadn't thought about that. But yeah, because of the optional, external
> nature, there's no reason you couldn't have extra bitmap sets for
> specialized situations.

Right. We still need to redo the JGit patch series to eject the
bitmaps into an extension file. :-(

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] pack-objects: compression level for non-blobs
  2012-12-31 18:06                 ` Shawn Pearce
@ 2013-01-01  4:15                   ` Duy Nguyen
  2013-01-01 12:10                     ` Duy Nguyen
  0 siblings, 1 reply; 19+ messages in thread
From: Duy Nguyen @ 2013-01-01  4:15 UTC (permalink / raw)
  To: Jeff King; +Cc: Shawn Pearce, David Michael Barr, Git Mailing List

On Tue, Jan 1, 2013 at 1:06 AM, Shawn Pearce <spearce@spearce.org> wrote:
>>   3. Dropping the "commits" file and just using the pack-*.idx as the
>>      index. The problem is that it is sparse in the commit space. So
>>      just naively storing 40 bytes per entry is going to waste a lot of
>>      space. If we had a separate index as in (1) above, that could be
>>      dropped to (say) 4 bytes of offset per object. But still, right now
>>      the commits file for linux-2.6 is about 7.2M (20 bytes times ~376K
>>      commits). There are almost 3 million total objects, so even storing
>>      4 bytes per object is going to be worse.
>
> Fix pack-objects to behave the way JGit does: cluster commits first in
> the pack stream. Now you have a dense space of commits. If I remember
> right, this has a tiny positive improvement for most rev-list
> operations with very little downside.

I was going to suggest a similar thing. The current state of C Git's
pack writing is not bad. We mix commits and tags together, but tags
are usually few. Once we know the upper and lower bounds, in terms of
object position in the pack, of the commit+tag region, we could
reduce the waste significantly; that is, if you sort the cache by the
object order in the pack.
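
A sketch of that bound computation, assuming a per-position object
type table is available (names invented; OBJ_COMMIT/OBJ_TAG as in
git's cache.h):

	/*
	 * Find the half-open [lo, hi) span of commits and tags in
	 * pack order; the cache then needs only hi - lo records
	 * instead of one per object.
	 */
	static void commit_tag_bounds(const enum object_type *type_by_pos,
				      long nr_objects, long *lo, long *hi)
	{
		long i;
		*lo = nr_objects;
		*hi = 0;
		for (i = 0; i < nr_objects; i++) {
			if (type_by_pos[i] == OBJ_COMMIT ||
			    type_by_pos[i] == OBJ_TAG) {
				if (i < *lo)
					*lo = i;
				*hi = i + 1;
			}
		}
	}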
-- 
Duy

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] pack-objects: compression level for non-blobs
  2013-01-01  4:15                   ` Duy Nguyen
@ 2013-01-01 12:10                     ` Duy Nguyen
  2013-01-01 17:17                       ` Shawn Pearce
  2013-01-01 20:02                       ` Junio C Hamano
  0 siblings, 2 replies; 19+ messages in thread
From: Duy Nguyen @ 2013-01-01 12:10 UTC (permalink / raw)
  To: Jeff King
  Cc: Shawn Pearce, David Michael Barr, Git Mailing List,
	Junio C Hamano

On Tue, Jan 1, 2013 at 11:15 AM, Duy Nguyen <pclouds@gmail.com> wrote:
>> Fix pack-objects to behave the way JGit does: cluster commits first in
>> the pack stream. Now you have a dense space of commits. If I remember
>> right, this has a tiny positive improvement for most rev-list
>> operations with very little downside.
>
> I was going to suggest a similar thing. The current state of C Git's
> pack writing is not bad. We mix commits and tags together, but tags

And I was wrong. At least since 1b4bb16 (pack-objects: optimize
"recency order" - 2011-06-30) commits are spread out and can be mixed
with trees too. Grouping them back defeats what Junio did in that
commit, I think.
-- 
Duy

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] pack-objects: compression level for non-blobs
  2013-01-01 12:10                     ` Duy Nguyen
@ 2013-01-01 17:17                       ` Shawn Pearce
  2013-01-01 23:47                         ` Junio C Hamano
  2013-01-02  2:23                         ` Duy Nguyen
  2013-01-01 20:02                       ` Junio C Hamano
  1 sibling, 2 replies; 19+ messages in thread
From: Shawn Pearce @ 2013-01-01 17:17 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Jeff King, David Michael Barr, Git Mailing List, Junio C Hamano

On Tue, Jan 1, 2013 at 4:10 AM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Tue, Jan 1, 2013 at 11:15 AM, Duy Nguyen <pclouds@gmail.com> wrote:
>>> Fix pack-objects to behave the way JGit does: cluster commits first in
>>> the pack stream. Now you have a dense space of commits. If I remember
>>> right, this has a tiny positive improvement for most rev-list
>>> operations with very little downside.
>>
>> I was going to suggest a similar thing. The current state of C Git's
>> pack writing is not bad. We mix commits and tags together, but tags
>
> And I was wrong. At least since 1b4bb16 (pack-objects: optimize
> "recency order" - 2011-06-30) commits are spread out and can be mixed
> with trees too. Grouping them back defeats what Junio did in that
> commit, I think.

I think you misunderstand what 1b4bb16 does. Junio uses a layout
similar to what JGit has done for years. Commits are packed, then
trees, then blobs. Only annotated tags are interspersed with commits.
The decision on where to place tags is different, but has a similar
purpose. How blobs are written is very different; Junio's
implementation is strictly better than JGit's[1].

So we can use pack ordering. There will be a gap because of tags, but
if we assume there are fewer tags than commits, it will still be a
reasonable cache file size.

[1] I have known this since he was developing this commit. We talked
about clustering by delta chain and the improvements it showed in
CGit. I tried to implement a similar delta chain clustering in JGit
but broke something in the packer and caused data corruption, so it's
stalled.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] pack-objects: compression level for non-blobs
  2013-01-01 12:10                     ` Duy Nguyen
  2013-01-01 17:17                       ` Shawn Pearce
@ 2013-01-01 20:02                       ` Junio C Hamano
  1 sibling, 0 replies; 19+ messages in thread
From: Junio C Hamano @ 2013-01-01 20:02 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Jeff King, Shawn Pearce, David Michael Barr, Git Mailing List

Duy Nguyen <pclouds@gmail.com> writes:

> On Tue, Jan 1, 2013 at 11:15 AM, Duy Nguyen <pclouds@gmail.com> wrote:
>>> Fix pack-objects to behave the way JGit does: cluster commits first in
>>> the pack stream. Now you have a dense space of commits. If I remember
>>> right, this has a tiny positive improvement for most rev-list
>>> operations with very little downside.
>>
>> I was going to suggest a similar thing. The current state of C Git's
>> pack writing is not bad. We mix commits and tags together, but tags
>
> And I was wrong. At least since 1b4bb16 (pack-objects: optimize
> "recency order" - 2011-06-30) commits are spread out and can be mixed
> with trees too.

Really?  That certainly wasn't the intention of that change.

The compute_write_order() function first fills the commits in the
original recency order (the order in which rev-list discovered them
by traversing the history from the tips) until we find a commit that
is tagged by a ref in the refs/tags/ hierarchy.  When we reach that
point, we stop showing the commits and show all the tags in the
refs/tags/ hierarchy and commits that are tagged by them, breaking
the original ordering of commits so that ancient but tagged commits
clump at this point.  After that, we resume showing the rest of the
commits and tags in the original order they came to us.  Trees are
done next, and then the remainder.

So I am not sure how trees can appear between commits.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] pack-objects: compression level for non-blobs
  2013-01-01 17:17                       ` Shawn Pearce
@ 2013-01-01 23:47                         ` Junio C Hamano
  2013-01-02  2:23                         ` Duy Nguyen
  1 sibling, 0 replies; 19+ messages in thread
From: Junio C Hamano @ 2013-01-01 23:47 UTC (permalink / raw)
  To: Shawn Pearce; +Cc: Duy Nguyen, Jeff King, David Michael Barr, Git Mailing List

Shawn Pearce <spearce@spearce.org> writes:

> How blobs are written is very different; Junio's
> implementation is strictly better than JGit's[1]

I do not think there can be a single ordering that is strictly
better than any other one.  The "clump all objects in a delta family
and write them width-first, starting from the base object" approach
may give you a reasonable trade-off for the result of a normal
repack, but if you repack (like I do) with a very shallow --depth and
a wide --window to really get a tight pack, a delta may end up having
too many uncles between it and its father, requiring a large seek to
skip over all the uncles in order to grab the delta data after you
reconstitute the delta base object.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] pack-objects: compression level for non-blobs
  2013-01-01 17:17                       ` Shawn Pearce
  2013-01-01 23:47                         ` Junio C Hamano
@ 2013-01-02  2:23                         ` Duy Nguyen
  1 sibling, 0 replies; 19+ messages in thread
From: Duy Nguyen @ 2013-01-02  2:23 UTC (permalink / raw)
  To: Shawn Pearce
  Cc: Jeff King, David Michael Barr, Git Mailing List, Junio C Hamano

On Wed, Jan 2, 2013 at 12:17 AM, Shawn Pearce <spearce@spearce.org> wrote:
>> And I was wrong. At least since 1b4bb16 (pack-objects: optimize
>> "recency order" - 2011-06-30) commits are spread out and can be mixed
>> with trees too. Grouping them back defeats what Junio did in that
>> commit, I think.
>
> I think you misunderstand what 1b4bb16 does. Junio uses a layout
> similar to what JGit has done for years. Commits are packed, then
> trees, then blobs. Only annotated tags are interspersed with commits.
> The decision on where to place tags is different, but has a similar
> purpose.

This is embarrassing. I looked at verify-pack output and somehow saw
trees mixed with commits. I must have read it wrong. Running
"git verify-pack -v <pack> | awk '{print $2;}' | uniq" on a recently
created pack shows that only tags and commits are mixed. Sorry for
the noise.
-- 
Duy

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2013-01-02  2:24 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-11-26  6:25 [RFC] pack-objects: compression level for non-blobs David Michael Barr
2012-11-26 12:35 ` David Michael Barr
2012-12-29  0:41 ` Jeff King
2012-12-29  4:34   ` Nguyen Thai Ngoc Duy
2012-12-29  5:07     ` Jeff King
2012-12-29  5:25       ` Nguyen Thai Ngoc Duy
2012-12-29  5:27         ` Jeff King
2012-12-29  9:05           ` Jeff King
2012-12-29  9:48             ` Jeff King
2012-12-30 12:05           ` Jeff King
2012-12-30 12:53             ` Nguyen Thai Ngoc Duy
2012-12-30 21:31               ` Jeff King
2012-12-31 18:06                 ` Shawn Pearce
2013-01-01  4:15                   ` Duy Nguyen
2013-01-01 12:10                     ` Duy Nguyen
2013-01-01 17:17                       ` Shawn Pearce
2013-01-01 23:47                         ` Junio C Hamano
2013-01-02  2:23                         ` Duy Nguyen
2013-01-01 20:02                       ` Junio C Hamano
