From: David Michael Barr <b@rr-dav.id.au>
To: Git Mailing List <git@vger.kernel.org>
Cc: David Michael Barr <b@rr-dav.id.au>
Subject: [RFC] pack-objects: compression level for non-blobs
Date: Mon, 26 Nov 2012 17:25:54 +1100 [thread overview]
Message-ID: <1353911154-23495-1-git-send-email-b@rr-dav.id.au> (raw)
Add config pack.graphcompression similar to pack.compression.
It applies to non-blob objects and, if unspecified, falls back to pack.compression.
We may identify objects compressed with level 0 by their leading bytes.
Use this to force recompression when the source and target levels mismatch.
Limit its application to when the config pack.graphcompression is set.
Signed-off-by: David Michael Barr <b@rr-dav.id.au>
---
builtin/pack-objects.c | 49 +++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 45 insertions(+), 4 deletions(-)
I started working on this just before taking a vacation,
so it's been a little while coming.
The intent is to allow selective recompression of pack data.
For small objects/deltas the overhead of deflate is significant.
Storing such objects at a lower compression level (or uncompressed) may improve read performance for the object graph.
I ran some unscientific experiments with the chromium repository.
With pack.graphcompression = 0, there was a 2.7% increase in pack size.
On git log --raw, I saw a 35% improvement with cold caches and 43% with warm caches.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f069462..9518daf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -40,6 +40,7 @@ struct object_entry {
unsigned long z_delta_size; /* delta data size (compressed) */
unsigned int hash; /* name hint hash */
enum object_type type;
+ enum object_type actual_type;
enum object_type in_pack_type; /* could be delta */
unsigned char in_pack_header_size;
unsigned char preferred_base; /* we do not pack this, but is available
@@ -81,6 +82,8 @@ static int num_preferred_base;
static struct progress *progress_state;
static int pack_compression_level = Z_DEFAULT_COMPRESSION;
static int pack_compression_seen;
+static int pack_graph_compression_level = Z_DEFAULT_COMPRESSION;
+static int pack_graph_compression_seen;
static unsigned long delta_cache_size = 0;
static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
@@ -125,14 +128,14 @@ static void *get_delta(struct object_entry *entry)
return delta_buf;
}
-static unsigned long do_compress(void **pptr, unsigned long size)
+static unsigned long do_compress(void **pptr, unsigned long size, int level)
{
git_zstream stream;
void *in, *out;
unsigned long maxsize;
memset(&stream, 0, sizeof(stream));
- git_deflate_init(&stream, pack_compression_level);
+ git_deflate_init(&stream, level);
maxsize = git_deflate_bound(&stream, size);
in = *pptr;
@@ -191,6 +194,18 @@ static unsigned long write_large_blob_data(struct git_istream *st, struct sha1fi
return olen;
}
+static int check_pack_compressed(struct packed_git *p, /* returns 1 if bits 1-2 of the third byte at offset are set, else 0 */
+ struct pack_window **w_curs,
+ off_t offset)
+{
+ unsigned long avail;
+ int compressed = 0; /* stays 0 ("not compressed") when fewer than 3 bytes are available */
+ unsigned char *in = use_pack(p, w_curs, offset, &avail); /* presumably avail = bytes readable at in; confirm against use_pack */
+ if (avail >= 3)
+ compressed = !!(in[2] & 0x6); /* NOTE(review): presumably the block-type bits of the first deflate block after a 2-byte zlib header; 0 would mean a stored (level 0) block - confirm */
+ return compressed;
+}
+
/*
* we are going to reuse the existing object data as is. make
* sure it is not corrupt.
@@ -240,6 +255,8 @@ static void copy_pack_data(struct sha1file *f,
}
}
+#define compression_level(type) ((type) && (type) != OBJ_BLOB ? pack_graph_compression_level : pack_compression_level) /* graph level for non-zero, non-blob types; pack level otherwise; evaluates type twice */
+
/* Return 0 if we will bust the pack-size limit */
static unsigned long write_no_reuse_object(struct sha1file *f, struct object_entry *entry,
unsigned long limit, int usable_delta)
@@ -286,7 +303,7 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
else if (entry->z_delta_size)
datalen = entry->z_delta_size;
else
- datalen = do_compress(&buf, size);
+ datalen = do_compress(&buf, size, compression_level(entry->actual_type));
/*
* The object header is a byte of 'type' followed by zero or
@@ -379,6 +396,13 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
offset += entry->in_pack_header_size;
datalen -= entry->in_pack_header_size;
+ if (!pack_to_stdout &&
+ pack_graph_compression_seen &&
+ check_pack_compressed(p, &w_curs, offset) != !!compression_level(entry->actual_type)) {
+ unuse_pack(&w_curs);
+ return write_no_reuse_object(f, entry, limit, usable_delta);
+ }
+
if (!pack_to_stdout && p->index_version == 1 &&
check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
error("corrupt packed object for %s", sha1_to_hex(entry->idx.sha1));
@@ -955,6 +979,8 @@ static int add_object_entry(const unsigned char *sha1, enum object_type type,
memset(entry, 0, sizeof(*entry));
hashcpy(entry->idx.sha1, sha1);
entry->hash = hash;
+ if (pack_graph_compression_seen)
+ entry->actual_type = sha1_object_info(sha1, NULL);
if (type)
entry->type = type;
if (exclude)
@@ -1758,7 +1784,8 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
*/
if (entry->delta_data && !pack_to_stdout) {
entry->z_delta_size = do_compress(&entry->delta_data,
- entry->delta_size);
+ entry->delta_size,
+ compression_level(entry->actual_type));
cache_lock();
delta_cache_size -= entry->delta_size;
delta_cache_size += entry->z_delta_size;
@@ -2159,6 +2186,16 @@ static int git_pack_config(const char *k, const char *v, void *cb)
pack_idx_opts.version);
return 0;
}
+ if (!strcmp(k, "pack.graphcompression")) {
+ int level = git_config_int(k, v);
+ if (level == -1)
+ level = Z_DEFAULT_COMPRESSION;
+ else if (level < 0 || level > Z_BEST_COMPRESSION)
+ die("bad pack graph compression level %d", level);
+ pack_graph_compression_level = level;
+ pack_graph_compression_seen = 1;
+ return 0;
+ }
return git_default_config(k, v, cb);
}
@@ -2519,6 +2556,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
argc = parse_options(argc, argv, prefix, pack_objects_options,
pack_usage, 0);
+ /* Fall back after option parsing to catch --compression */
+ if (!pack_graph_compression_seen)
+ pack_graph_compression_level = pack_compression_level;
+
if (argc) {
base_name = argv[0];
argc--;
--
1.8.0
next reply other threads:[~2012-11-26 6:26 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-11-26 6:25 David Michael Barr [this message]
2012-11-26 12:35 ` [RFC] pack-objects: compression level for non-blobs David Michael Barr
2012-12-29 0:41 ` Jeff King
2012-12-29 4:34 ` Nguyen Thai Ngoc Duy
2012-12-29 5:07 ` Jeff King
2012-12-29 5:25 ` Nguyen Thai Ngoc Duy
2012-12-29 5:27 ` Jeff King
2012-12-29 9:05 ` Jeff King
2012-12-29 9:48 ` Jeff King
2012-12-30 12:05 ` Jeff King
2012-12-30 12:53 ` Nguyen Thai Ngoc Duy
2012-12-30 21:31 ` Jeff King
2012-12-31 18:06 ` Shawn Pearce
2013-01-01 4:15 ` Duy Nguyen
2013-01-01 12:10 ` Duy Nguyen
2013-01-01 17:17 ` Shawn Pearce
2013-01-01 23:47 ` Junio C Hamano
2013-01-02 2:23 ` Duy Nguyen
2013-01-01 20:02 ` Junio C Hamano
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1353911154-23495-1-git-send-email-b@rr-dav.id.au \
--to=b@rr-dav.id.au \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).