From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
"Nicolas Pitre" <nico@fluxnic.net>,
"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v2] pack-objects: use streaming interface for reading large loose blobs
Date: Sun, 13 May 2012 11:37:42 +0700 [thread overview]
Message-ID: <1336883862-9013-1-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <alpine.LFD.2.02.1205121220070.21030@xanadu.home>
git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects or hash-object
over pipe). Or they can come from other git implementations.
core.bigFileThreshold can also be lowered, introducing a new
wave of large loose blobs.
Use the streaming interface to read/compress/write these blobs in one go.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
On Sat, May 12, 2012 at 11:51 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> @@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
>> if (!to_reuse) {
>> no_reuse:
>> if (!usable_delta) {
>> - buf = read_sha1_file(entry->idx.sha1, &type, &size);
>> - if (!buf)
>> - die("unable to read %s", sha1_to_hex(entry->idx.sha1));
>> + type = sha1_object_info(entry->idx.sha1, &size);
>
> Please don't use sha1_object_info() lightly. This is a potentially
> expensive operation, and you really don't want to do it on each objects.
>
> And as a matter of fact, the information you are looking for has already
> been determined earlier. See the code in check_object() which tries
> hard to avoid sha1_object_info() as much as possible.
>
> Therefore you should have entry->type and entry->size already set for
> you to use.
Thanks. sha1_object_info() removed in favor of entry->type and entry->size.
builtin/pack-objects.c | 72 ++++++++++++++++++++++++++++++++++++++++++++----
t/t1050-large.sh | 16 ++++++++++
2 files changed, 82 insertions(+), 6 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..ab5438a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
#include "list-objects.h"
#include "progress.h"
#include "refs.h"
+#include "streaming.h"
#include "thread-utils.h"
static const char *pack_usage[] = {
@@ -150,6 +151,55 @@ static unsigned long do_compress(void **pptr, unsigned long size)
return stream.total_out;
}
+static void write_large_blob_data(struct sha1file *f, const unsigned char *sha1)
+{
+ git_zstream stream;
+ unsigned char ibuf[1024 * 16];
+ unsigned char obuf[1024 * 16];
+ int zret;
+
+ struct git_istream *st;
+ enum object_type type;
+ unsigned long sz;
+
+ st = open_istream(sha1, &type, &sz, NULL);
+ if (!st)
+ die(_("unable to read %s"), sha1_to_hex(sha1));
+
+ memset(&stream, 0, sizeof(stream));
+ git_deflate_init(&stream, pack_compression_level);
+
+ if (type != OBJ_BLOB)
+ die("BUG: %s is not a blob", sha1_to_hex(sha1));
+
+ for (;;) {
+ ssize_t readlen;
+ readlen = read_istream(st, ibuf, sizeof(ibuf));
+ if (readlen == -1)
+ die(_("unable to read %s"), sha1_to_hex(sha1));
+
+ stream.next_in = ibuf;
+ stream.avail_in = readlen;
+ zret = Z_OK;
+ while ((stream.avail_in || readlen == 0) &&
+ (zret == Z_OK || zret == Z_BUF_ERROR)) {
+ stream.next_out = obuf;
+ stream.avail_out = sizeof(obuf);
+ zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+ sha1write(f, obuf, stream.next_out - obuf);
+ }
+ if (stream.avail_in)
+ die(_("deflate error (%d)"), zret);
+ if (readlen == 0) {
+ if (zret != Z_STREAM_END)
+ die(_("deflate error (%d)"), zret);
+ break;
+ }
+ }
+ close_istream(st);
+ git_deflate_end(&stream);
+}
+
/*
* we are going to reuse the existing object data as is. make
* sure it is not corrupt.
@@ -259,9 +309,13 @@ static unsigned long write_object(struct sha1file *f,
if (!to_reuse) {
no_reuse:
if (!usable_delta) {
- buf = read_sha1_file(entry->idx.sha1, &type, &size);
- if (!buf)
- die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+ if (entry->type == OBJ_BLOB && entry->size > big_file_threshold)
+ buf = NULL;
+ else {
+ buf = read_sha1_file(entry->idx.sha1, &type, &size);
+ if (!buf)
+ die(_("unable to read %s"), sha1_to_hex(entry->idx.sha1));
+ }
/*
* make sure no cached delta data remains from a
* previous attempt before a pack split occurred.
@@ -284,8 +338,11 @@ static unsigned long write_object(struct sha1file *f,
if (entry->z_delta_size)
datalen = entry->z_delta_size;
- else
+ else if (buf)
datalen = do_compress(&buf, size);
+ else
+ /* large blob case, just assume we don't compress well */
+ datalen = size;
/*
* The object header is a byte of 'type' followed by zero or
@@ -330,8 +387,11 @@ static unsigned long write_object(struct sha1file *f,
}
sha1write(f, header, hdrlen);
}
- sha1write(f, buf, datalen);
- free(buf);
+ if (buf) {
+ sha1write(f, buf, datalen);
+ free(buf);
+ } else
+ write_large_blob_data(f, entry->idx.sha1);
}
else {
struct packed_git *p = entry->in_pack;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..7fbd2e1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,22 @@ test_expect_success 'repack' '
git repack -ad
'
+test_expect_success 'pack-objects with large loose object' '
+ echo Z | dd of=large4 bs=1k seek=2000 &&
+ OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
+ P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
+ (
+ unset GIT_ALLOC_LIMIT &&
+ cat large4 | git hash-object -w --stdin &&
+ git cat-file blob $OBJ >actual &&
+ cmp large4 actual
+ ) &&
+ echo $OBJ | git pack-objects .git/objects/pack/pack &&
+ rm $P &&
+ git cat-file blob $OBJ >actual &&
+ cmp large4 actual
+'
+
test_expect_success 'tar achiving' '
git archive --format=tar HEAD >/dev/null
'
--
1.7.8.36.g69ee2
next prev parent reply other threads:[~2012-05-13 4:41 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-05-12 10:26 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-12 16:51 ` Nicolas Pitre
2012-05-13 4:37 ` Nguyễn Thái Ngọc Duy [this message]
2012-05-14 15:56 ` [PATCH v2] " Junio C Hamano
2012-05-14 19:43 ` Junio C Hamano
2012-05-15 11:18 ` Nguyen Thai Ngoc Duy
2012-05-15 15:27 ` Junio C Hamano
2012-05-16 7:09 ` Nguyen Thai Ngoc Duy
2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
2012-05-16 12:02 ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
2012-05-18 21:05 ` Junio C Hamano
2012-05-16 12:02 ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
2012-05-18 21:16 ` Junio C Hamano
2012-05-19 2:43 ` Nicolas Pitre
2012-05-16 12:02 ` [PATCH v2 4/4] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-18 21:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Junio C Hamano
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1336883862-9013-1-git-send-email-pclouds@gmail.com \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=nico@fluxnic.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).