git@vger.kernel.org list mirror (unofficial, one of many)
* [PATCH] unpack-objects: unpack large object in stream
@ 2021-10-09  8:20 Han Xin
  2021-10-19  7:37 ` Han Xin
                   ` (14 more replies)
  0 siblings, 15 replies; 57+ messages in thread
From: Han Xin @ 2021-10-09  8:20 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When calling "unpack_non_delta_entry()", we allocate memory for the
whole size of the unpacked object and write the buffer to a loose file
on disk. This may lead to OOM for the git-unpack-objects process when
unpacking a very large object.

The function "unpack_delta_entry()" will also allocate memory to buffer
the whole delta, but since there will be no delta for an object larger
than "core.bigFileThreshold", that issue is moderate.

To resolve the OOM issue in "git-unpack-objects", we can unpack a large
object to a file in a stream, using the setting of
"core.bigFileThreshold" as the threshold for large objects.

Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c          |  41 +++++++-
 object-file.c                     | 149 +++++++++++++++++++++++++++---
 object-store.h                    |   9 ++
 t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
 4 files changed, 279 insertions(+), 12 deletions(-)
 create mode 100755 t/t5590-receive-unpack-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8ac77e60a8 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+static void fill_stream(struct git_zstream *stream)
+{
+	stream->next_in = fill(1);
+	stream->avail_in = len;
+}
+
+static void use_stream(struct git_zstream *stream)
+{
+	use(len - stream->avail_in);
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	struct git_zstream_reader reader;
+	struct object_id *oid = &obj_list[nr].oid;
+
+	reader.fill = &fill_stream;
+	reader.use = &use_stream;
+
+	if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
+				     oid, dry_run))
+		die("failed to write object in stream");
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index a8be899481..06c1693675 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
+			       int fd, unsigned char *compressed,
+			       int compressed_len, const void *buf,
+			       size_t len, int flush)
+{
+	int ret;
+
+	stream->next_in = (void *)buf;
+	stream->avail_in = len;
+	do {
+		unsigned char *in0 = stream->next_in;
+		ret = git_deflate(stream, flush);
+		the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+		if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream->next_out = compressed;
+		stream->avail_out = compressed_len;
+	} while (ret == Z_OK);
+
+	return ret;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime)
@@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
-	do {
-		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
-	} while (ret == Z_OK);
+	ret = write_object_buffer(&stream, &c, fd, compressed,
+				  sizeof(compressed), buf, len,
+				  Z_FINISH);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
@@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
 	return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
 }
 
+int write_stream_object_file(struct git_zstream_reader *reader,
+			     unsigned long len, const char *type,
+			     struct object_id *oid,
+			     int dry_run)
+{
+	git_zstream istream, ostream;
+	unsigned char buf[8192], compressed[4096];
+	char hdr[MAX_HEADER_LEN];
+	int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
+	int ret = 0;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+
+	/* Write tmpfile in objects dir, because oid is unknown */
+	if (!dry_run) {
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+		fd = create_tmpfile(&tmp_file, filename.buf);
+		if (fd < 0) {
+			if (errno == EACCES)
+				ret = error(_("insufficient permission for adding an object to repository database %s"),
+					get_object_directory());
+			else
+				ret = error_errno(_("unable to create temporary file"));
+			goto cleanup;
+		}
+	}
+
+	memset(&istream, 0, sizeof(istream));
+	istream.next_out = buf;
+	istream.avail_out = sizeof(buf);
+	git_inflate_init(&istream);
+
+	if (!dry_run) {
+		/* Set it up */
+		git_deflate_init(&ostream, zlib_compression_level);
+		ostream.next_out = compressed;
+		ostream.avail_out = sizeof(compressed);
+		the_hash_algo->init_fn(&c);
+
+		/* First header */
+		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
+				(uintmax_t)len) + 1;
+		ostream.next_in = (unsigned char *)hdr;
+		ostream.avail_in = hdrlen;
+		while (git_deflate(&ostream, 0) == Z_OK)
+			; /* nothing */
+		the_hash_algo->update_fn(&c, hdr, hdrlen);
+	}
+
+	/* Then the data itself */
+	do {
+		unsigned char *last_out = istream.next_out;
+		reader->fill(&istream);
+		istatus = git_inflate(&istream, 0);
+		if (istatus == Z_STREAM_END)
+			flush = Z_FINISH;
+		reader->use(&istream);
+		if (!dry_run)
+			ostatus = write_object_buffer(&ostream, &c, fd, compressed,
+						      sizeof(compressed), last_out,
+						      istream.next_out - last_out,
+						      flush);
+		istream.next_out = buf;
+		istream.avail_out = sizeof(buf);
+	} while (istatus == Z_OK);
+
+	if (istream.total_out != len || istatus != Z_STREAM_END)
+		die(_("inflate returned %d"), istatus);
+	git_inflate_end(&istream);
+
+	if (dry_run)
+		goto cleanup;
+
+	if (ostatus != Z_STREAM_END)
+		die(_("unable to deflate new object (%d)"), ostatus);
+	ostatus = git_deflate_end_gently(&ostream);
+	if (ostatus != Z_OK)
+		die(_("deflateEnd on object failed (%d)"), ostatus);
+	the_hash_algo->final_fn(oid->hash, &c);
+	close_loose_object(fd);
+
+	/* We get the oid now */
+	loose_object_path(the_repository, &filename, oid);
+
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		/*
+		 * Make sure the directory exists; note that the contents
+		 * of the buffer are undefined after mkstemp returns an
+		 * error, so we have to rewrite the whole buffer from
+		 * scratch.
+		 */
+		strbuf_add(&dir, filename.buf, dirlen - 1);
+		if (mkdir(dir.buf, 0777) && errno != EEXIST) {
+			unlink_or_warn(tmp_file.buf);
+			strbuf_release(&dir);
+			ret = -1;
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	ret = finalize_object_file(tmp_file.buf, filename.buf);
+
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return ret;
+}
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags)
diff --git a/object-store.h b/object-store.h
index d24915ced1..12b113ef93 100644
--- a/object-store.h
+++ b/object-store.h
@@ -33,6 +33,11 @@ struct object_directory {
 	char *path;
 };
 
+struct git_zstream_reader {
+	void (*fill)(struct git_zstream *);
+	void (*use)(struct git_zstream *);
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 int write_object_file(const void *buf, unsigned long len,
 		      const char *type, struct object_id *oid);
 
+int write_stream_object_file(struct git_zstream_reader *reader,
+			     unsigned long len, const char *type,
+			     struct object_id *oid, int dry_run);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
new file mode 100755
index 0000000000..7e63dfc0db
--- /dev/null
+++ b/t/t5590-receive-unpack-objects.sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receiving packs'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	git repack -ad
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to push: cannot allocate' '
+	test_must_fail git push dest.git HEAD 2>err &&
+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git push dest.git HEAD &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	PACK=$(echo main | git pack-objects --progress --revs test) &&
+	unset GIT_ALLOC_LIMIT &&
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run with large threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 2m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_expect_success 'unpack-objects dry-run with small threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 1m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.33.0.1.g09a6bb964f.dirty



* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
@ 2021-10-19  7:37 ` Han Xin
  2021-10-20 14:42 ` Philip Oakley
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-10-19  7:37 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

Any suggestions?

On Sat, Oct 9, 2021 at 4:21 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> [... patch quoted in full; snipped, see the original message above ...]


* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
  2021-10-19  7:37 ` Han Xin
@ 2021-10-20 14:42 ` Philip Oakley
  2021-10-21  3:42   ` Han Xin
  2021-11-03  1:48 ` Han Xin
                   ` (12 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Philip Oakley @ 2021-10-20 14:42 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

On 09/10/2021 09:20, Han Xin wrote:
> [... commit message and earlier hunks snipped ...]
> +static void write_stream_blob(unsigned nr, unsigned long size)

Can we use size_t for the `size`, and possibly `nr`, to improve
compatibility with Windows systems where unsigned long is only 32 bits?

There has been some work in the past on providing large file support on
Windows, which requires numerous long -> size_t changes.
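
For illustration, a minimal standalone sketch of the truncation involved
(not from the patch; uint32_t stands in for the 32-bit unsigned long of
an LLP64 Windows build, where the narrowing would happen implicitly):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* A 5 GB object size, as it could arrive in a pack header. */
		uint64_t real_size = 5ULL * 1024 * 1024 * 1024;
		/* Stand-in for a 32-bit unsigned long: silently truncates. */
		uint32_t as_ulong32 = (uint32_t)real_size;
		/* size_t keeps the full value on any 64-bit build. */
		size_t as_size_t = (size_t)real_size;

		printf("truncated: %u\n", (unsigned)as_ulong32);  /* 1073741824 */
		printf("preserved: %ju\n", (uintmax_t)as_size_t); /* 5368709120 */
		return 0;
	}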

Philip
> [... remainder of patch snipped ...]



* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-20 14:42 ` Philip Oakley
@ 2021-10-21  3:42   ` Han Xin
  2021-10-21 22:47     ` Philip Oakley
  0 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-10-21  3:42 UTC (permalink / raw)
  To: Philip Oakley; +Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Han Xin

On Wed, Oct 20, 2021 at 10:43 PM Philip Oakley <philipoakley@iee.email> wrote:
>
> > [... commit message and earlier hunks snipped ...]
> > +static void write_stream_blob(unsigned nr, unsigned long size)
>
> Can we use size_t for the `size`, and possibly `nr`, to improve
> compatibility with Windows systems where unsigned long is only 32 bits?
>
> There has been some work in the past on providing large file support on
> Windows, which requires numerous long -> size_t changes.
>
> Philip

Thanks for your review. I'm not sure whether I should make this change
in this patch; it would also change the types used in `unpack_one()`,
`unpack_non_delta_entry()`, `write_object()` and many others.
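
A rough, hand-written sketch of the ripple, for illustration only (the
unsigned long signatures match the patch and today's
builtin/unpack-objects.c; the size_t variants are just the suggestion):

	-static void write_object(unsigned nr, enum object_type type,
	-			 void *buf, unsigned long size)
	+static void write_object(unsigned nr, enum object_type type,
	+			 void *buf, size_t size)

	-static void unpack_non_delta_entry(enum object_type type, unsigned long size,
	+static void unpack_non_delta_entry(enum object_type type, size_t size,
	 				   unsigned nr)

	-static void write_stream_blob(unsigned nr, unsigned long size)
	+static void write_stream_blob(unsigned nr, size_t size)

get_data() and the new write_stream_object_file() would need the same
treatment, and so on down the chain.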

> > +{
> > +     struct git_zstream_reader reader;
> > +     struct object_id *oid = &obj_list[nr].oid;
> > +
> > +     reader.fill = &fill_stream;
> > +     reader.use = &use_stream;
> > +
> > +     if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
> > +                                  oid, dry_run))
> > +             die("failed to write object in stream");
> > +     if (strict && !dry_run) {
> > +             struct blob *blob = lookup_blob(the_repository, oid);
> > +             if (blob)
> > +                     blob->object.flags |= FLAG_WRITTEN;
> > +             else
> > +                     die("invalid blob object from stream");
> > +     }
> > +     obj_list[nr].obj = NULL;
> > +}
> > +
> >  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
> >                                  unsigned nr)
> >  {
> > -     void *buf = get_data(size);
> > +     void *buf;
> > +
> > +     /* Write large blob in stream without allocating full buffer. */
> > +     if (type == OBJ_BLOB && size > big_file_threshold) {
> > +             write_stream_blob(nr, size);
> > +             return;
> > +     }
> >
> > +     buf = get_data(size);
> >       if (!dry_run && buf)
> >               write_object(nr, type, buf, size);
> >       else
> > diff --git a/object-file.c b/object-file.c
> > index a8be899481..06c1693675 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
> > +                            int fd, unsigned char *compressed,
> > +                            int compressed_len, const void *buf,
> > +                            size_t len, int flush)
> > +{
> > +     int ret;
> > +
> > +     stream->next_in = (void *)buf;
> > +     stream->avail_in = len;
> > +     do {
> > +             unsigned char *in0 = stream->next_in;
> > +             ret = git_deflate(stream, flush);
> > +             the_hash_algo->update_fn(c, in0, stream->next_in - in0);
> > +             if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
> > +                     die(_("unable to write loose object file"));
> > +             stream->next_out = compressed;
> > +             stream->avail_out = compressed_len;
> > +     } while (ret == Z_OK);
> > +
> > +     return ret;
> > +}
> > +
> >  static int write_loose_object(const struct object_id *oid, char *hdr,
> >                             int hdrlen, const void *buf, unsigned long len,
> >                             time_t mtime)
> > @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       the_hash_algo->update_fn(&c, hdr, hdrlen);
> >
> >       /* Then the data itself.. */
> > -     stream.next_in = (void *)buf;
> > -     stream.avail_in = len;
> > -     do {
> > -             unsigned char *in0 = stream.next_in;
> > -             ret = git_deflate(&stream, Z_FINISH);
> > -             the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> > -             if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> > -                     die(_("unable to write loose object file"));
> > -             stream.next_out = compressed;
> > -             stream.avail_out = sizeof(compressed);
> > -     } while (ret == Z_OK);
> > +     ret = write_object_buffer(&stream, &c, fd, compressed,
> > +                               sizeof(compressed), buf, len,
> > +                               Z_FINISH);
> >
> >       if (ret != Z_STREAM_END)
> >               die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> > @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
> >       return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
> >  }
> >
> > +int write_stream_object_file(struct git_zstream_reader *reader,
> > +                          unsigned long len, const char *type,
> > +                          struct object_id *oid,
> > +                          int dry_run)
> > +{
> > +     git_zstream istream, ostream;
> > +     unsigned char buf[8192], compressed[4096];
> > +     char hdr[MAX_HEADER_LEN];
> > +     int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
> > +     int ret = 0;
> > +     git_hash_ctx c;
> > +     struct strbuf tmp_file = STRBUF_INIT;
> > +     struct strbuf filename = STRBUF_INIT;
> > +
> > +     /* Write tmpfile in objects dir, because oid is unknown */
> > +     if (!dry_run) {
> > +             strbuf_addstr(&filename, the_repository->objects->odb->path);
> > +             strbuf_addch(&filename, '/');
> > +             fd = create_tmpfile(&tmp_file, filename.buf);
> > +             if (fd < 0) {
> > +                     if (errno == EACCES)
> > +                             ret = error(_("insufficient permission for adding an object to repository database %s"),
> > +                                     get_object_directory());
> > +                     else
> > +                             ret = error_errno(_("unable to create temporary file"));
> > +                     goto cleanup;
> > +             }
> > +     }
> > +
> > +     memset(&istream, 0, sizeof(istream));
> > +     istream.next_out = buf;
> > +     istream.avail_out = sizeof(buf);
> > +     git_inflate_init(&istream);
> > +
> > +     if (!dry_run) {
> > +             /* Set it up */
> > +             git_deflate_init(&ostream, zlib_compression_level);
> > +             ostream.next_out = compressed;
> > +             ostream.avail_out = sizeof(compressed);
> > +             the_hash_algo->init_fn(&c);
> > +
> > +             /* First header */
> > +             hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
> > +                             (uintmax_t)len) + 1;
> > +             ostream.next_in = (unsigned char *)hdr;
> > +             ostream.avail_in = hdrlen;
> > +             while (git_deflate(&ostream, 0) == Z_OK)
> > +                     ; /* nothing */
> > +             the_hash_algo->update_fn(&c, hdr, hdrlen);
> > +     }
> > +
> > +     /* Then the data itself */
> > +     do {
> > +             unsigned char *last_out = istream.next_out;
> > +             reader->fill(&istream);
> > +             istatus = git_inflate(&istream, 0);
> > +             if (istatus == Z_STREAM_END)
> > +                     flush = Z_FINISH;
> > +             reader->use(&istream);
> > +             if (!dry_run)
> > +                     ostatus = write_object_buffer(&ostream, &c, fd, compressed,
> > +                                                   sizeof(compressed), last_out,
> > +                                                   istream.next_out - last_out,
> > +                                                   flush);
> > +             istream.next_out = buf;
> > +             istream.avail_out = sizeof(buf);
> > +     } while (istatus == Z_OK);
> > +
> > +     if (istream.total_out != len || istatus != Z_STREAM_END)
> > +             die( _("inflate returned %d"), istatus);
> > +     git_inflate_end(&istream);
> > +
> > +     if (dry_run)
> > +             goto cleanup;
> > +
> > +     if (ostatus != Z_STREAM_END)
> > +             die(_("unable to deflate new object (%d)"), ostatus);
> > +     ostatus = git_deflate_end_gently(&ostream);
> > +     if (ostatus != Z_OK)
> > +             die(_("deflateEnd on object failed (%d)"), ostatus);
> > +     the_hash_algo->final_fn(oid->hash, &c);
> > +     close_loose_object(fd);
> > +
> > +     /* We get the oid now */
> > +     loose_object_path(the_repository, &filename, oid);
> > +
> > +     dirlen = directory_size(filename.buf);
> > +     if (dirlen) {
> > +             struct strbuf dir = STRBUF_INIT;
> > +             /*
> > +              * Make sure the directory exists; note that the contents
> > +              * of the buffer are undefined after mkstemp returns an
> > +              * error, so we have to rewrite the whole buffer from
> > +              * scratch.
> > +              */
> > +             strbuf_add(&dir, filename.buf, dirlen - 1);
> > +             if (mkdir(dir.buf, 0777) && errno != EEXIST) {
> > +                     unlink_or_warn(tmp_file.buf);
> > +                     strbuf_release(&dir);
> > +                     ret = -1;
> > +                     goto cleanup;
> > +             }
> > +             strbuf_release(&dir);
> > +     }
> > +
> > +     ret = finalize_object_file(tmp_file.buf, filename.buf);
> > +
> > +cleanup:
> > +     strbuf_release(&tmp_file);
> > +     strbuf_release(&filename);
> > +     return ret;
> > +}
> > +
> >  int hash_object_file_literally(const void *buf, unsigned long len,
> >                              const char *type, struct object_id *oid,
> >                              unsigned flags)
> > diff --git a/object-store.h b/object-store.h
> > index d24915ced1..12b113ef93 100644
> > --- a/object-store.h
> > +++ b/object-store.h
> > @@ -33,6 +33,11 @@ struct object_directory {
> >       char *path;
> >  };
> >
> > +struct git_zstream_reader {
> > +     void (*fill)(struct git_zstream *);
> > +     void (*use)(struct git_zstream *);
> > +};
> > +
> >  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
> >       struct object_directory *, 1, fspathhash, fspatheq)
> >
> > @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
> >  int write_object_file(const void *buf, unsigned long len,
> >                     const char *type, struct object_id *oid);
> >
> > +int write_stream_object_file(struct git_zstream_reader *reader,
> > +                          unsigned long len, const char *type,
> > +                          struct object_id *oid, int dry_run);
> > +
> >  int hash_object_file_literally(const void *buf, unsigned long len,
> >                              const char *type, struct object_id *oid,
> >                              unsigned flags);
> > diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
> > new file mode 100755
> > index 0000000000..7e63dfc0db
> > --- /dev/null
> > +++ b/t/t5590-receive-unpack-objects.sh
> > @@ -0,0 +1,92 @@
> > +#!/bin/sh
> > +#
> > +# Copyright (c) 2021 Han Xin
> > +#
> > +
> > +test_description='Test unpack-objects when receive pack'
> > +
> > +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> > +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> > +
> > +. ./test-lib.sh
> > +
> > +test_expect_success "create commit with big blobs (1.5 MB)" '
> > +     test-tool genrandom foo 1500000 >big-blob &&
> > +     test_commit --append foo big-blob &&
> > +     test-tool genrandom bar 1500000 >big-blob &&
> > +     test_commit --append bar big-blob &&
> > +     (
> > +             cd .git &&
> > +             find objects/?? -type f | sort
> > +     ) >expect &&
> > +     git repack -ad
> > +'
> > +
> > +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> > +     GIT_ALLOC_LIMIT=1m &&
> > +     export GIT_ALLOC_LIMIT
> > +'
> > +
> > +test_expect_success 'prepare dest repository' '
> > +     git init --bare dest.git &&
> > +     git -C dest.git config core.bigFileThreshold 2m &&
> > +     git -C dest.git config receive.unpacklimit 100
> > +'
> > +
> > +test_expect_success 'fail to push: cannot allocate' '
> > +     test_must_fail git push dest.git HEAD 2>err &&
> > +     test_i18ngrep "remote: fatal: attempting to allocate" err &&
> > +     (
> > +             cd dest.git &&
> > +             find objects/?? -type f | sort
> > +     ) >actual &&
> > +     ! test_cmp expect actual
> > +'
> > +
> > +test_expect_success 'set a lower bigfile threshold' '
> > +     git -C dest.git config core.bigFileThreshold 1m
> > +'
> > +
> > +test_expect_success 'unpack big object in stream' '
> > +     git push dest.git HEAD &&
> > +     git -C dest.git fsck &&
> > +     (
> > +             cd dest.git &&
> > +             find objects/?? -type f | sort
> > +     ) >actual &&
> > +     test_cmp expect actual
> > +'
> > +
> > +test_expect_success 'setup for unpack-objects dry-run test' '
> > +     PACK=$(echo main | git pack-objects --progress --revs test) &&
> > +     unset GIT_ALLOC_LIMIT &&
> > +     git init --bare unpack-test.git
> > +'
> > +
> > +test_expect_success 'unpack-objects dry-run with large threshold' '
> > +     (
> > +             cd unpack-test.git &&
> > +             git config core.bigFileThreshold 2m &&
> > +             git unpack-objects -n <../test-$PACK.pack
> > +     ) &&
> > +     (
> > +             cd unpack-test.git &&
> > +             find objects/ -type f
> > +     ) >actual &&
> > +     test_must_be_empty actual
> > +'
> > +
> > +test_expect_success 'unpack-objects dry-run with small threshold' '
> > +     (
> > +             cd unpack-test.git &&
> > +             git config core.bigFileThreshold 1m &&
> > +             git unpack-objects -n <../test-$PACK.pack
> > +     ) &&
> > +     (
> > +             cd unpack-test.git &&
> > +             find objects/ -type f
> > +     ) >actual &&
> > +     test_must_be_empty actual
> > +'
> > +
> > +test_done
>


* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-21  3:42   ` Han Xin
@ 2021-10-21 22:47     ` Philip Oakley
  0 siblings, 0 replies; 57+ messages in thread
From: Philip Oakley @ 2021-10-21 22:47 UTC (permalink / raw)
  To: Han Xin; +Cc: Han Xin, Jiang Xin, Git List

On 21/10/2021 04:42, Han Xin wrote:
>>> +static void write_stream_blob(unsigned nr, unsigned long size)
>> Can we use size_t for the `size`, and possibly `nr`, to improve
>> compatibility with Windows systems where unsigned long is only 32 bits?
>>
>> There has been some work in the past on providing large file support on
>> Windows, which requires numerous long -> size_t changes.
>>
>> Philip
> Thanks for your review. I'm not sure whether I should make this change in
> this patch; it would also change the types used in `unpack_one()`,
> `unpack_non_delta_entry()`, `write_object()` and many others.
>
I was mainly raising the issue regarding the 4GB (sometimes 2GB)
limitations on Windows, which have been a problem for many years.

I had been thinking of not changing `nr` (the number-of-objects limit),
as 2G objects is hopefully already sufficient, even for the largest of
repos (though IIUC their index file size did break the 32-bit size limit).

Staying with the existing types won't make the situation any worse, so
from that perspective the change isn't needed.
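
A minimal standalone illustration of that LLP64 hazard (assuming a
64-bit Windows toolchain, where `long` stays 32 bits while `size_t`
is 64 bits):

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        size_t sz = (size_t)5 << 30;          /* a 5 GiB object size */
        unsigned long ul = (unsigned long)sz; /* truncates on LLP64 */

        /* On LLP64, ul silently becomes 1 GiB (5 GiB modulo 4 GiB);
         * on LP64 (Linux, macOS) both values print the same. */
        printf("size_t: %zu, unsigned long: %lu\n", sz, ul);
        return 0;
    }
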
--
Philip

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
  2021-10-19  7:37 ` Han Xin
  2021-10-20 14:42 ` Philip Oakley
@ 2021-11-03  1:48 ` Han Xin
  2021-11-03 10:07   ` Philip Oakley
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
                   ` (11 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-03  1:48 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

Any more suggestions?

Han Xin <chiyutianyi@gmail.com> 于2021年10月9日周六 下午4:21写道:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", will allocate full memory for
> the whole size of the unpacked object and write the buffer to loose file
> on disk. This may lead to OOM for the git-unpack-objects process when
> unpacking a very large object.
>
> In function "unpack_delta_entry()", will also allocate full memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack large
> object to file in stream, and use the setting of "core.bigFileThreshold" as
> the threshold for large object.
>
> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          |  41 +++++++-
>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>  object-store.h                    |   9 ++
>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>  4 files changed, 279 insertions(+), 12 deletions(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..8ac77e60a8 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>         }
>  }
>
> +static void fill_stream(struct git_zstream *stream)
> +{
> +       stream->next_in = fill(1);
> +       stream->avail_in = len;
> +}
> +
> +static void use_stream(struct git_zstream *stream)
> +{
> +       use(len - stream->avail_in);
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +       struct git_zstream_reader reader;
> +       struct object_id *oid = &obj_list[nr].oid;
> +
> +       reader.fill = &fill_stream;
> +       reader.use = &use_stream;
> +
> +       if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
> +                                    oid, dry_run))
> +               die("failed to write object in stream");
> +       if (strict && !dry_run) {
> +               struct blob *blob = lookup_blob(the_repository, oid);
> +               if (blob)
> +                       blob->object.flags |= FLAG_WRITTEN;
> +               else
> +                       die("invalid blob object from stream");
> +       }
> +       obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>                                    unsigned nr)
>  {
> -       void *buf = get_data(size);
> +       void *buf;
> +
> +       /* Write large blob in stream without allocating full buffer. */
> +       if (type == OBJ_BLOB && size > big_file_threshold) {
> +               write_stream_blob(nr, size);
> +               return;
> +       }
>
> +       buf = get_data(size);
>         if (!dry_run && buf)
>                 write_object(nr, type, buf, size);
>         else
> diff --git a/object-file.c b/object-file.c
> index a8be899481..06c1693675 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>         return fd;
>  }
>
> +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
> +                              int fd, unsigned char *compressed,
> +                              int compressed_len, const void *buf,
> +                              size_t len, int flush)
> +{
> +       int ret;
> +
> +       stream->next_in = (void *)buf;
> +       stream->avail_in = len;
> +       do {
> +               unsigned char *in0 = stream->next_in;
> +               ret = git_deflate(stream, flush);
> +               the_hash_algo->update_fn(c, in0, stream->next_in - in0);
> +               if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
> +                       die(_("unable to write loose object file"));
> +               stream->next_out = compressed;
> +               stream->avail_out = compressed_len;
> +       } while (ret == Z_OK);
> +
> +       return ret;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>                               int hdrlen, const void *buf, unsigned long len,
>                               time_t mtime)
> @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> -       stream.next_in = (void *)buf;
> -       stream.avail_in = len;
> -       do {
> -               unsigned char *in0 = stream.next_in;
> -               ret = git_deflate(&stream, Z_FINISH);
> -               the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> -                       die(_("unable to write loose object file"));
> -               stream.next_out = compressed;
> -               stream.avail_out = sizeof(compressed);
> -       } while (ret == Z_OK);
> +       ret = write_object_buffer(&stream, &c, fd, compressed,
> +                                 sizeof(compressed), buf, len,
> +                                 Z_FINISH);
>
>         if (ret != Z_STREAM_END)
>                 die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
>         return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
>  }
>
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +                            unsigned long len, const char *type,
> +                            struct object_id *oid,
> +                            int dry_run)
> +{
> +       git_zstream istream, ostream;
> +       unsigned char buf[8192], compressed[4096];
> +       char hdr[MAX_HEADER_LEN];
> +       int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
> +       int ret = 0;
> +       git_hash_ctx c;
> +       struct strbuf tmp_file = STRBUF_INIT;
> +       struct strbuf filename = STRBUF_INIT;
> +
> +       /* Write tmpfile in objects dir, because oid is unknown */
> +       if (!dry_run) {
> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
> +               strbuf_addch(&filename, '/');
> +               fd = create_tmpfile(&tmp_file, filename.buf);
> +               if (fd < 0) {
> +                       if (errno == EACCES)
> +                               ret = error(_("insufficient permission for adding an object to repository database %s"),
> +                                       get_object_directory());
> +                       else
> +                               ret = error_errno(_("unable to create temporary file"));
> +                       goto cleanup;
> +               }
> +       }
> +
> +       memset(&istream, 0, sizeof(istream));
> +       istream.next_out = buf;
> +       istream.avail_out = sizeof(buf);
> +       git_inflate_init(&istream);
> +
> +       if (!dry_run) {
> +               /* Set it up */
> +               git_deflate_init(&ostream, zlib_compression_level);
> +               ostream.next_out = compressed;
> +               ostream.avail_out = sizeof(compressed);
> +               the_hash_algo->init_fn(&c);
> +
> +               /* First header */
> +               hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
> +                               (uintmax_t)len) + 1;
> +               ostream.next_in = (unsigned char *)hdr;
> +               ostream.avail_in = hdrlen;
> +               while (git_deflate(&ostream, 0) == Z_OK)
> +                       ; /* nothing */
> +               the_hash_algo->update_fn(&c, hdr, hdrlen);
> +       }
> +
> +       /* Then the data itself */
> +       do {
> +               unsigned char *last_out = istream.next_out;
> +               reader->fill(&istream);
> +               istatus = git_inflate(&istream, 0);
> +               if (istatus == Z_STREAM_END)
> +                       flush = Z_FINISH;
> +               reader->use(&istream);
> +               if (!dry_run)
> +                       ostatus = write_object_buffer(&ostream, &c, fd, compressed,
> +                                                     sizeof(compressed), last_out,
> +                                                     istream.next_out - last_out,
> +                                                     flush);
> +               istream.next_out = buf;
> +               istream.avail_out = sizeof(buf);
> +       } while (istatus == Z_OK);
> +
> +       if (istream.total_out != len || istatus != Z_STREAM_END)
> +               die( _("inflate returned %d"), istatus);
> +       git_inflate_end(&istream);
> +
> +       if (dry_run)
> +               goto cleanup;
> +
> +       if (ostatus != Z_STREAM_END)
> +               die(_("unable to deflate new object (%d)"), ostatus);
> +       ostatus = git_deflate_end_gently(&ostream);
> +       if (ostatus != Z_OK)
> +               die(_("deflateEnd on object failed (%d)"), ostatus);
> +       the_hash_algo->final_fn(oid->hash, &c);
> +       close_loose_object(fd);
> +
> +       /* We get the oid now */
> +       loose_object_path(the_repository, &filename, oid);
> +
> +       dirlen = directory_size(filename.buf);
> +       if (dirlen) {
> +               struct strbuf dir = STRBUF_INIT;
> +               /*
> +                * Make sure the directory exists; note that the contents
> +                * of the buffer are undefined after mkstemp returns an
> +                * error, so we have to rewrite the whole buffer from
> +                * scratch.
> +                */
> +               strbuf_add(&dir, filename.buf, dirlen - 1);
> +               if (mkdir(dir.buf, 0777) && errno != EEXIST) {
> +                       unlink_or_warn(tmp_file.buf);
> +                       strbuf_release(&dir);
> +                       ret = -1;
> +                       goto cleanup;
> +               }
> +               strbuf_release(&dir);
> +       }
> +
> +       ret = finalize_object_file(tmp_file.buf, filename.buf);
> +
> +cleanup:
> +       strbuf_release(&tmp_file);
> +       strbuf_release(&filename);
> +       return ret;
> +}
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>                                const char *type, struct object_id *oid,
>                                unsigned flags)
> diff --git a/object-store.h b/object-store.h
> index d24915ced1..12b113ef93 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -33,6 +33,11 @@ struct object_directory {
>         char *path;
>  };
>
> +struct git_zstream_reader {
> +       void (*fill)(struct git_zstream *);
> +       void (*use)(struct git_zstream *);
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>         struct object_directory *, 1, fspathhash, fspatheq)
>
> @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
>  int write_object_file(const void *buf, unsigned long len,
>                       const char *type, struct object_id *oid);
>
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +                            unsigned long len, const char *type,
> +                            struct object_id *oid, int dry_run);
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>                                const char *type, struct object_id *oid,
>                                unsigned flags);
> diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
> new file mode 100755
> index 0000000000..7e63dfc0db
> --- /dev/null
> +++ b/t/t5590-receive-unpack-objects.sh
> @@ -0,0 +1,92 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects when receive pack'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +test_expect_success "create commit with big blobs (1.5 MB)" '
> +       test-tool genrandom foo 1500000 >big-blob &&
> +       test_commit --append foo big-blob &&
> +       test-tool genrandom bar 1500000 >big-blob &&
> +       test_commit --append bar big-blob &&
> +       (
> +               cd .git &&
> +               find objects/?? -type f | sort
> +       ) >expect &&
> +       git repack -ad
> +'
> +
> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> +       GIT_ALLOC_LIMIT=1m &&
> +       export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'prepare dest repository' '
> +       git init --bare dest.git &&
> +       git -C dest.git config core.bigFileThreshold 2m &&
> +       git -C dest.git config receive.unpacklimit 100
> +'
> +
> +test_expect_success 'fail to push: cannot allocate' '
> +       test_must_fail git push dest.git HEAD 2>err &&
> +       test_i18ngrep "remote: fatal: attempting to allocate" err &&
> +       (
> +               cd dest.git &&
> +               find objects/?? -type f | sort
> +       ) >actual &&
> +       ! test_cmp expect actual
> +'
> +
> +test_expect_success 'set a lower bigfile threshold' '
> +       git -C dest.git config core.bigFileThreshold 1m
> +'
> +
> +test_expect_success 'unpack big object in stream' '
> +       git push dest.git HEAD &&
> +       git -C dest.git fsck &&
> +       (
> +               cd dest.git &&
> +               find objects/?? -type f | sort
> +       ) >actual &&
> +       test_cmp expect actual
> +'
> +
> +test_expect_success 'setup for unpack-objects dry-run test' '
> +       PACK=$(echo main | git pack-objects --progress --revs test) &&
> +       unset GIT_ALLOC_LIMIT &&
> +       git init --bare unpack-test.git
> +'
> +
> +test_expect_success 'unpack-objects dry-run with large threshold' '
> +       (
> +               cd unpack-test.git &&
> +               git config core.bigFileThreshold 2m &&
> +               git unpack-objects -n <../test-$PACK.pack
> +       ) &&
> +       (
> +               cd unpack-test.git &&
> +               find objects/ -type f
> +       ) >actual &&
> +       test_must_be_empty actual
> +'
> +
> +test_expect_success 'unpack-objects dry-run with small threshold' '
> +       (
> +               cd unpack-test.git &&
> +               git config core.bigFileThreshold 1m &&
> +               git unpack-objects -n <../test-$PACK.pack
> +       ) &&
> +       (
> +               cd unpack-test.git &&
> +               find objects/ -type f
> +       ) >actual &&
> +       test_must_be_empty actual
> +'
> +
> +test_done
> --
> 2.33.0.1.g09a6bb964f.dirty
>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-11-03  1:48 ` Han Xin
@ 2021-11-03 10:07   ` Philip Oakley
  0 siblings, 0 replies; 57+ messages in thread
From: Philip Oakley @ 2021-11-03 10:07 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

(replies to the alibaba-inc.com aren't getting through for me)

On 03/11/2021 01:48, Han Xin wrote:
> Any more suggestions?
>
> Han Xin <chiyutianyi@gmail.com> 于2021年10月9日周六 下午4:21写道:
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>
>> When calling "unpack_non_delta_entry()", will allocate full memory for
>> the whole size of the unpacked object and write the buffer to loose file
>> on disk. This may lead to OOM for the git-unpack-objects process when
>> unpacking a very large object.

Is it possible to split the patch into smaller pieces, taking each item
separately?

For large files (as above), it should be possible to stream the
unpacking directly to disk, in the same way that the zlib reading is
chunked. However, having the same 'code' in two places would need to be
addressed (the DRY principle).

At the moment, on LLP64 systems (Windows), there is already a long
(32-bit) vs size_t (64-bit) problem there (in the zlib stream), and the
size_t problem then permeates the wider codebase.

The normal Git file operations do tend to memory-map whole files, but
here it looks like you can bypass that.
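
As a shape reference only (standalone zlib rather than git's wrappers,
and the output file name is illustrative), chunked inflate-to-disk
looks like this:

    #include <stdio.h>
    #include <zlib.h>

    /* Inflate zlib data from stdin to a file in fixed-size chunks,
     * never holding the whole object in memory. */
    int main(void)
    {
        unsigned char in[4096], out[4096];
        z_stream s = { 0 };
        FILE *dst = fopen("streamed.tmp", "wb");
        int ret = Z_OK;

        if (!dst || inflateInit(&s) != Z_OK)
            return 1;
        while (ret != Z_STREAM_END) {
            if (!s.avail_in) {
                s.avail_in = fread(in, 1, sizeof(in), stdin);
                s.next_in = in;
                if (!s.avail_in)
                    break;              /* truncated input */
            }
            s.next_out = out;
            s.avail_out = sizeof(out);
            ret = inflate(&s, Z_NO_FLUSH);
            if (ret != Z_OK && ret != Z_STREAM_END)
                break;
            fwrite(out, 1, sizeof(out) - s.avail_out, dst);
        }
        inflateEnd(&s);
        return fclose(dst) || ret != Z_STREAM_END;
    }
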
>>
>> In function "unpack_delta_entry()", will also allocate full memory to
>> buffer the whole delta, but since there will be no delta for an object
>> larger than "core.bigFileThreshold", this issue is moderate.

What does 'moderate' mean here? Does it mean there is a simple test that
allows you to sidestep the whole problem?

>>
>> To resolve the OOM issue in "git-unpack-objects", we can unpack large
>> object to file in stream, and use the setting of "core.bigFileThreshold" as
>> the threshold for large object.

Is this "core.bigFileThreshold" the core element? If so, it is too far
down the commit message. The readers have already (potentially) misread
the message and reacted too soon.  Perhaps: "use `core.bigFileThreshold`
to avoid mmap OOM limits when unpacking".

--
Philip
>>
>> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>> ---
>>  builtin/unpack-objects.c          |  41 +++++++-
>>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>>  object-store.h                    |   9 ++
>>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>>  4 files changed, 279 insertions(+), 12 deletions(-)
>>  create mode 100755 t/t5590-receive-unpack-objects.sh
>>
>> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>> index 4a9466295b..8ac77e60a8 100644
>> --- a/builtin/unpack-objects.c
>> +++ b/builtin/unpack-objects.c
>> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>>         }
>>  }
>>
>> +static void fill_stream(struct git_zstream *stream)
>> +{
>> +       stream->next_in = fill(1);
>> +       stream->avail_in = len;
>> +}
>> +
>> +static void use_stream(struct git_zstream *stream)
>> +{
>> +       use(len - stream->avail_in);
>> +}
>> +
>> +static void write_stream_blob(unsigned nr, unsigned long size)
>> +{
>> +       struct git_zstream_reader reader;
>> +       struct object_id *oid = &obj_list[nr].oid;
>> +
>> +       reader.fill = &fill_stream;
>> +       reader.use = &use_stream;
>> +
>> +       if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
>> +                                    oid, dry_run))
>> +               die("failed to write object in stream");
>> +       if (strict && !dry_run) {
>> +               struct blob *blob = lookup_blob(the_repository, oid);
>> +               if (blob)
>> +                       blob->object.flags |= FLAG_WRITTEN;
>> +               else
>> +                       die("invalid blob object from stream");
>> +       }
>> +       obj_list[nr].obj = NULL;
>> +}
>> +
>>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>>                                    unsigned nr)
>>  {
>> -       void *buf = get_data(size);
>> +       void *buf;
>> +
>> +       /* Write large blob in stream without allocating full buffer. */
>> +       if (type == OBJ_BLOB && size > big_file_threshold) {
>> +               write_stream_blob(nr, size);
>> +               return;
>> +       }
>>
>> +       buf = get_data(size);
>>         if (!dry_run && buf)
>>                 write_object(nr, type, buf, size);
>>         else
>> diff --git a/object-file.c b/object-file.c
>> index a8be899481..06c1693675 100644
>> --- a/object-file.c
>> +++ b/object-file.c
>> @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>>         return fd;
>>  }
>>
>> +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
>> +                              int fd, unsigned char *compressed,
>> +                              int compressed_len, const void *buf,
>> +                              size_t len, int flush)
>> +{
>> +       int ret;
>> +
>> +       stream->next_in = (void *)buf;
>> +       stream->avail_in = len;
>> +       do {
>> +               unsigned char *in0 = stream->next_in;
>> +               ret = git_deflate(stream, flush);
>> +               the_hash_algo->update_fn(c, in0, stream->next_in - in0);
>> +               if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
>> +                       die(_("unable to write loose object file"));
>> +               stream->next_out = compressed;
>> +               stream->avail_out = compressed_len;
>> +       } while (ret == Z_OK);
>> +
>> +       return ret;
>> +}
>> +
>>  static int write_loose_object(const struct object_id *oid, char *hdr,
>>                               int hdrlen, const void *buf, unsigned long len,
>>                               time_t mtime)
>> @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>>
>>         /* Then the data itself.. */
>> -       stream.next_in = (void *)buf;
>> -       stream.avail_in = len;
>> -       do {
>> -               unsigned char *in0 = stream.next_in;
>> -               ret = git_deflate(&stream, Z_FINISH);
>> -               the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
>> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>> -                       die(_("unable to write loose object file"));
>> -               stream.next_out = compressed;
>> -               stream.avail_out = sizeof(compressed);
>> -       } while (ret == Z_OK);
>> +       ret = write_object_buffer(&stream, &c, fd, compressed,
>> +                                 sizeof(compressed), buf, len,
>> +                                 Z_FINISH);
>>
>>         if (ret != Z_STREAM_END)
>>                 die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
>> @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
>>         return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
>>  }
>>
>> +int write_stream_object_file(struct git_zstream_reader *reader,
>> +                            unsigned long len, const char *type,
>> +                            struct object_id *oid,
>> +                            int dry_run)
>> +{
>> +       git_zstream istream, ostream;
>> +       unsigned char buf[8192], compressed[4096];
>> +       char hdr[MAX_HEADER_LEN];
>> +       int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
>> +       int ret = 0;
>> +       git_hash_ctx c;
>> +       struct strbuf tmp_file = STRBUF_INIT;
>> +       struct strbuf filename = STRBUF_INIT;
>> +
>> +       /* Write tmpfile in objects dir, because oid is unknown */
>> +       if (!dry_run) {
>> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
>> +               strbuf_addch(&filename, '/');
>> +               fd = create_tmpfile(&tmp_file, filename.buf);
>> +               if (fd < 0) {
>> +                       if (errno == EACCES)
>> +                               ret = error(_("insufficient permission for adding an object to repository database %s"),
>> +                                       get_object_directory());
>> +                       else
>> +                               ret = error_errno(_("unable to create temporary file"));
>> +                       goto cleanup;
>> +               }
>> +       }
>> +
>> +       memset(&istream, 0, sizeof(istream));
>> +       istream.next_out = buf;
>> +       istream.avail_out = sizeof(buf);
>> +       git_inflate_init(&istream);
>> +
>> +       if (!dry_run) {
>> +               /* Set it up */
>> +               git_deflate_init(&ostream, zlib_compression_level);
>> +               ostream.next_out = compressed;
>> +               ostream.avail_out = sizeof(compressed);
>> +               the_hash_algo->init_fn(&c);
>> +
>> +               /* First header */
>> +               hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
>> +                               (uintmax_t)len) + 1;
>> +               ostream.next_in = (unsigned char *)hdr;
>> +               ostream.avail_in = hdrlen;
>> +               while (git_deflate(&ostream, 0) == Z_OK)
>> +                       ; /* nothing */
>> +               the_hash_algo->update_fn(&c, hdr, hdrlen);
>> +       }
>> +
>> +       /* Then the data itself */
>> +       do {
>> +               unsigned char *last_out = istream.next_out;
>> +               reader->fill(&istream);
>> +               istatus = git_inflate(&istream, 0);
>> +               if (istatus == Z_STREAM_END)
>> +                       flush = Z_FINISH;
>> +               reader->use(&istream);
>> +               if (!dry_run)
>> +                       ostatus = write_object_buffer(&ostream, &c, fd, compressed,
>> +                                                     sizeof(compressed), last_out,
>> +                                                     istream.next_out - last_out,
>> +                                                     flush);
>> +               istream.next_out = buf;
>> +               istream.avail_out = sizeof(buf);
>> +       } while (istatus == Z_OK);
>> +
>> +       if (istream.total_out != len || istatus != Z_STREAM_END)
>> +               die( _("inflate returned %d"), istatus);
>> +       git_inflate_end(&istream);
>> +
>> +       if (dry_run)
>> +               goto cleanup;
>> +
>> +       if (ostatus != Z_STREAM_END)
>> +               die(_("unable to deflate new object (%d)"), ostatus);
>> +       ostatus = git_deflate_end_gently(&ostream);
>> +       if (ostatus != Z_OK)
>> +               die(_("deflateEnd on object failed (%d)"), ostatus);
>> +       the_hash_algo->final_fn(oid->hash, &c);
>> +       close_loose_object(fd);
>> +
>> +       /* We get the oid now */
>> +       loose_object_path(the_repository, &filename, oid);
>> +
>> +       dirlen = directory_size(filename.buf);
>> +       if (dirlen) {
>> +               struct strbuf dir = STRBUF_INIT;
>> +               /*
>> +                * Make sure the directory exists; note that the contents
>> +                * of the buffer are undefined after mkstemp returns an
>> +                * error, so we have to rewrite the whole buffer from
>> +                * scratch.
>> +                */
>> +               strbuf_add(&dir, filename.buf, dirlen - 1);
>> +               if (mkdir(dir.buf, 0777) && errno != EEXIST) {
>> +                       unlink_or_warn(tmp_file.buf);
>> +                       strbuf_release(&dir);
>> +                       ret = -1;
>> +                       goto cleanup;
>> +               }
>> +               strbuf_release(&dir);
>> +       }
>> +
>> +       ret = finalize_object_file(tmp_file.buf, filename.buf);
>> +
>> +cleanup:
>> +       strbuf_release(&tmp_file);
>> +       strbuf_release(&filename);
>> +       return ret;
>> +}
>> +
>>  int hash_object_file_literally(const void *buf, unsigned long len,
>>                                const char *type, struct object_id *oid,
>>                                unsigned flags)
>> diff --git a/object-store.h b/object-store.h
>> index d24915ced1..12b113ef93 100644
>> --- a/object-store.h
>> +++ b/object-store.h
>> @@ -33,6 +33,11 @@ struct object_directory {
>>         char *path;
>>  };
>>
>> +struct git_zstream_reader {
>> +       void (*fill)(struct git_zstream *);
>> +       void (*use)(struct git_zstream *);
>> +};
>> +
>>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>>         struct object_directory *, 1, fspathhash, fspatheq)
>>
>> @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
>>  int write_object_file(const void *buf, unsigned long len,
>>                       const char *type, struct object_id *oid);
>>
>> +int write_stream_object_file(struct git_zstream_reader *reader,
>> +                            unsigned long len, const char *type,
>> +                            struct object_id *oid, int dry_run);
>> +
>>  int hash_object_file_literally(const void *buf, unsigned long len,
>>                                const char *type, struct object_id *oid,
>>                                unsigned flags);
>> diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
>> new file mode 100755
>> index 0000000000..7e63dfc0db
>> --- /dev/null
>> +++ b/t/t5590-receive-unpack-objects.sh
>> @@ -0,0 +1,92 @@
>> +#!/bin/sh
>> +#
>> +# Copyright (c) 2021 Han Xin
>> +#
>> +
>> +test_description='Test unpack-objects when receive pack'
>> +
>> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
>> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
>> +
>> +. ./test-lib.sh
>> +
>> +test_expect_success "create commit with big blobs (1.5 MB)" '
>> +       test-tool genrandom foo 1500000 >big-blob &&
>> +       test_commit --append foo big-blob &&
>> +       test-tool genrandom bar 1500000 >big-blob &&
>> +       test_commit --append bar big-blob &&
>> +       (
>> +               cd .git &&
>> +               find objects/?? -type f | sort
>> +       ) >expect &&
>> +       git repack -ad
>> +'
>> +
>> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
>> +       GIT_ALLOC_LIMIT=1m &&
>> +       export GIT_ALLOC_LIMIT
>> +'
>> +
>> +test_expect_success 'prepare dest repository' '
>> +       git init --bare dest.git &&
>> +       git -C dest.git config core.bigFileThreshold 2m &&
>> +       git -C dest.git config receive.unpacklimit 100
>> +'
>> +
>> +test_expect_success 'fail to push: cannot allocate' '
>> +       test_must_fail git push dest.git HEAD 2>err &&
>> +       test_i18ngrep "remote: fatal: attempting to allocate" err &&
>> +       (
>> +               cd dest.git &&
>> +               find objects/?? -type f | sort
>> +       ) >actual &&
>> +       ! test_cmp expect actual
>> +'
>> +
>> +test_expect_success 'set a lower bigfile threshold' '
>> +       git -C dest.git config core.bigFileThreshold 1m
>> +'
>> +
>> +test_expect_success 'unpack big object in stream' '
>> +       git push dest.git HEAD &&
>> +       git -C dest.git fsck &&
>> +       (
>> +               cd dest.git &&
>> +               find objects/?? -type f | sort
>> +       ) >actual &&
>> +       test_cmp expect actual
>> +'
>> +
>> +test_expect_success 'setup for unpack-objects dry-run test' '
>> +       PACK=$(echo main | git pack-objects --progress --revs test) &&
>> +       unset GIT_ALLOC_LIMIT &&
>> +       git init --bare unpack-test.git
>> +'
>> +
>> +test_expect_success 'unpack-objects dry-run with large threshold' '
>> +       (
>> +               cd unpack-test.git &&
>> +               git config core.bigFileThreshold 2m &&
>> +               git unpack-objects -n <../test-$PACK.pack
>> +       ) &&
>> +       (
>> +               cd unpack-test.git &&
>> +               find objects/ -type f
>> +       ) >actual &&
>> +       test_must_be_empty actual
>> +'
>> +
>> +test_expect_success 'unpack-objects dry-run with small threshold' '
>> +       (
>> +               cd unpack-test.git &&
>> +               git config core.bigFileThreshold 1m &&
>> +               git unpack-objects -n <../test-$PACK.pack
>> +       ) &&
>> +       (
>> +               cd unpack-test.git &&
>> +               find objects/ -type f
>> +       ) >actual &&
>> +       test_must_be_empty actual
>> +'
>> +
>> +test_done
>> --
>> 2.33.0.1.g09a6bb964f.dirty
>>


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (2 preceding siblings ...)
  2021-11-03  1:48 ` Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  4:59   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
                   ` (10 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Refactor write_loose_object() to support an input stream, in the same
way that zlib reading is chunked.

By using "in_stream" instead of "void *buf", we no longer need to
allocate all the memory in advance; only part of the contents is read
on each call to "in_stream.read()".

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  5 +++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index 02b7970274..1ad2cb579c 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct input_data_from_buffer {
+	const char *buf;
+	unsigned long len;
+};
+
+static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
+{
+	struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
+
+	if (input->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = input->len;
+	input->len = 0;
+	return input->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const char *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream->data, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = (void *)&(struct input_data_from_buffer) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = (void *)&(struct input_data_from_buffer) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct input_data_from_buffer data;
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..f1b67e9100 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const char *(*read)(void* data, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
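
For readers following along, here is one way a caller could implement
the interface above; this is an illustrative sketch only (the chunking
helper below is hypothetical and not part of the patch):

    struct chunked_input {
        const char *buf;
        unsigned long len;
        unsigned long chunk;    /* max bytes handed out per call */
    };

    static const char *read_in_chunks(void *data, unsigned long *len)
    {
        struct chunked_input *in = data;

        if (!in->len) {
            *len = 0;           /* NULL marks end of stream */
            return NULL;
        }
        *len = in->len < in->chunk ? in->len : in->chunk;
        in->buf += *len;
        in->len -= *len;
        return in->buf - *len;
    }

A caller would then set up "struct input_stream in = { .read =
read_in_chunks, .data = &my_chunked_input }" and pass "&in" to
write_loose_object().
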
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (3 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:42   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
                   ` (9 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We will later use "write_loose_object()" to handle large blob objects,
so it needs to work in dry_run mode.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/object-file.c b/object-file.c
index 1ad2cb579c..b0838c847e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1880,9 +1880,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
 
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, struct input_stream *in_stream,
+			      int dry_run,
 			      time_t mtime, unsigned flags)
 {
-	int fd, ret;
+	int fd, ret = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1894,14 +1895,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
+	if (!dry_run) {
+		fd = create_tmpfile(&tmp_file, filename.buf);
+		if (fd < 0) {
+			if (flags & HASH_SILENT)
+				return -1;
+			else if (errno == EACCES)
+				return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			else
+				return error_errno(_("unable to create temporary file"));
+		}
 	}
 
 	/* Set it up */
@@ -1925,7 +1928,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		unsigned char *in0 = stream.next_in;
 		ret = git_deflate(&stream, Z_FINISH);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
 		stream.next_out = compressed;
 		stream.avail_out = sizeof(compressed);
@@ -1943,6 +1946,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
+	if (dry_run)
+		return 0;
+
 	close_loose_object(fd);
 
 	if (mtime) {
@@ -1996,7 +2002,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -2023,7 +2029,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0, 0);
 
 cleanup:
 	free(header);
@@ -2052,7 +2058,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	data.buf = buf;
 	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, mtime, 0);
 	free(buf);
 
 	return ret;
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (4 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:49   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
                   ` (8 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When reading from an input stream, the oid cannot be computed until all
of the data has been read, so it is filled in after reading completes.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index b0838c847e..8393659f0d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1893,7 +1893,13 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const char *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else
+		loose_object_path(the_repository, &filename, oid);
 
 	if (!dry_run) {
 		fd = create_tmpfile(&tmp_file, filename.buf);
@@ -1942,7 +1948,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
@@ -1951,6 +1957,30 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		/* copy oid */
+		oidcpy((struct object_id *)oid, &parano_oid);
+		/* We get the oid now */
+		loose_object_path(the_repository, &filename, oid);
+
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			/*
+			 * Make sure the directory exists; note that the
+			 * contents of the buffer are undefined after mkstemp
+			 * returns an error, so we have to rewrite the whole
+			 * buffer from scratch.
+			 */
+			strbuf_reset(&dir);
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
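
As a toy standalone analogue of this "hash while writing, then name the
file afterwards" flow (crc32 stands in for the object hash, and the
file names are illustrative):

    #include <stdio.h>
    #include <zlib.h>

    int main(void)
    {
        char buf[8192], name[32];
        size_t n;
        unsigned long h = crc32(0L, Z_NULL, 0);
        FILE *out = fopen("tmp_obj", "wb");

        if (!out)
            return 1;
        /* The final name is unknown here, so write to a temporary
         * file while updating the hash over everything written. */
        while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0) {
            h = crc32(h, (const Bytef *)buf, n);
            fwrite(buf, 1, n, out);
        }
        if (fclose(out))
            return 1;
        /* Only now is the content hash known; move into place. */
        snprintf(name, sizeof(name), "%08lx", h);
        return rename("tmp_obj", name) ? 1 : 0;
    }
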
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v2 4/6] object-file.c: read input stream repeatedly in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (5 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:56   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 5/6] object-store.h: add write_loose_object() Han Xin
                   ` (7 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Read the input stream repeatedly in write_loose_object() until reaching
the end, so that a large blob write can be divided into many small blocks.

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/object-file.c b/object-file.c
index 8393659f0d..e333448c54 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1891,7 +1891,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const char *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1927,12 +1927,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream->data, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {
+				stream.next_in = (void *)buf;
+				in0 = (unsigned char *)buf;
+			} else
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
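
For comparison, here is a standalone analogue of this pull-read deflate
loop (raw zlib instead of git's wrappers, with hashing and die()
handling elided; the callback type mirrors input_stream.read):

    #include <stdio.h>
    #include <zlib.h>

    typedef const char *(*read_fn)(void *data, unsigned long *len);

    /* Refill avail_in from the callback whenever it runs dry, and
     * switch to Z_FINISH once the callback returns NULL. */
    static int deflate_from(read_fn read_cb, void *data, FILE *out)
    {
        z_stream s = { 0 };
        unsigned char obuf[4096];
        int flush = Z_NO_FLUSH, ret;

        if (deflateInit(&s, Z_DEFAULT_COMPRESSION) != Z_OK)
            return -1;
        do {
            if (!s.avail_in && flush != Z_FINISH) {
                unsigned long n;
                const char *buf = read_cb(data, &n);
                if (!buf)
                    flush = Z_FINISH;   /* input exhausted */
                else {
                    s.next_in = (Bytef *)buf;
                    /* note: avail_in is uInt, so chunks must stay
                     * well below 4 GiB (the very truncation pitfall
                     * discussed earlier in this thread) */
                    s.avail_in = n;
                }
            }
            s.next_out = obuf;
            s.avail_out = sizeof(obuf);
            ret = deflate(&s, flush);
            fwrite(obuf, 1, sizeof(obuf) - s.avail_out, out);
        } while (ret == Z_OK);
        deflateEnd(&s);
        return ret == Z_STREAM_END ? 0 : -1;
    }
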
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v2 5/6] object-store.h: add write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (6 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

For large loose object files, it should be possible to stream them
directly to disk with "write_loose_object()".
Unlike with "write_object_file()", callers need to implement an
"input_stream" instead of passing a void *buf.

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 8 ++++----
 object-store.h | 5 +++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index e333448c54..60eb29db97 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,10 +1878,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
 	return input->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      int dry_run,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       int dry_run,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret = 0;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index f1b67e9100..f6faa8d6d3 100644
--- a/object-store.h
+++ b/object-store.h
@@ -228,6 +228,11 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       int dry_run,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v2 6/6] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (7 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 5/6] object-store.h: add write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  7:14   ` Jiang Xin
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                   ` (5 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When "unpack_non_delta_entry()" is called, memory for the whole size of
the unpacked object is allocated and the buffer is written to a loose
file on disk. This may lead to OOM for the git-unpack-objects process
when unpacking a very large object.

"unpack_delta_entry()" will also allocate memory to buffer the whole
delta, but since there will be no delta for an object larger than
"core.bigFileThreshold", that issue is moderate.

To resolve the OOM issue in "git-unpack-objects", we can unpack a large
object to a file in a stream, and use "core.bigFileThreshold" to avoid
OOM limits when "get_data()" would otherwise be called.

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c          | 76 ++++++++++++++++++++++++-
 t/t5590-receive-unpack-objects.sh | 92 +++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100755 t/t5590-receive-unpack-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..6c757d823b 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -320,11 +320,85 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_data_from_zstream {
+	git_zstream *zstream;
+	unsigned char buf[4096];
+	int status;
+};
+
+static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
+{
+	struct input_data_from_zstream *input = data;
+	git_zstream *zstream = input->zstream;
+	void *in = fill(1);
+
+	if (!len || input->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = input->buf;
+	zstream->avail_out = sizeof(input->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	input->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(input->buf) - zstream->avail_out;
+
+	return (const char *)input->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_data_from_zstream data;
+	struct input_stream in_stream = {
+		.read = read_inflate_in_stream,
+		.data = &data,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
new file mode 100755
index 0000000000..7e63dfc0db
--- /dev/null
+++ b/t/t5590-receive-unpack-objects.sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	git repack -ad
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to push: cannot allocate' '
+	test_must_fail git push dest.git HEAD 2>err &&
+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git push dest.git HEAD &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	PACK=$(echo main | git pack-objects --progress --revs test) &&
+	unset GIT_ALLOC_LIMIT &&
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run with large threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 2m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_expect_success 'unpack-objects dry-run with small threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 1m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
@ 2021-11-18  4:59   ` Jiang Xin
  2021-11-18  6:45     ` Junio C Hamano
  0 siblings, 1 reply; 57+ messages in thread
From: Jiang Xin @ 2021-11-18  4:59 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>

It would be better to provide a cover letter describing changes in v2, such as:

* Make "write_loose_object()" a public method, so we can
   reuse it in "unpack_non_delta_entry()".
   (But I doubt we can reuse the public function
     "write_object_file_flags()" without making this change.)

* Add a new interface "input_stream" as an argument for
   "write_loose_object()", so that we can feed data to
   "write_loose_object()" from a buffer or from a zlib stream
   (see the sketch below).

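Perhaps something like this shape (just a sketch, with the
"const void *" reader type suggested further down):

    struct input_stream {
            const void *(*read)(struct input_stream *, unsigned long *len);
            void *data;
    };

    struct simple_input_stream_data {
            const void *buf;
            unsigned long len;
    };

    /* Trivial reader for callers that already hold the whole buffer: */
    static const void *feed_simple_input_stream(struct input_stream *in_stream,
                                                unsigned long *len)
    {
            struct simple_input_stream_data *data = in_stream->data;

            if (!data->len) {
                    *len = 0;
                    return NULL;
            }
            *len = data->len;
            data->len = 0;
            return data->buf;
    }
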
> Refactor write_loose_object() to support inputstream, in the same way
> that zlib reading is chunked.

At the beginning of your commit log, you should describe the problem, such as:

We used to read the full content of a blob into buffer in
"unpack_non_delta_entry()" by calling:

    void *buf = get_data(size);

This will consume lots of memory for a very big blob object.

> Using "in_stream" instead of "void *buf", we needn't to allocate enough
> memory in advance, and only part of the contents will be read when
> called "in_stream.read()".
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  5 +++++
>  2 files changed, 51 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 02b7970274..1ad2cb579c 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>         return fd;
>  }
>
> +struct input_data_from_buffer {
> +       const char *buf;
> +       unsigned long len;
> +};
> +
> +static const char *read_input_stream_from_buffer(void *data, unsigned long *len)

Use "const void *" as the return type, just like the input
argument of write_loose_object()?

> +{
> +       struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
> +
> +       if (input->len == 0) {
> +               *len = 0;
> +               return NULL;
> +       }
> +       *len = input->len;
> +       input->len = 0;
> +       return input->buf;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
> -                             int hdrlen, const void *buf, unsigned long len,
> +                             int hdrlen, struct input_stream *in_stream,
>                               time_t mtime, unsigned flags)
>  {
>         int fd, ret;
> @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         struct object_id parano_oid;
>         static struct strbuf tmp_file = STRBUF_INIT;
>         static struct strbuf filename = STRBUF_INIT;
> +       const char *buf;

Can we use the same prototype as the original:  "const void *buf" ?

> +       unsigned long len;
>
>         loose_object_path(the_repository, &filename, oid);
>
> @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> +       buf = in_stream->read(in_stream->data, &len);
>         stream.next_in = (void *)buf;
>         stream.avail_in = len;
>         do {
> @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  {
>         char hdr[MAX_HEADER_LEN];
>         int hdrlen = sizeof(hdr);
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = (void *)&(struct input_data_from_buffer) {
> +                       .buf = buf,
> +                       .len = len,
> +               },
> +       };
>
>         /* Normally if we have it in the pack then we do not bother writing
>          * it out into .git/objects/??/?{38} file.
> @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>                                   &hdrlen);
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 return 0;
> -       return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> +       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
>  }
>
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>         char *header;
>         int hdrlen, status = 0;
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = (void *)&(struct input_data_from_buffer) {
> +                       .buf = buf,
> +                       .len = len,
> +               },
> +       };
>
>         /* type string, SP, %lu of the length plus NUL must fit this */
>         hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>                 goto cleanup;
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 goto cleanup;
> -       status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>
>  cleanup:
>         free(header);
> @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>         char hdr[MAX_HEADER_LEN];
>         int hdrlen;
>         int ret;
> +       struct input_data_from_buffer data;
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = &data,
> +       };
>
>         if (has_loose_object(oid))
>                 return 0;
>         buf = read_object(the_repository, oid, &type, &len);
>         if (!buf)
>                 return error(_("cannot read object for %s"), oid_to_hex(oid));
> +       data.buf = buf;
> +       data.len = len;
>         hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -       ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
>         free(buf);
>
>         return ret;
> diff --git a/object-store.h b/object-store.h
> index 952efb6a4b..f1b67e9100 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -34,6 +34,11 @@ struct object_directory {
>         char *path;
>  };
>
> +struct input_stream {
> +       const char *(*read)(void* data, unsigned long *len);
> +       void *data;
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>         struct object_directory *, 1, fspathhash, fspatheq)
>
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
@ 2021-11-18  5:42   ` Jiang Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Jiang Xin @ 2021-11-18  5:42 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We will use "write_loose_object()" later to handle large blob object,
> which needs to work in dry_run mode.

The dry_run mode comes from "builtin/unpack-objects.c", which throws
away the buffer read from "get_data()".
So why not add "dry_run" to "get_data()" instead?

If we have a dry_run version of get_data(), such as
"get_data(size, dry_run)", we do not have to add a dry_run mode to
"write_loose_object()"; see the sketch after the grep output below.

See: git grep -A5 get_data builtin/unpack-objects.c
builtin/unpack-objects.c:       void *buf = get_data(size);
builtin/unpack-objects.c-
builtin/unpack-objects.c-       if (!dry_run && buf)
builtin/unpack-objects.c-               write_object(nr, type, buf, size);
builtin/unpack-objects.c-       else
builtin/unpack-objects.c-               free(buf);
--
builtin/unpack-objects.c:               delta_data = get_data(delta_size);
builtin/unpack-objects.c-               if (dry_run || !delta_data) {
builtin/unpack-objects.c-                       free(delta_data);
builtin/unpack-objects.c-                       return;
builtin/unpack-objects.c-               }
--
builtin/unpack-objects.c:               delta_data = get_data(delta_size);
builtin/unpack-objects.c-               if (dry_run || !delta_data) {
builtin/unpack-objects.c-                       free(delta_data);
builtin/unpack-objects.c-                       return;
builtin/unpack-objects.c-               }
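
A rough sketch of such a "get_data()" (untested; in dry_run mode it
reuses a small fixed buffer, so the allocation no longer depends on
the object size; it relies on the file-scope fill()/use()/len helpers
that already exist in builtin/unpack-objects.c):

    static void *get_data(unsigned long size, int dry_run)
    {
            git_zstream stream;
            unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
            void *buf = xmallocz(bufsize);

            memset(&stream, 0, sizeof(stream));

            stream.next_out = buf;
            stream.avail_out = bufsize;
            stream.next_in = fill(1);
            stream.avail_in = len;
            git_inflate_init(&stream);

            for (;;) {
                    int ret = git_inflate(&stream, 0);
                    use(len - stream.avail_in);
                    if (stream.total_out == size && ret == Z_STREAM_END)
                            break;
                    if (ret != Z_OK) {
                            error("inflate returned %d", ret);
                            FREE_AND_NULL(buf);
                            if (!recover)
                                    exit(1);
                            has_errors = 1;
                            break;
                    }
                    stream.next_in = fill(1);
                    stream.avail_in = len;
                    if (dry_run) {
                            /* reuse the small buffer; output is discarded */
                            stream.next_out = buf;
                            stream.avail_out = bufsize;
                    }
            }
            git_inflate_end(&stream);
            return buf;
    }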


> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 32 +++++++++++++++++++-------------
>  1 file changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 1ad2cb579c..b0838c847e 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1880,9 +1880,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
>
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>                               int hdrlen, struct input_stream *in_stream,
> +                             int dry_run,
>                               time_t mtime, unsigned flags)
>  {
> -       int fd, ret;
> +       int fd, ret = 0;
>         unsigned char compressed[4096];
>         git_zstream stream;
>         git_hash_ctx c;
> @@ -1894,14 +1895,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>         loose_object_path(the_repository, &filename, oid);
>
> -       fd = create_tmpfile(&tmp_file, filename.buf);
> -       if (fd < 0) {
> -               if (flags & HASH_SILENT)
> -                       return -1;
> -               else if (errno == EACCES)
> -                       return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> -               else
> -                       return error_errno(_("unable to create temporary file"));
> +       if (!dry_run) {
> +               fd = create_tmpfile(&tmp_file, filename.buf);
> +               if (fd < 0) {
> +                       if (flags & HASH_SILENT)
> +                               return -1;
> +                       else if (errno == EACCES)
> +                               return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +                       else
> +                               return error_errno(_("unable to create temporary file"));
> +               }
>         }
>
>         /* Set it up */
> @@ -1925,7 +1928,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 unsigned char *in0 = stream.next_in;
>                 ret = git_deflate(&stream, Z_FINISH);
>                 the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> +               if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>                         die(_("unable to write loose object file"));
>                 stream.next_out = compressed;
>                 stream.avail_out = sizeof(compressed);
> @@ -1943,6 +1946,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 die(_("confused by unstable object source data for %s"),
>                     oid_to_hex(oid));
>
> +       if (dry_run)
> +               return 0;
> +
>         close_loose_object(fd);
>
>         if (mtime) {
> @@ -1996,7 +2002,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>                                   &hdrlen);
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 return 0;
> -       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
> +       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0, flags);
>  }
>
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -2023,7 +2029,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>                 goto cleanup;
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 goto cleanup;
> -       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> +       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0, 0);
>
>  cleanup:
>         free(header);
> @@ -2052,7 +2058,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>         data.buf = buf;
>         data.len = len;
>         hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
> +       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, mtime, 0);
>         free(buf);
>
>         return ret;
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
@ 2021-11-18  5:49   ` Jiang Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Jiang Xin @ 2021-11-18  5:49 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When read input stream, oid can't get before reading all, and it will be
> filled after reading.

Under what circumstances is the oid a null oid?  Can we get the oid
from "obj_list[nr].oid"?
See unpack_non_delta_entry() of builtin/unpack-objects.c.

> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 34 ++++++++++++++++++++++++++++++++--
>  1 file changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index b0838c847e..8393659f0d 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1893,7 +1893,13 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         const char *buf;
>         unsigned long len;
>
> -       loose_object_path(the_repository, &filename, oid);
> +       if (is_null_oid(oid)) {
> +               /* When oid is not determined, save tmp file to odb path. */
> +               strbuf_reset(&filename);
> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
> +               strbuf_addch(&filename, '/');
> +       } else
> +               loose_object_path(the_repository, &filename, oid);
>
>         if (!dry_run) {
>                 fd = create_tmpfile(&tmp_file, filename.buf);
> @@ -1942,7 +1948,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>                     ret);
>         the_hash_algo->final_oid_fn(&parano_oid, &c);
> -       if (!oideq(oid, &parano_oid))
> +       if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>                 die(_("confused by unstable object source data for %s"),
>                     oid_to_hex(oid));
>
> @@ -1951,6 +1957,30 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>         close_loose_object(fd);
>
> +       if (is_null_oid(oid)) {
> +               int dirlen;
> +
> +               /* copy oid */
> +               oidcpy((struct object_id *)oid, &parano_oid);
> +               /* We get the oid now */
> +               loose_object_path(the_repository, &filename, oid);
> +
> +               dirlen = directory_size(filename.buf);
> +               if (dirlen) {
> +                       struct strbuf dir = STRBUF_INIT;
> +                       /*
> +                        * Make sure the directory exists; note that the
> +                        * contents of the buffer are undefined after mkstemp
> +                        * returns an error, so we have to rewrite the whole
> +                        * buffer from scratch.
> +                        */
> +                       strbuf_reset(&dir);
> +                       strbuf_add(&dir, filename.buf, dirlen - 1);
> +                       if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +                               return -1;
> +               }
> +       }
> +
>         if (mtime) {
>                 struct utimbuf utb;
>                 utb.actime = mtime;
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v2 4/6] object-file.c: read input stream repeatedly in write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
@ 2021-11-18  5:56   ` Jiang Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Jiang Xin @ 2021-11-18  5:56 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Read input stream repeatedly in write_loose_object() unless reach the
> end, so that we can divide the large blob write into many small blocks.

In order to prepare the stream version of "write_loose_object()", we need ...

>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 8393659f0d..e333448c54 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1891,7 +1891,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         static struct strbuf tmp_file = STRBUF_INIT;
>         static struct strbuf filename = STRBUF_INIT;
>         const char *buf;
> -       unsigned long len;
> +       int flush = 0;
>
>         if (is_null_oid(oid)) {
>                 /* When oid is not determined, save tmp file to odb path. */
> @@ -1927,12 +1927,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> -       buf = in_stream->read(in_stream->data, &len);
> -       stream.next_in = (void *)buf;
> -       stream.avail_in = len;
>         do {
>                 unsigned char *in0 = stream.next_in;
> -               ret = git_deflate(&stream, Z_FINISH);
> +               if (!stream.avail_in) {
> +                       if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {

if ((buf = in_stream->read(in_stream->data, &stream.avail_in)) != NULL) {

Or split this long line into:

    buf = in_stream->read(in_stream->data, &stream.avail_in);
    if (buf) {

> +                               stream.next_in = (void *)buf;
> +                               in0 = (unsigned char *)buf;
> +                       } else
> +                               flush = Z_FINISH;

Add {} around this single line, see:

  https://github.com/git/git/blob/master/Documentation/CodingGuidelines#L279-L289

> +               }
> +               ret = git_deflate(&stream, flush);
>                 the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
>                 if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>                         die(_("unable to write loose object file"));
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-11-18  4:59   ` Jiang Xin
@ 2021-11-18  6:45     ` Junio C Hamano
  0 siblings, 0 replies; 57+ messages in thread
From: Junio C Hamano @ 2021-11-18  6:45 UTC (permalink / raw)
  To: Jiang Xin; +Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Jiang Xin <worldhello.net@gmail.com> writes:

> On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>>
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> It would be better to provide a cover letter describing changes in v2, such as:
>
> * Make "write_loose_object()" a public method, so we can
>    reuse it in "unpack_non_delta_entry()".
>    (But I doubt we can reuse the public function
>      "write_object_file_flags()" without making this change.)
>
> * Add a new interface "input_stream" as an argument for
>    "write_loose_object()", so that we can feed data to
>    "write_loose_object()" from a buffer or from a zlib stream.
>
>> Refactor write_loose_object() to support inputstream, in the same way
>> that zlib reading is chunked.
>
> At the beginning of your commit log, you should describe the problem, such as:
>
> We used to read the full content of a blob into buffer in
> "unpack_non_delta_entry()" by calling:
>
>     void *buf = get_data(size);
>
> This will consume lots of memory for a very big blob object.

I was not sure where "in_stream" came from---"use X instead of Y",
when X is what these patches invent and introduce, does not make a
good explanation without explaining what X is, what problem X is
attempting to solve and how.

Thanks for helping to clarify the proposed log message.  

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v2 6/6] unpack-objects: unpack large object in stream
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
@ 2021-11-18  7:14   ` Jiang Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Jiang Xin @ 2021-11-18  7:14 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", will allocate full memory for
> the whole size of the unpacked object and write the buffer to loose file
> on disk. This may lead to OOM for the git-unpack-objects process when
> unpacking a very large object.
>
> In function "unpack_delta_entry()", will also allocate full memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack large
> object to file in stream, and use "core.bigFileThreshold" to avoid OOM
> limits when called "get_data()".
>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          | 76 ++++++++++++++++++++++++-
>  t/t5590-receive-unpack-objects.sh | 92 +++++++++++++++++++++++++++++++
>  2 files changed, 167 insertions(+), 1 deletion(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..6c757d823b 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,85 @@ static void added_object(unsigned nr, enum object_type type,
>         }
>  }
>
> +struct input_data_from_zstream {
> +       git_zstream *zstream;
> +       unsigned char buf[4096];
> +       int status;
> +};
> +
> +static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
> +{
> +       struct input_data_from_zstream *input = data;
> +       git_zstream *zstream = input->zstream;
> +       void *in = fill(1);
> +
> +       if (!len || input->status == Z_STREAM_END) {
> +               *readlen = 0;
> +               return NULL;
> +       }
> +
> +       zstream->next_out = input->buf;
> +       zstream->avail_out = sizeof(input->buf);
> +       zstream->next_in = in;
> +       zstream->avail_in = len;
> +
> +       input->status = git_inflate(zstream, 0);
> +       use(len - zstream->avail_in);
> +       *readlen = sizeof(input->buf) - zstream->avail_out;
> +
> +       return (const char *)input->buf;
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +       char hdr[32];
> +       int hdrlen;
> +       git_zstream zstream;
> +       struct input_data_from_zstream data;
> +       struct input_stream in_stream = {
> +               .read = read_inflate_in_stream,
> +               .data = &data,
> +       };
> +       struct object_id *oid = &obj_list[nr].oid;
> +       int ret;
> +
> +       memset(&zstream, 0, sizeof(zstream));
> +       memset(&data, 0, sizeof(data));
> +       data.zstream = &zstream;
> +       git_inflate_init(&zstream);
> +
> +       /* Generate the header */
> +       hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> +
> +       if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
> +               die(_("failed to write object in stream %d"), ret);
> +
> +       if (zstream.total_out != size || data.status != Z_STREAM_END)
> +               die(_("inflate returned %d"), data.status);
> +       git_inflate_end(&zstream);
> +
> +       if (strict && !dry_run) {
> +               struct blob *blob = lookup_blob(the_repository, oid);
> +               if (blob)
> +                       blob->object.flags |= FLAG_WRITTEN;
> +               else
> +                       die("invalid blob object from stream");
> +       }
> +       obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>                                    unsigned nr)
>  {
> -       void *buf = get_data(size);
> +       void *buf;
> +
> +       /* Write large blob in stream without allocating full buffer. */
> +       if (type == OBJ_BLOB && size > big_file_threshold) {

The default value of big_file_threshold is 512m.  Can we use
"write_stream_blob" for all objects?  Can we get a more suitable
threshold from some benchmark data?

> +               write_stream_blob(nr, size);
> +               return;
> +       }

--
Jiang Xin

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v3 0/5] unpack large objects in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (8 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29  7:01   ` Han Xin
                     ` (6 more replies)
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
                   ` (4 subsequent siblings)
  14 siblings, 7 replies; 57+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Although we do not recommend that users push large binary files to git
repositories, it is difficult to prevent them from doing so. We once found
a surge in memory usage on a server, and the source of the problem was a
user pushing a single object 15GB in size. As soon as someone initiates a
git push, the git process immediately allocates 15GB of memory, which is
an OOM risk.

Through further analysis, we found that when git unpack-objects runs,
"void *buf = get_data(size);" in unpack_non_delta_entry() directly
allocates memory equal to the size of the object. This is quite scary,
because the pre-receive hook has not run at this point, so we cannot
prevent this with hooks.

Taking inspiration from zlib's deflate process, it may be a good idea to
change unpack-objects to deflate in a stream, as the stand-alone sketch
below illustrates.
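
Here is that sketch (plain zlib rather than Git's git_zstream wrappers
and loose-object plumbing; compile with -lz). It inflates and re-deflates
through small fixed buffers, so memory usage stays constant regardless of
the object size:

    #include <stdio.h>
    #include <zlib.h>

    int main(void)
    {
            unsigned char in[4096], mid[4096], out[4096];
            z_stream inf = { 0 }, def = { 0 };
            int ret = Z_OK;

            if (inflateInit(&inf) != Z_OK ||
                deflateInit(&def, Z_DEFAULT_COMPRESSION) != Z_OK)
                    return 1;
            while (ret != Z_STREAM_END) {
                    inf.next_in = in;
                    inf.avail_in = fread(in, 1, sizeof(in), stdin);
                    if (!inf.avail_in)
                            return 1; /* truncated input */
                    while (inf.avail_in && ret != Z_STREAM_END) {
                            inf.next_out = mid;
                            inf.avail_out = sizeof(mid);
                            ret = inflate(&inf, Z_NO_FLUSH);
                            if (ret != Z_OK && ret != Z_STREAM_END)
                                    return 1;
                            /* re-compress whatever was inflated so far */
                            def.next_in = mid;
                            def.avail_in = sizeof(mid) - inf.avail_out;
                            do {
                                    def.next_out = out;
                                    def.avail_out = sizeof(out);
                                    deflate(&def, ret == Z_STREAM_END ?
                                            Z_FINISH : Z_NO_FLUSH);
                                    fwrite(out, 1,
                                           sizeof(out) - def.avail_out,
                                           stdout);
                            } while (!def.avail_out);
                    }
            }
            inflateEnd(&inf);
            deflateEnd(&def);
            return 0;
    }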

Changes since v2:
* Rewrite commit messages and make changes suggested by Jiang Xin.
* Remove the commit "object-file.c: add dry_run mode for write_loose_object()" and
  use a new commit "unpack-objects.c: add dry_run mode for get_data()" instead.

Han Xin (5):
  object-file: refactor write_loose_object() to read buffer from stream
  object-file.c: handle undetermined oid in write_loose_object()
  object-file.c: read stream in a loop in write_loose_object()
  unpack-objects.c: add dry_run mode for get_data()
  unpack-objects: unpack_non_delta_entry() read data in a stream

 builtin/unpack-objects.c            | 92 +++++++++++++++++++++++++--
 object-file.c                       | 98 +++++++++++++++++++++++++----
 object-store.h                      |  9 +++
 t/t5590-unpack-non-delta-objects.sh | 76 ++++++++++++++++++++++
 4 files changed, 257 insertions(+), 18 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v2:
1:  01672f50a0 ! 1:  8640b04f6d object-file: refactor write_loose_object() to support inputstream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file: refactor write_loose_object() to support inputstream
    +    object-file: refactor write_loose_object() to read buffer from stream
     
    -    Refactor write_loose_object() to support inputstream, in the same way
    -    that zlib reading is chunked.
    +    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    +    entire contents of a blob object, no matter how big it is. This
    +    implementation may consume all the memory and cause OOM.
     
    -    Using "in_stream" instead of "void *buf", we needn't to allocate enough
    -    memory in advance, and only part of the contents will be read when
    -    called "in_stream.read()".
    +    This can be improved by feeding data to "write_loose_object()" in a
    +    stream. The input stream is implemented as an interface. In the first
    +    step, we make a simple implementation, feeding the entire buffer in the
    +    "stream" to "write_loose_object()" as a refactor.
     
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
      	return fd;
      }
      
    -+struct input_data_from_buffer {
    -+	const char *buf;
    ++struct simple_input_stream_data {
    ++	const void *buf;
     +	unsigned long len;
     +};
     +
    -+static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
    ++static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
     +{
    -+	struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
    ++	struct simple_input_stream_data *data = in_stream->data;
     +
    -+	if (input->len == 0) {
    ++	if (data->len == 0) {
     +		*len = 0;
     +		return NULL;
     +	}
    -+	*len = input->len;
    -+	input->len = 0;
    -+	return input->buf;
    ++	*len = data->len;
    ++	data->len = 0;
    ++	return data->buf;
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	struct object_id parano_oid;
      	static struct strbuf tmp_file = STRBUF_INIT;
      	static struct strbuf filename = STRBUF_INIT;
    -+	const char *buf;
    ++	const void *buf;
     +	unsigned long len;
      
      	loose_object_path(the_repository, &filename, oid);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	the_hash_algo->update_fn(&c, hdr, hdrlen);
      
      	/* Then the data itself.. */
    -+	buf = in_stream->read(in_stream->data, &len);
    ++	buf = in_stream->read(in_stream, &len);
      	stream.next_in = (void *)buf;
      	stream.avail_in = len;
      	do {
    @@ object-file.c: int write_object_file_flags(const void *buf, unsigned long len,
      	char hdr[MAX_HEADER_LEN];
      	int hdrlen = sizeof(hdr);
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    -+		.data = (void *)&(struct input_data_from_buffer) {
    ++		.read = feed_simple_input_stream,
    ++		.data = (void *)&(struct simple_input_stream_data) {
     +			.buf = buf,
     +			.len = len,
     +		},
    @@ object-file.c: int hash_object_file_literally(const void *buf, unsigned long len
      	char *header;
      	int hdrlen, status = 0;
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    -+		.data = (void *)&(struct input_data_from_buffer) {
    ++		.read = feed_simple_input_stream,
    ++		.data = (void *)&(struct simple_input_stream_data) {
     +			.buf = buf,
     +			.len = len,
     +		},
    @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime)
      	char hdr[MAX_HEADER_LEN];
      	int hdrlen;
      	int ret;
    -+	struct input_data_from_buffer data;
    ++	struct simple_input_stream_data data;
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    ++		.read = feed_simple_input_stream,
     +		.data = &data,
     +	};
      
    @@ object-store.h: struct object_directory {
      };
      
     +struct input_stream {
    -+	const char *(*read)(void* data, unsigned long *len);
    ++	const void *(*read)(struct input_stream *, unsigned long *len);
     +	void *data;
     +};
     +
2:  a309b7e391 < -:  ---------- object-file.c: add dry_run mode for write_loose_object()
3:  b0a5b53710 ! 2:  d4a2caf2bd object-file.c: handle nil oid in write_loose_object()
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: handle nil oid in write_loose_object()
    +    object-file.c: handle undetermined oid in write_loose_object()
     
    -    When read input stream, oid can't get before reading all, and it will be
    -    filled after reading.
    +    When streaming a large blob object to "write_loose_object()", we have no
    +    chance to run "write_object_file_prepare()" to calculate the oid in
    +    advance. So we need to handle undetermined oid in function
    +    "write_loose_object()".
    +
    +    In the original implementation, we know the oid and we can write the
    +    temporary file in the same directory as the final object, but for an
    +    object with an undetermined oid, we don't know the exact directory for
    +    the object, so we have to save the temporary file in ".git/objects/"
    +    directory instead.
     
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    - 	const char *buf;
    + 	const void *buf;
      	unsigned long len;
      
     -	loose_object_path(the_repository, &filename, oid);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     +		strbuf_reset(&filename);
     +		strbuf_addstr(&filename, the_repository->objects->odb->path);
     +		strbuf_addch(&filename, '/');
    -+	} else
    ++	} else {
     +		loose_object_path(the_repository, &filename, oid);
    ++	}
      
    - 	if (!dry_run) {
    - 		fd = create_tmpfile(&tmp_file, filename.buf);
    + 	fd = create_tmpfile(&tmp_file, filename.buf);
    + 	if (fd < 0) {
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
      		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
      		    ret);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      		die(_("confused by unstable object source data for %s"),
      		    oid_to_hex(oid));
      
    -@@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    - 
      	close_loose_object(fd);
      
     +	if (is_null_oid(oid)) {
     +		int dirlen;
     +
    -+		/* copy oid */
     +		oidcpy((struct object_id *)oid, &parano_oid);
    -+		/* We get the oid now */
     +		loose_object_path(the_repository, &filename, oid);
     +
    ++		/* We finally know the object path, and create the missing dir. */
     +		dirlen = directory_size(filename.buf);
     +		if (dirlen) {
     +			struct strbuf dir = STRBUF_INIT;
    -+			/*
    -+			 * Make sure the directory exists; note that the
    -+			 * contents of the buffer are undefined after mkstemp
    -+			 * returns an error, so we have to rewrite the whole
    -+			 * buffer from scratch.
    -+			 */
    -+			strbuf_reset(&dir);
     +			strbuf_add(&dir, filename.buf, dirlen - 1);
     +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
     +				return -1;
    ++			if (adjust_shared_perm(dir.buf))
    ++				return -1;
    ++			strbuf_release(&dir);
     +		}
     +	}
     +
4:  09d438b692 ! 3:  2575900449 object-file.c: read input stream repeatedly in write_loose_object()
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: read input stream repeatedly in write_loose_object()
    +    object-file.c: read stream in a loop in write_loose_object()
     
    -    Read input stream repeatedly in write_loose_object() unless reach the
    -    end, so that we can divide the large blob write into many small blocks.
    +    In order to prepare the stream version of "write_loose_object()", read
    +    the input stream in a loop in "write_loose_object()", so that we can
    +    feed the contents of large blob object to "write_loose_object()" using
    +    a small fixed buffer.
     
    +    Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
      	static struct strbuf tmp_file = STRBUF_INIT;
      	static struct strbuf filename = STRBUF_INIT;
    - 	const char *buf;
    + 	const void *buf;
     -	unsigned long len;
     +	int flush = 0;
      
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	the_hash_algo->update_fn(&c, hdr, hdrlen);
      
      	/* Then the data itself.. */
    --	buf = in_stream->read(in_stream->data, &len);
    +-	buf = in_stream->read(in_stream, &len);
     -	stream.next_in = (void *)buf;
     -	stream.avail_in = len;
      	do {
      		unsigned char *in0 = stream.next_in;
     -		ret = git_deflate(&stream, Z_FINISH);
     +		if (!stream.avail_in) {
    -+			if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {
    ++			buf = in_stream->read(in_stream, &stream.avail_in);
    ++			if (buf) {
     +				stream.next_in = (void *)buf;
     +				in0 = (unsigned char *)buf;
    -+			} else
    ++			} else {
     +				flush = Z_FINISH;
    ++			}
     +		}
     +		ret = git_deflate(&stream, flush);
      		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
    - 		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
    + 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
      			die(_("unable to write loose object file"));
5:  9fb188d437 < -:  ---------- object-store.h: add write_loose_object()
-:  ---------- > 4:  ca93ecc780 unpack-objects.c: add dry_run mode for get_data()
6:  80468a6fbc ! 5:  39a072ee2a unpack-objects: unpack large object in stream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    unpack-objects: unpack large object in stream
    +    unpack-objects: unpack_non_delta_entry() read data in a stream
     
    -    When calling "unpack_non_delta_entry()", will allocate full memory for
    -    the whole size of the unpacked object and write the buffer to loose file
    -    on disk. This may lead to OOM for the git-unpack-objects process when
    -    unpacking a very large object.
    +    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    +    entire contents of a blob object, no matter how big it is. This
    +    implementation may consume all the memory and cause OOM.
     
    -    In function "unpack_delta_entry()", will also allocate full memory to
    -    buffer the whole delta, but since there will be no delta for an object
    -    larger than "core.bigFileThreshold", this issue is moderate.
    +    By implementing a zstream version of input_stream interface, we can use
    +    a small fixed buffer for "unpack_non_delta_entry()".
     
    -    To resolve the OOM issue in "git-unpack-objects", we can unpack large
    -    object to file in stream, and use "core.bigFileThreshold" to avoid OOM
    -    limits when called "get_data()".
+    However, unpacking non-delta objects from a stream instead of from an
+    entire buffer incurs about a 10% performance penalty. Therefore, only
+    unpack objects larger than the "big_file_threshold" in zstream. See
+    the following benchmarks:
     
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -C dest.git unpack-objects <binary_320M.pack'
    +        Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
    +          Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
    +          Range (min … max):    9.786 s … 10.603 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    +        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
    +          Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
    +          Range (min … max):    9.884 s … 12.192 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -C dest.git unpack-objects <binary_96M.pack'
    +        Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
    +          Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
    +          Range (min … max):    2.639 s …  2.743 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    +        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
    +          Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
    +          Range (min … max):    2.679 s …  3.125 s    10 runs
    +
    +    Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## builtin/unpack-objects.c ##
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      	}
      }
      
    -+struct input_data_from_zstream {
    ++struct input_zstream_data {
     +	git_zstream *zstream;
     +	unsigned char buf[4096];
     +	int status;
     +};
     +
    -+static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
    ++static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
     +{
    -+	struct input_data_from_zstream *input = data;
    -+	git_zstream *zstream = input->zstream;
    ++	struct input_zstream_data *data = in_stream->data;
    ++	git_zstream *zstream = data->zstream;
     +	void *in = fill(1);
     +
    -+	if (!len || input->status == Z_STREAM_END) {
    ++	if (!len || data->status == Z_STREAM_END) {
     +		*readlen = 0;
     +		return NULL;
     +	}
     +
    -+	zstream->next_out = input->buf;
    -+	zstream->avail_out = sizeof(input->buf);
    ++	zstream->next_out = data->buf;
    ++	zstream->avail_out = sizeof(data->buf);
     +	zstream->next_in = in;
     +	zstream->avail_in = len;
     +
    -+	input->status = git_inflate(zstream, 0);
    ++	data->status = git_inflate(zstream, 0);
     +	use(len - zstream->avail_in);
    -+	*readlen = sizeof(input->buf) - zstream->avail_out;
    ++	*readlen = sizeof(data->buf) - zstream->avail_out;
     +
    -+	return (const char *)input->buf;
    ++	return data->buf;
     +}
     +
     +static void write_stream_blob(unsigned nr, unsigned long size)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	char hdr[32];
     +	int hdrlen;
     +	git_zstream zstream;
    -+	struct input_data_from_zstream data;
    ++	struct input_zstream_data data;
     +	struct input_stream in_stream = {
    -+		.read = read_inflate_in_stream,
    ++		.read = feed_input_zstream,
     +		.data = &data,
     +	};
     +	struct object_id *oid = &obj_list[nr].oid;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	/* Generate the header */
     +	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
     +
    -+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
    ++	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
     +		die(_("failed to write object in stream %d"), ret);
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      static void unpack_non_delta_entry(enum object_type type, unsigned long size,
      				   unsigned nr)
      {
    --	void *buf = get_data(size);
    +-	void *buf = get_data(size, dry_run);
     +	void *buf;
     +
     +	/* Write large blob in stream without allocating full buffer. */
    -+	if (type == OBJ_BLOB && size > big_file_threshold) {
    ++	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
     +		write_stream_blob(nr, size);
     +		return;
     +	}
      
    -+	buf = get_data(size);
    ++	buf = get_data(size, dry_run);
      	if (!dry_run && buf)
      		write_object(nr, type, buf, size);
      	else
     
    - ## t/t5590-receive-unpack-objects.sh (new) ##
    + ## object-file.c ##
    +@@ object-file.c: static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
    + 	return data->buf;
    + }
    + 
    +-static int write_loose_object(const struct object_id *oid, char *hdr,
    +-			      int hdrlen, struct input_stream *in_stream,
    +-			      time_t mtime, unsigned flags)
    ++int write_loose_object(const struct object_id *oid, char *hdr,
    ++		       int hdrlen, struct input_stream *in_stream,
    ++		       time_t mtime, unsigned flags)
    + {
    + 	int fd, ret;
    + 	unsigned char compressed[4096];
    +
    + ## object-store.h ##
    +@@ object-store.h: int hash_object_file(const struct git_hash_algo *algo, const void *buf,
    + 		     unsigned long len, const char *type,
    + 		     struct object_id *oid);
    + 
    ++int write_loose_object(const struct object_id *oid, char *hdr,
    ++		       int hdrlen, struct input_stream *in_stream,
    ++		       time_t mtime, unsigned flags);
    ++
    + int write_object_file_flags(const void *buf, unsigned long len,
    + 			    const char *type, struct object_id *oid,
    + 			    unsigned flags);
    +
    + ## t/t5590-unpack-non-delta-objects.sh (new) ##
     @@
     +#!/bin/sh
     +#
    @@ t/t5590-receive-unpack-objects.sh (new)
     +		cd .git &&
     +		find objects/?? -type f | sort
     +	) >expect &&
    -+	git repack -ad
    ++	PACK=$(echo main | git pack-objects --progress --revs test)
     +'
     +
     +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
    @@ t/t5590-receive-unpack-objects.sh (new)
     +	git -C dest.git config receive.unpacklimit 100
     +'
     +
    -+test_expect_success 'fail to push: cannot allocate' '
    -+	test_must_fail git push dest.git HEAD 2>err &&
    -+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
    ++test_expect_success 'fail to unpack-objects: cannot allocate' '
    ++	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    ++	test_i18ngrep "fatal: attempting to allocate" err &&
     +	(
     +		cd dest.git &&
     +		find objects/?? -type f | sort
    @@ t/t5590-receive-unpack-objects.sh (new)
     +'
     +
     +test_expect_success 'unpack big object in stream' '
    -+	git push dest.git HEAD &&
    ++	git -C dest.git unpack-objects <test-$PACK.pack &&
     +	git -C dest.git fsck &&
     +	(
     +		cd dest.git &&
    @@ t/t5590-receive-unpack-objects.sh (new)
     +'
     +
     +test_expect_success 'setup for unpack-objects dry-run test' '
    -+	PACK=$(echo main | git pack-objects --progress --revs test) &&
    -+	unset GIT_ALLOC_LIMIT &&
     +	git init --bare unpack-test.git
     +'
     +
    -+test_expect_success 'unpack-objects dry-run with large threshold' '
    -+	(
    -+		cd unpack-test.git &&
    -+		git config core.bigFileThreshold 2m &&
    -+		git unpack-objects -n <../test-$PACK.pack
    -+	) &&
    -+	(
    -+		cd unpack-test.git &&
    -+		find objects/ -type f
    -+	) >actual &&
    -+	test_must_be_empty actual
    -+'
    -+
    -+test_expect_success 'unpack-objects dry-run with small threshold' '
    ++test_expect_success 'unpack-objects dry-run' '
     +	(
     +		cd unpack-test.git &&
    -+		git config core.bigFileThreshold 1m &&
     +		git unpack-objects -n <../test-$PACK.pack
     +	) &&
     +	(
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (9 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-23 23:24   ` Junio C Hamano
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
                   ` (3 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface. In the first
step, we make a simple implementation, feeding the entire buffer in the
"stream" to "write_loose_object()" as a refactor.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  5 +++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index c3d866a287..227f53a0de 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct simple_input_stream_data {
+	const void *buf;
+	unsigned long len;
+};
+
+static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
+{
+	struct simple_input_stream_data *data = in_stream->data;
+
+	if (data->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = data->len;
+	data->len = 0;
+	return data->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const void *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct simple_input_stream_data data;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..ccc1fc9c1a 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (10 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29 15:10   ` Derrick Stolee
  2021-11-22  3:32 ` [PATCH v3 3/5] object-file.c: read stream in a loop " Han Xin
                   ` (2 subsequent siblings)
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance, so we need to handle an undetermined oid in
"write_loose_object()".

In the original implementation, we knew the oid and could write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in the ".git/objects/"
directory instead.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 227f53a0de..78fd2a5d39 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const void *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else {
+		loose_object_path(the_repository, &filename, oid);
+	}
 
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
@@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		oidcpy((struct object_id *)oid, &parano_oid);
+		loose_object_path(the_repository, &filename, oid);
+
+		/* We finally know the object path, and create the missing dir. */
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+			if (adjust_shared_perm(dir.buf))
+				return -1;
+			strbuf_release(&dir);
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v3 3/5] object-file.c: read stream in a loop in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (11 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-22  3:32 ` [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  14 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In order to prepare the stream version of "write_loose_object()", read
the input stream in a loop, so that we can feed the contents of a large
blob object to "write_loose_object()" using a small fixed buffer.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/object-file.c b/object-file.c
index 78fd2a5d39..93bcfaca50 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1890,7 +1890,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const void *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1925,12 +1925,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			buf = in_stream->read(in_stream, &stream.avail_in);
+			if (buf) {
+				stream.next_in = (void *)buf;
+				in0 = (unsigned char *)buf;
+			} else {
+				flush = Z_FINISH;
+			}
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (12 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 3/5] object-file.c: read stream in a loop " Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  14 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used only to verify the inflation of
data; the returned buffer is not used at all and is freed immediately.
Even in dry_run mode, it is dangerous to allocate a full-size buffer for
a large blob object. Therefore, allocate only a small buffer when
calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8d68acd662 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,16 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run ? 4096 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +125,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +329,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +363,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +402,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (13 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29 17:37   ` Derrick Stolee
  14 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of the input_stream interface, we can
use a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an
entire buffer incurs about a 10% performance penalty. Therefore, only
unpack objects larger than "big_file_threshold" via the zstream. See
the following benchmarks:

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -C dest.git unpack-objects <binary_320M.pack'
    Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
      Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
      Range (min … max):    9.786 s … 10.603 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
      Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
      Range (min … max):    9.884 s … 12.192 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -C dest.git unpack-objects <binary_96M.pack'
    Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
      Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
      Range (min … max):    2.639 s …  2.743 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
      Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
      Range (min … max):    2.679 s …  3.125 s    10 runs

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c            | 76 ++++++++++++++++++++++++++++-
 object-file.c                       |  6 +--
 object-store.h                      |  4 ++
 t/t5590-unpack-non-delta-objects.sh | 76 +++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+), 4 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 8d68acd662..bfc254a236 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -326,11 +326,85 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[4096];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index 93bcfaca50..bd7631f7ef 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,9 +1878,9 @@ static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
 	return data->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index ccc1fc9c1a..cbd95c47e2 100644
--- a/object-store.h
+++ b/object-store.h
@@ -228,6 +228,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..01d950d119
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --progress --revs test)
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	test_i18ngrep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	(
+		cd unpack-test.git &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-11-23 23:24   ` Junio C Hamano
  2021-11-24  9:00     ` Han Xin
  0 siblings, 1 reply; 57+ messages in thread
From: Junio C Hamano @ 2021-11-23 23:24 UTC (permalink / raw)
  To: Han Xin; +Cc: Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Han Xin <chiyutianyi@gmail.com> writes:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "write_loose_object()" in a
> stream. The input stream is implemented as an interface. In the first
> step, we make a simple implementation, feeding the entire buffer in the
> "stream" to "write_loose_object()" as a refactor.

Possibly a stupid question (not a review).

How does this compare with "struct git_istream" implemented for a
few existing codepaths?  It seems that the existing users are
pack-objects, index-pack and archive and all of them use the
interface to obtain data given an object name without having to grab
everything in core at once.

If we are adding a new streaming interface to go in the opposite
direction, i.e. from the working tree data to object store, I would
understand it as a complementary interface (but then I suspect there
is a half of it already in bulk-checkin API), but I am not sure how
this new thing fits in the larger picture.



> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  5 +++++
>  2 files changed, 51 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index c3d866a287..227f53a0de 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> +struct simple_input_stream_data {
> +	const void *buf;
> +	unsigned long len;
> +};
> +
> +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> +{
> +	struct simple_input_stream_data *data = in_stream->data;
> +
> +	if (data->len == 0) {
> +		*len = 0;
> +		return NULL;
> +	}
> +	*len = data->len;
> +	data->len = 0;
> +	return data->buf;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
> -			      int hdrlen, const void *buf, unsigned long len,
> +			      int hdrlen, struct input_stream *in_stream,
>  			      time_t mtime, unsigned flags)
>  {
>  	int fd, ret;
> @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	struct object_id parano_oid;
>  	static struct strbuf tmp_file = STRBUF_INIT;
>  	static struct strbuf filename = STRBUF_INIT;
> +	const void *buf;
> +	unsigned long len;
>  
>  	loose_object_path(the_repository, &filename, oid);
>  
> @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	the_hash_algo->update_fn(&c, hdr, hdrlen);
>  
>  	/* Then the data itself.. */
> +	buf = in_stream->read(in_stream, &len);
>  	stream.next_in = (void *)buf;
>  	stream.avail_in = len;
>  	do {
> @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  {
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen = sizeof(hdr);
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +	};
>  
>  	/* Normally if we have it in the pack then we do not bother writing
>  	 * it out into .git/objects/??/?{38} file.
> @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  				  &hdrlen);
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		return 0;
> -	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> +	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
>  }
>  
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>  	char *header;
>  	int hdrlen, status = 0;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +	};
>  
>  	/* type string, SP, %lu of the length plus NUL must fit this */
>  	hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  		goto cleanup;
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		goto cleanup;
> -	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>  
>  cleanup:
>  	free(header);
> @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen;
>  	int ret;
> +	struct simple_input_stream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = &data,
> +	};
>  
>  	if (has_loose_object(oid))
>  		return 0;
>  	buf = read_object(the_repository, oid, &type, &len);
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
> +	data.buf = buf;
> +	data.len = len;
>  	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
>  	free(buf);
>  
>  	return ret;
> diff --git a/object-store.h b/object-store.h
> index 952efb6a4b..ccc1fc9c1a 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -34,6 +34,11 @@ struct object_directory {
>  	char *path;
>  };
>  
> +struct input_stream {
> +	const void *(*read)(struct input_stream *, unsigned long *len);
> +	void *data;
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>  	struct object_directory *, 1, fspathhash, fspatheq)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-23 23:24   ` Junio C Hamano
@ 2021-11-24  9:00     ` Han Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-11-24  9:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Junio C Hamano <gitster@pobox.com> writes:

>
> Han Xin <chiyutianyi@gmail.com> writes:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > This can be improved by feeding data to "write_loose_object()" in a
> > stream. The input stream is implemented as an interface. In the first
> > step, we make a simple implementation, feeding the entire buffer in the
> > "stream" to "write_loose_object()" as a refactor.
>
> Possibly a stupid question (not a review).
>
> How does this compare with "struct git_istream" implemented for a
> few existing codepaths?  It seems that the existing users are
> pack-objects, index-pack and archive and all of them use the
> interface to obtain data given an object name without having to grab
> everything in core at once.
>
> If we are adding a new streaming interface to go in the opposite
> direction, i.e. from the working tree data to object store, I would
> understand it as a complementary interface (but then I suspect there
> is a half of it already in bulk-checkin API), but I am not sure how
> this new thing fits in the larger picture.
>

Thank you for your reply.

Before starting to make this patch, I did consider whether I should
reuse "struct git_istream" to solve the problem, but I found that in the
process of git unpack-objects, the data comes from stdin, and we
cannot get an oid in advance until the whole object data is read.
Also, we can't do "lseek()" on stdin to change the data reading position.

I compared the implementation of "bulk-checkin", and they do have
some similarities.
I think the difference in this reverse direction is that we do not
always know exactly where the boundary of the target data is. For
example, in the process of "unpack-objects", the "buffer" has already
been partially consumed after calling "fill()", and the bytes remaining
in the "buffer" cannot be discarded because they are the beginning of
the next object.
Perhaps "struct input_stream" can make some improvements to
"index_bulk_checkin()", so that it can read from an inner buffer in
addition to reading from "fd" if necessary, as sketched below.
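
To make that concrete, here is a minimal sketch of such a hybrid
stream. The struct and callback names below are made up for
illustration; only the "struct input_stream" interface from this series
is assumed. It drains the leftover in-memory bytes first, and only then
falls back to read(2) on the descriptor:

struct hybrid_stream_data {
	const unsigned char *buf;	/* leftover bytes already in core */
	unsigned long buflen;
	int fd;				/* source once buf is drained */
	unsigned char chunk[4096];
};

static const void *feed_hybrid_stream(struct input_stream *in_stream,
				      unsigned long *len)
{
	struct hybrid_stream_data *data = in_stream->data;
	ssize_t n;

	if (data->buflen) {
		/* Hand out what is already buffered in core first. */
		const void *p = data->buf;
		*len = data->buflen;
		data->buflen = 0;
		return p;
	}
	/* Then stream the rest from the descriptor in small chunks. */
	n = read(data->fd, data->chunk, sizeof(data->chunk));
	if (n <= 0) {
		*len = 0;
		return NULL;
	}
	*len = n;
	return data->chunk;
}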

>
>
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
> >  object-store.h |  5 +++++
> >  2 files changed, 51 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index c3d866a287..227f53a0de 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +struct simple_input_stream_data {
> > +     const void *buf;
> > +     unsigned long len;
> > +};
> > +
> > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> > +{
> > +     struct simple_input_stream_data *data = in_stream->data;
> > +
> > +     if (data->len == 0) {
> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +     *len = data->len;
> > +     data->len = 0;
> > +     return data->buf;
> > +}
> > +
> >  static int write_loose_object(const struct object_id *oid, char *hdr,
> > -                           int hdrlen, const void *buf, unsigned long len,
> > +                           int hdrlen, struct input_stream *in_stream,
> >                             time_t mtime, unsigned flags)
> >  {
> >       int fd, ret;
> > @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       struct object_id parano_oid;
> >       static struct strbuf tmp_file = STRBUF_INIT;
> >       static struct strbuf filename = STRBUF_INIT;
> > +     const void *buf;
> > +     unsigned long len;
> >
> >       loose_object_path(the_repository, &filename, oid);
> >
> > @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       the_hash_algo->update_fn(&c, hdr, hdrlen);
> >
> >       /* Then the data itself.. */
> > +     buf = in_stream->read(in_stream, &len);
> >       stream.next_in = (void *)buf;
> >       stream.avail_in = len;
> >       do {
> > @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
> >  {
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen = sizeof(hdr);
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +     };
> >
> >       /* Normally if we have it in the pack then we do not bother writing
> >        * it out into .git/objects/??/?{38} file.
> > @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
> >                                 &hdrlen);
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               return 0;
> > -     return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> > +     return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
> >  }
> >
> >  int hash_object_file_literally(const void *buf, unsigned long len,
> > @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >  {
> >       char *header;
> >       int hdrlen, status = 0;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +     };
> >
> >       /* type string, SP, %lu of the length plus NUL must fit this */
> >       hdrlen = strlen(type) + MAX_HEADER_LEN;
> > @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >               goto cleanup;
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               goto cleanup;
> > -     status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> > +     status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> >
> >  cleanup:
> >       free(header);
> > @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >       int ret;
> > +     struct simple_input_stream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = &data,
> > +     };
> >
> >       if (has_loose_object(oid))
> >               return 0;
> >       buf = read_object(the_repository, oid, &type, &len);
> >       if (!buf)
> >               return error(_("cannot read object for %s"), oid_to_hex(oid));
> > +     data.buf = buf;
> > +     data.len = len;
> >       hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> > -     ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> > +     ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
> >       free(buf);
> >
> >       return ret;
> > diff --git a/object-store.h b/object-store.h
> > index 952efb6a4b..ccc1fc9c1a 100644
> > --- a/object-store.h
> > +++ b/object-store.h
> > @@ -34,6 +34,11 @@ struct object_directory {
> >       char *path;
> >  };
> >
> > +struct input_stream {
> > +     const void *(*read)(struct input_stream *, unsigned long *len);
> > +     void *data;
> > +};
> > +
> >  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
> >       struct object_directory *, 1, fspathhash, fspatheq)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
@ 2021-11-29  7:01   ` Han Xin
  2021-11-29 19:12     ` Jeff King
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-29  7:01 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

Han Xin <chiyutianyi@gmail.com> writes:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Although we do not recommend users push large binary files to the git repositories,
> it's difficult to prevent them from doing so. Once, we found a problem with a surge
> in memory usage on the server. The source of the problem is that a user submitted
> a single object with a size of 15GB. Once someone initiates a git push, the git
> process will immediately allocate 15G of memory, resulting in an OOM risk.
>
> Through further analysis, we found that when we execute git unpack-objects, in
> unpack_non_delta_entry(), "void *buf = get_data(size);" will directly allocate
> memory equal to the size of the object. This is quite a scary thing, because the
> pre-receive hook has not been executed at this time, and we cannot avoid this by hooks.
>
> I got inspiration from the deflate process of zlib, maybe it would be a good idea
> to change unpack-objects to stream deflate.
>

Hi, Jeff.

I hope you can share with me how GitHub solves this problem.

As you said in your reply at:
https://lore.kernel.org/git/YVaw6agcPNclhws8@coredump.intra.peff.net/
"we don't have a match in unpack-objects, but we always run index-pack
on incoming packs".

In the original implementation of "index-pack", for objects larger than
big_file_threshold, a "fixed_buf" of size 8192 is used to complete the
calculation of the "oid".

I tried the implementation in jk/no-more-unpack-objects, as you noted:

  /* XXX This will expand too-large objects! */
  if (!data)
          data = new_data = get_data_from_pack(obj_entry);

When "--unpack" is given, there is a risk here. When I create an object
larger than 1GB and run index-pack, the result is as follows:

  $ GIT_ALLOC_LIMIT=1024m git index-pack --unpack --stdin <large.pack
  fatal: attempting to allocate 1228800001 over limit 1073741824

Looking forward to your reply.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-11-29 15:10   ` Derrick Stolee
  2021-11-29 20:44     ` Junio C Hamano
  0 siblings, 1 reply; 57+ messages in thread
From: Derrick Stolee @ 2021-11-29 15:10 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley
  Cc: Han Xin

On 11/21/2021 10:32 PM, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance, so we need to handle an undetermined oid in
> "write_loose_object()".
> 
> In the original implementation, we knew the oid and could write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in the ".git/objects/"
> directory instead.

My first reaction is to not write into .git/objects/ directly, but
instead make a .git/objects/tmp/ directory and write within that
directory. The idea is to prevent leaving stale files in the
.git/objects/ directory if the process terminates strangely (say,
a power outage or segfault).

If this was an interesting idea to pursue, it does leave a question:
should we clean up the tmp/ directory when it is empty? That would
require adding a check in finalize_object_file() that is probably
best left unchecked (the lstat() would add a cost per loose object
write that is probably too costly). I would rather leave an empty
tmp/ directory than add that cost per loose object write.

I suppose another way to do it would be to register the check as
an event at the end of the process, so we only check once, and
that only happens if we created a loose object with this streaming
method.

With all of these complications in mind, I think cleaning up the
stale tmp/ directory could (at the very least) be delayed to another
commit or patch series. Hopefully adding the directory is not too
much complication to add here.
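
For what it's worth, that end-of-process check could be as small as the
following sketch (the names here are hypothetical, not from this
patch). rmdir(2) only succeeds on an empty directory, which gives
exactly the "clean up only when empty" semantics without an lstat()
per loose object write:

static char tmp_obj_dir[PATH_MAX];

static void remove_tmp_objdir_if_empty(void)
{
	if (*tmp_obj_dir)
		rmdir(tmp_obj_dir);	/* fails harmlessly when not empty */
}

/* once, after the first streamed object creates .git/objects/tmp/: */
xsnprintf(tmp_obj_dir, sizeof(tmp_obj_dir), "%s/tmp",
	  the_repository->objects->odb->path);
atexit(remove_tmp_objdir_if_empty);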

> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);
> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');

Here, instead of the strbuf_addch(), you could do

	strbuf_add(&filename, "/tmp/", 5);
	if (safe_create_leading_directories(filename.buf)) {
		error(_("failed to create '%s'"), filename.buf);
		strbuf_release(&filename);
		return -1;
	}

> +	} else {
> +		loose_object_path(the_repository, &filename, oid);
> +	}
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
> @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>  
>  	close_loose_object(fd);
>  
> +	if (is_null_oid(oid)) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);
> +		loose_object_path(the_repository, &filename, oid);
> +
> +		/* We finally know the object path, and create the missing dir. */
> +		dirlen = directory_size(filename.buf);
> +		if (dirlen) {
> +			struct strbuf dir = STRBUF_INIT;
> +			strbuf_add(&dir, filename.buf, dirlen - 1);
> +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +				return -1;
> +			if (adjust_shared_perm(dir.buf))
> +				return -1;
> +			strbuf_release(&dir);
> +		}
> +	}
> +

Upon first reading I was asking "where is the file rename?" but
it is part of finalize_object_file() which is called further down.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-11-29 17:37   ` Derrick Stolee
  2021-11-30 13:49     ` Han Xin
  0 siblings, 1 reply; 57+ messages in thread
From: Derrick Stolee @ 2021-11-29 17:37 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley
  Cc: Han Xin

On 11/21/2021 10:32 PM, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
> 
> By implementing a zstream version of the input_stream interface, we can
> use a small fixed buffer for "unpack_non_delta_entry()".
> 
> However, unpacking non-delta objects from a stream instead of from an
> entire buffer incurs about a 10% performance penalty. Therefore, only
> unpack objects larger than "big_file_threshold" via the zstream. See
> the following benchmarks:
> 
>     $ hyperfine \
>     --prepare 'rm -rf dest.git && git init --bare dest.git' \
>     'git -C dest.git unpack-objects <binary_320M.pack'
>     Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
>       Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
>       Range (min … max):    9.786 s … 10.603 s    10 runs
> 
>     $ hyperfine \
>     --prepare 'rm -rf dest.git && git init --bare dest.git' \
>     'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
>     Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
>       Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
>       Range (min … max):    9.884 s … 12.192 s    10 runs

It seems that you want us to compare this pair of results, and
hyperfine can assist with that by including multiple benchmarks
(with labels, using '-n') as follows:

$ hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
        -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
        -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'

Benchmark 1: old
  Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
  Range (min … max):   20.741 s … 20.909 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
  Range (min … max):   26.419 s … 26.611 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
  Range (min … max):   26.416 s … 26.739 s    10 runs
 
Summary
  'old' ran
    1.27 ± 0.00 times faster than 'new'
    1.27 ± 0.01 times faster than 'new (small threshold)'

(Here, 'old' is testing a compiled version of the latest 'master'
branch, while 'new' has your patches applied on top.)

Notice from this example that I had a pack with many small objects
(mostly commits and trees), and this change introduces significant
overhead in that case.

It would be nice to understand this overhead and fix it before taking
this change any further.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-29  7:01   ` Han Xin
@ 2021-11-29 19:12     ` Jeff King
  2021-11-30  2:57       ` Han Xin
  0 siblings, 1 reply; 57+ messages in thread
From: Jeff King @ 2021-11-29 19:12 UTC (permalink / raw)
  To: Han Xin; +Cc: Junio C Hamano, Git List, Jiang Xin, Philip Oakley, Han Xin

On Mon, Nov 29, 2021 at 03:01:47PM +0800, Han Xin wrote:

> Han Xin <chiyutianyi@gmail.com> writes:
> >
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > Although we do not recommend users push large binary files to the git repositories,
> > it's difficult to prevent them from doing so. Once, we found a problem with a surge
> > in memory usage on the server. The source of the problem is that a user submitted
> > a single object with a size of 15GB. Once someone initiates a git push, the git
> > process will immediately allocate 15G of memory, resulting in an OOM risk.
> >
> > Through further analysis, we found that when we execute git unpack-objects, in
> > unpack_non_delta_entry(), "void *buf = get_data(size);" will directly allocate
> > memory equal to the size of the object. This is quite a scary thing, because the
> > pre-receive hook has not been executed at this time, and we cannot avoid this by hooks.
> >
> > I got inspiration from the deflate process of zlib, maybe it would be a good idea
> > to change unpack-objects to stream deflate.
> >
> 
> Hi, Jeff.
> 
> I hope you can share with me how GitHub solves this problem.
> 
> As you said in your reply at:
> https://lore.kernel.org/git/YVaw6agcPNclhws8@coredump.intra.peff.net/
> "we don't have a match in unpack-objects, but we always run index-pack
> on incoming packs".
> 
> In the original implementation of "index-pack", for objects larger than
> big_file_threshold, a "fixed_buf" of size 8192 is used to complete the
> calculation of the "oid".

We set transfer.unpackLimit to "1", so we never run unpack-objects at
all. We always run index-pack, and every push, no matter how small,
results in a pack.

We also set GIT_ALLOC_LIMIT to limit any single allocation. We also have
custom code in index-pack to detect large objects (where our definition
of "large" is 100MB by default):

  - for large blobs, we do index it as normal, writing the oid out to a
    file which is then processed by a pre-receive hook (since people
    often push up large files accidentally, the hook generates a nice
    error message, including finding the path at which the blob is
    referenced)

  - for other large objects, we die immediately (with an error message).
    100MB commit messages aren't a common user error, and it closes off
    a whole set of possible integer-overflow parsing attacks (e.g.,
    index-pack in strict-mode will run every tree through fsck_tree(),
    so there's otherwise nothing stopping you from having a 4GB filename
    in a tree).

> I tried the implementation in jk/no-more-unpack-objects, as you noted:
>
>   /* XXX This will expand too-large objects! */
>   if (!data)
>           data = new_data = get_data_from_pack(obj_entry);
>
> When "--unpack" is given, there is a risk here. When I create an object
> larger than 1GB and run index-pack, the result is as follows:
>
>   $ GIT_ALLOC_LIMIT=1024m git index-pack --unpack --stdin <large.pack
>   fatal: attempting to allocate 1228800001 over limit 1073741824

Yeah, that issue was one of the reasons I never sent the "index-pack
--unpack" code to the list. We don't actually use those patches at
GitHub. It was something I was working on for upstream but never
finished.

-Peff

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 15:10   ` Derrick Stolee
@ 2021-11-29 20:44     ` Junio C Hamano
  2021-11-29 22:18       ` Derrick Stolee
  0 siblings, 1 reply; 57+ messages in thread
From: Junio C Hamano @ 2021-11-29 20:44 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Derrick Stolee <stolee@gmail.com> writes:

> My first reaction is to not write into .git/objects/ directly, but
> instead make a .git/objects/tmp/ directory and write within that
> directory. The idea is to prevent leaving stale files in the
> .git/objects/ directory if the process terminates strangely (say,
> a power outage or segfault).

Even if we know the name of the object we are writing beforehand, I
do not think it is a good idea to open-write-close the final object
file.  The approach we already use everywhere is to write into a
tmpfile/lockfile and rename it to the final name.

object-file.c::write_loose_object() uses create_tmpfile() to prepare
a temporary file whose name begins with "tmp_obj_", so that "gc" can
recognize stale ones and remove them.
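
In plain POSIX terms the pattern is roughly the following sketch (not
git's actual helpers; "final_path" stands for the computed object path,
and write_all() is a hypothetical loop around write(2)):

	char tmp[] = "objects/tmp_obj_XXXXXX";
	int fd = mkstemp(tmp);

	if (fd < 0)
		die_errno("unable to create temporary file");
	write_all(fd, data, len);
	if (close(fd))
		die_errno("unable to close %s", tmp);
	/* rename(2) is atomic; no half-written object is ever visible */
	if (rename(tmp, final_path))
		die_errno("unable to rename %s", tmp);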

> If this was an interesting idea to pursue, it does leave a question:
> should we clean up the tmp/ directory when it is empty? That would
> require adding a check in finalize_object_file() that is probably
> best left unchecked (the lstat() would add a cost per loose object
> write that is probably too costly). I would rather leave an empty
> tmp/ directory than add that cost per loose object write.

I am not sure why we want a new tmp/ directory.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 20:44     ` Junio C Hamano
@ 2021-11-29 22:18       ` Derrick Stolee
  2021-11-30  3:23         ` Han Xin
  0 siblings, 1 reply; 57+ messages in thread
From: Derrick Stolee @ 2021-11-29 22:18 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On 11/29/2021 3:44 PM, Junio C Hamano wrote:
> Derrick Stolee <stolee@gmail.com> writes:
> 
>> My first reaction is to not write into .git/objects/ directly, but
>> instead make a .git/objects/tmp/ directory and write within that
>> directory. The idea is to prevent leaving stale files in the
>> .git/objects/ directory if the process terminates strangely (say,
>> a power outage or segfault).
> 
> Even if we know the name of the object we are writing beforehand, I
> do not think it is a good idea to open-write-close the final object
> file.  The approach we already use everywhere is to write into a
> tmpfile/lockfile and rename it to the final name.
> 
> object-file.c::write_loose_object() uses create_tmpfile() to prepare
> a temporary file whose name begins with "tmp_obj_", so that "gc" can
> recognize stale ones and remove them.

The only difference is that the tmp_obj_* file would go into the
loose object directory corresponding to the first two hex characters
of the OID, but that no longer happens now.
 
>> If this was an interesting idea to pursue, it does leave a question:
>> should we clean up the tmp/ directory when it is empty? That would
>> require adding a check in finalize_object_file() that is probably
>> best left unchecked (the lstat() would add a cost per loose object
>> write that is probably too costly). I would rather leave an empty
>> tmp/ directory than add that cost per loose object write.
> 
> I am not sure why we want a new tmp/ directory.

I'm just thinking of a case where this fails repeatedly; I would
rather have those failed tmp_obj_* files isolated in their own
directory. It's an extremely minor point, so I'm fine to drop
the recommendation.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-29 19:12     ` Jeff King
@ 2021-11-30  2:57       ` Han Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-11-30  2:57 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Git List, Jiang Xin, Philip Oakley, Han Xin

On Tue, Nov 30, 2021 at 3:12 AM Jeff King <peff@peff.net> wrote:
> We set transfer.unpackLimit to "1", so we never run unpack-objects at
> all. We always run index-pack, and every push, no matter how small,
> results in a pack.
>
> We also set GIT_ALLOC_LIMIT to limit any single allocation. We also have
> custom code in index-pack to detect large objects (where our definition
> of "large" is 100MB by default):
>
>   - for large blobs, we do index it as normal, writing the oid out to a
>     file which is then processed by a pre-receive hook (since people
>     often push up large files accidentally, the hook generates a nice
>     error message, including finding the path at which the blob is
>     referenced)
>
>   - for other large objects, we die immediately (with an error message).
>     100MB commit messages aren't a common user error, and it closes off
>     a whole set of possible integer-overflow parsing attacks (e.g.,
>     index-pack in strict-mode will run every tree through fsck_tree(),
>     so there's otherwise nothing stopping you from having a 4GB filename
>     in a tree).

Thank you very much for sharing.

The way GitHub handles it reminds me of what Shawn Pearce introduced in
"Scaling up JGit". I guess "multi-pack-index" and "bitmap" must play an
important role in this.

I will seriously consider this solution, thanks a lot.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 22:18       ` Derrick Stolee
@ 2021-11-30  3:23         ` Han Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-11-30  3:23 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Tue, Nov 30, 2021 at 6:18 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 11/29/2021 3:44 PM, Junio C Hamano wrote:
> > Derrick Stolee <stolee@gmail.com> writes:
> >
> >> My first reaction is to not write into .git/objects/ directly, but
> >> instead make a .git/objects/tmp/ directory and write within that
> >> directory. The idea is to prevent leaving stale files in the
> >> .git/objects/ directory if the process terminates strangely (say,
> >> a power outage or segfault).
> >
> > Even if we know the name of the object we are writing beforehand, I
> > do not think it is a good idea to open-write-close the final object
> > file.  The approach we already use everywhere is to write into a
> > tmpfile/lockfile and rename it to the final name
> >
> > object-file.c::write_loose_object() uses create_tmpfile() to prepare
> > a temporary file whose name begins with "tmp_obj_", so that "gc" can
> > recognize stale ones and remove them.
>
> The only difference is that the tmp_obj_* file would go into the
> loose object directory corresponding to the first two hex characters
> of the OID, but that no longer happens now.
>

At the beginning of this patch, I did save the temporary object in the
two-hex-character directory derived from "null_oid", but that is also
very strange behavior. "gc" will indeed clean up these tmp_obj_* files,
whether they are in .git/objects/ or .git/objects/xx/.

Thanks,
-Han Xin

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-29 17:37   ` Derrick Stolee
@ 2021-11-30 13:49     ` Han Xin
  2021-11-30 18:38       ` Derrick Stolee
  0 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-11-30 13:49 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Tue, Nov 30, 2021 at 1:37 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 11/21/2021 10:32 PM, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > By implementing a zstream version of the input_stream interface, we can
> > use a small fixed buffer for "unpack_non_delta_entry()".
> >
> > However, unpacking non-delta objects from a stream instead of from an
> > entire buffer incurs about a 10% performance penalty. Therefore, only
> > unpack objects larger than "big_file_threshold" via the zstream. See
> > the following benchmarks:
> >
> >     $ hyperfine \
> >     --prepare 'rm -rf dest.git && git init --bare dest.git' \
> >     'git -C dest.git unpack-objects <binary_320M.pack'
> >     Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
> >       Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
> >       Range (min … max):    9.786 s … 10.603 s    10 runs
> >
> >     $ hyperfine \
> >     --prepare 'rm -rf dest.git && git init --bare dest.git' \
> >     'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
> >     Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
> >       Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
> >       Range (min … max):    9.884 s … 12.192 s    10 runs
>
> It seems that you want us to compare this pair of results, and
> hyperfine can assist with that by including multiple benchmarks
> (with labels, using '-n') as follows:
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
>   Range (min … max):   20.741 s … 20.909 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
>   Range (min … max):   26.419 s … 26.611 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
>   Range (min … max):   26.416 s … 26.739 s    10 runs
>
> Summary
>   'old' ran
>     1.27 ± 0.00 times faster than 'new'
>     1.27 ± 0.01 times faster than 'new (small threshold)'
>
> (Here, 'old' is testing a compiled version of the latest 'master'
> branch, while 'new' has your patches applied on top.)
>
> Notice from this example I had a pack with many small objects (mostly
> commits and trees) and I see that this change introduces significant
> overhead to this case.
>
> It would be nice to understand this overhead and fix it before taking
> this change any further.
>
> Thanks,
> -Stolee

Can you show me the specific details of the repository you tested, so
that I can analyze it further?

I tested this repository, but did not run into the problem:

 Unpacking objects: 100% (18345/18345), 43.15 MiB

hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' 'git -C dest.git unpack-objects <big.pack' \
        -n 'new' 'new/git -C dest.git unpack-objects <big.pack' \
        -n 'new (small threshold)' 'new/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
Benchmark 1: old
  Time (mean ± σ):     17.403 s ±  0.880 s    [User: 4.996 s, System: 11.803 s]
  Range (min … max):   15.911 s … 19.368 s    10 runs

Benchmark 2: new
  Time (mean ± σ):     17.788 s ±  0.199 s    [User: 5.054 s, System: 12.257 s]
  Range (min … max):   17.420 s … 18.195 s    10 runs

Benchmark 3: new (small threshold)
  Time (mean ± σ):     18.433 s ±  0.711 s    [User: 4.982 s, System: 12.338 s]
  Range (min … max):   17.518 s … 19.775 s    10 runs

Summary
  'old' ran
    1.02 ± 0.05 times faster than 'new'
    1.06 ± 0.07 times faster than 'new (small threshold)'

Thanks,
- Han Xin

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-30 13:49     ` Han Xin
@ 2021-11-30 18:38       ` Derrick Stolee
  2021-12-01 20:37         ` "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...]) Ævar Arnfjörð Bjarmason
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  0 siblings, 2 replies; 57+ messages in thread
From: Derrick Stolee @ 2021-11-30 18:38 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On 11/30/2021 8:49 AM, Han Xin wrote:
> On Tue, Nov 30, 2021 at 1:37 AM Derrick Stolee <stolee@gmail.com> wrote:
>> $ hyperfine \
>>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>>
>> Benchmark 1: old
>>   Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
>>   Range (min … max):   20.741 s … 20.909 s    10 runs
>>
>> Benchmark 2: new
>>   Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
>>   Range (min … max):   26.419 s … 26.611 s    10 runs
>>
>> Benchmark 3: new (small threshold)
>>   Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
>>   Range (min … max):   26.416 s … 26.739 s    10 runs
>>
>> Summary
>>   'old' ran
>>     1.27 ± 0.00 times faster than 'new'
>>     1.27 ± 0.01 times faster than 'new (small threshold)'
>>
>> (Here, 'old' is testing a compiled version of the latest 'master'
>> branch, while 'new' has your patches applied on top.)
>>
>> Notice from this example I had a pack with many small objects (mostly
>> commits and trees) and I see that this change introduces significant
>> overhead to this case.
>>
>> It would be nice to understand this overhead and fix it before taking
>> this change any further.
>>
>> Thanks,
>> -Stolee
> 
> Can you share the specifics of the repository you tested, so that I
> can analyze it further?

I used a pack-file from an internal repo. It happened to be using
partial clone, so here is a repro with the git/git repository
after cloning this way:

$ git clone --no-checkout --filter=blob:none https://github.com/git/git

(copy the large .pack from git/.git/objects/pack/ to big.pack)

$ hyperfine \
	--prepare 'rm -rf dest.git && git init --bare dest.git' \
	-n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
	-n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
	-n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'

Benchmark 1: old
  Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
  Range (min … max):   82.042 s … 83.587 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
  Range (min … max):   100.866 s … 102.633 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
  Range (min … max):   100.639 s … 101.375 s    10 runs
 
Summary
  'old' ran
    1.22 ± 0.01 times faster than 'new (small threshold)'
    1.23 ± 0.01 times faster than 'new'

I'm also able to repro this with a smaller repo (microsoft/scalar)
so the tests complete much faster:

$ hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <small.pack' \
        -n 'new' '~/_git/git/git -C dest.git unpack-objects <small.pack' \
        -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <small.pack'

Benchmark 1: old
  Time (mean ± σ):      3.295 s ±  0.023 s    [User: 1.063 s, System: 2.228 s]
  Range (min … max):    3.269 s …  3.351 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):      3.592 s ±  0.105 s    [User: 1.261 s, System: 2.328 s]
  Range (min … max):    3.378 s …  3.679 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):      3.584 s ±  0.144 s    [User: 1.241 s, System: 2.339 s]
  Range (min … max):    3.359 s …  3.747 s    10 runs
 
Summary
  'old' ran
    1.09 ± 0.04 times faster than 'new (small threshold)'
    1.09 ± 0.03 times faster than 'new'

It's not the same relative overhead, but still significant.

These pack-files contain (mostly) small objects, no large blobs.
I know that's not the target of your efforts, but it would be
good to avoid a regression here.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 57+ messages in thread

* "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...])
  2021-11-30 18:38       ` Derrick Stolee
@ 2021-12-01 20:37         ` Ævar Arnfjörð Bjarmason
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  1 sibling, 0 replies; 57+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-01 20:37 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Han Xin, David Peter


I hadn't sent a shameless plug for my "git hyperfine" script to the
list; perhaps this is a good time. It's just a thin shell-script wrapper
around "hyperfine" that I wrote the other day, which...

On Tue, Nov 30 2021, Derrick Stolee wrote:

> [...]
> I used a pack-file from an internal repo. It happened to be using
> partial clone, so here is a repro with the git/git repository
> after cloning this way:
>
> $ git clone --no-checkout --filter=blob:none https://github.com/git/git
>
> (copy the large .pack from git/.git/objects/pack/ to big.pack)
>
> $ hyperfine \
> 	--prepare 'rm -rf dest.git && git init --bare dest.git' \
> 	-n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
> 	-n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
> 	-n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
>   Range (min … max):   82.042 s … 83.587 s    10 runs
>  
> Benchmark 2: new
>   Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
>   Range (min … max):   100.866 s … 102.633 s    10 runs
>  
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
>   Range (min … max):   100.639 s … 101.375 s    10 runs
>  
> Summary
>   'old' ran
>     1.22 ± 0.01 times faster than 'new (small threshold)'
>     1.23 ± 0.01 times faster than 'new'

...adds enough sugar around "hyperfine" itself to do this as e.g. (the
"-s" is a feature I submitted to hyperfine itself; it's not in a release
yet[1], but in this case you could also use "-p"):

    git hyperfine -L rev v2.20.0,origin/master \
        -s 'if ! test -d redis.git; then git clone --bare --filter=blob:none https://github.com/redis/redis; fi && make' \
        -p 'rm -rf dest.git; git init --bare dest.git' \
        './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)'

The sugar being that for each named "rev" parameter it'll set up "git
worktree" for you, so under the hood each of those is chdir-ing to the
respective revision of:
    
    $ git worktree list
    [...]
    /run/user/1001/git-hyperfine/origin/master  abe6bb39053 (detached HEAD)
    /run/user/1001/git-hyperfine/v2.33.0        225bc32a989 (detached HEAD)

That they're named revisions and not git-rev-parse'd is intentional,
since you'll benefit from faster incremental "make" (even if using
"ccache"). I'm typically benchmarking HEAD~1,HEAD~0.

The output will then use those "rev" parameters, and be e.g.:
    
    Benchmark 1: ./git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'v2.20.0
      Time (mean ± σ):      6.678 s ±  0.046 s    [User: 4.525 s, System: 2.117 s]
      Range (min … max):    6.619 s …  6.765 s    10 runs
     
    Benchmark 2: ./git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'origin/master
      Time (mean ± σ):      6.756 s ±  0.074 s    [User: 4.586 s, System: 2.134 s]
      Range (min … max):    6.691 s …  6.941 s    10 runs
     
    Summary
      './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'v2.20.0' ran
        1.01 ± 0.01 times faster than './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'origin/master'

I think if you're routinely benchmarking N different git versions you'll
find it handy; it also has configurable hook support (using git config),
so e.g. it's easy to copy your config.mak in place in the
worktrees. E.g. my config is:

    $ git -P config --get-regexp '^hyperfine'
    hyperfine.run-dir $XDG_RUNTIME_DIR/git-hyperfine
    hyperfine.xargs-options -r
    hyperfine.hook.setup ~/g/git.meta/config.mak.sh

It's hosted at https://github.com/avar/git-hyperfine/ and
https://gitlab.com/avar/git-hyperfine/; it's implemented in (portable)
POSIX shell script.

There are surely some bugs in it; one known issue is that unlike
hyperfine it doesn't accept spaces in the parameters to -L, because
I'm screwing up some quoting-within-quoting in the (shell-script)
implementation (suggestions for that particular one are most welcome).

I hacked it up after this suggestion from Jeff King[2] of moving t/perf
over to it.

I haven't done any of that legwork, but I think a wrapper like
"git-hyperfine" that prepares worktrees for the N revisions we're
benchmarking is a good direction to go in.

We don't use git-worktrees in t/perf, but probably could for most/all
tests. In any case it would be easy to have the script set up the revs to
be benchmarked in some hookable custom manner, to have it do exactly what
t/perf/run is doing now.

1. https://github.com/sharkdp/hyperfine/commit/017d55a
2. https://lore.kernel.org/git/YV+zFqi4VmBVJYex@coredump.intra.peff.net/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-30 18:38       ` Derrick Stolee
  2021-12-01 20:37         ` "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...]) Ævar Arnfjörð Bjarmason
@ 2021-12-02  7:33         ` Han Xin
  2021-12-02 13:53           ` Derrick Stolee
  1 sibling, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-12-02  7:33 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Wed, Dec 1, 2021 at 2:38 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> I used a pack-file from an internal repo. It happened to be using
> partial clone, so here is a repro with the git/git repository
> after cloning this way:
>
> $ git clone --no-checkout --filter=blob:none https://github.com/git/git
>
> (copy the large .pack from git/.git/objects/pack/ to big.pack)
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
>   Range (min … max):   82.042 s … 83.587 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
>   Range (min … max):   100.866 s … 102.633 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
>   Range (min … max):   100.639 s … 101.375 s    10 runs
>
> Summary
>   'old' ran
>     1.22 ± 0.01 times faster than 'new (small threshold)'
>     1.23 ± 0.01 times faster than 'new'
>
> I'm also able to repro this with a smaller repo (microsoft/scalar)
> so the tests complete much faster:
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <small.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <small.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <small.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):      3.295 s ±  0.023 s    [User: 1.063 s, System: 2.228 s]
>   Range (min … max):    3.269 s …  3.351 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):      3.592 s ±  0.105 s    [User: 1.261 s, System: 2.328 s]
>   Range (min … max):    3.378 s …  3.679 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):      3.584 s ±  0.144 s    [User: 1.241 s, System: 2.339 s]
>   Range (min … max):    3.359 s …  3.747 s    10 runs
>
> Summary
>   'old' ran
>     1.09 ± 0.04 times faster than 'new (small threshold)'
>     1.09 ± 0.03 times faster than 'new'
>
> It's not the same relative overhead, but still significant.
>
> These pack-files contain (mostly) small objects, no large blobs.
> I know that's not the target of your efforts, but it would be
> good to avoid a regression here.
>
> Thanks,
> -Stolee

With your help, I did catch this performance problem, which was
introduced in this patch:
https://lore.kernel.org/git/20211122033220.32883-4-chiyutianyi@gmail.com/

This patch changes the original whole-buffer reading into stream reading,
but its problem is that even when the whole object data is read at once,
it still performs an additional git_deflate() call and the associated
data transfer.

I will fix it in a follow-up patch.
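
(The eventual fix, condensed here from the v4 patch "object-file.c:
read stream in a loop in write_loose_object()" later in this thread,
decides when to flush from the known input size instead of waiting for
an extra read to return NULL:)

    /* inside the deflate loop of write_loose_object() */
    if (!stream.avail_in) {
            buf = in_stream->read(in_stream, &stream.avail_in);
            stream.next_in = (void *)buf;
            /* all expected input is in hand: finish in this pass */
            if (in_stream->size + hdrlen == stream.total_in + stream.avail_in)
                    flush = Z_FINISH;
    }
    ret = git_deflate(&stream, flush);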

Thanks,
-Han Xin

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-02 13:53           ` Derrick Stolee
  0 siblings, 0 replies; 57+ messages in thread
From: Derrick Stolee @ 2021-12-02 13:53 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On 12/2/2021 2:33 AM, Han Xin wrote:
> On Wed, Dec 1, 2021 at 2:38 AM Derrick Stolee <stolee@gmail.com> wrote:
>> These pack-files contain (mostly) small objects, no large blobs.
>> I know that's not the target of your efforts, but it would be
>> good to avoid a regression here.
>>
>> Thanks,
>> -Stolee
> 
> With your help, I did catch this performance problem, which was
> introduced in this patch:
> https://lore.kernel.org/git/20211122033220.32883-4-chiyutianyi@gmail.com/
> 
> This patch changes the original whole-buffer reading into stream reading,
> but its problem is that even when the whole object data is read at once,
> it still performs an additional git_deflate() call and the associated
> data transfer.

I'm glad you found it!

> I will fix it in a follow-up patch.

Looking forward to it.

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v4 0/5] unpack large objects in stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
  2021-11-29  7:01   ` Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v3:
* Add "size" to "struct input_stream" which used by following commits.

* Increase the buffer size of "struct input_zstream_data" from 4096 to
  8192, which is consistent with the "fixed_buf" in the "index-pack.c".

* Refactor "read stream in a loop in write_loose_object()", which had
  introduced a performance problem reported by Derrick Stolee[1].

* Rewrite benchmarks in "unpack-objects: unpack_non_delta_entry() read
  data in a stream" with suggestions from Derrick Stolee[1] and
  Ævar Arnfjörð Bjarmason[2].
  Now use "scalar.git" for benchmarking; it contains more than 28000
  objects, 96 of which are larger than 16kB.

1. https://lore.kernel.org/git/8ff89e50-1b80-7932-f0e2-af401ee04bb1@gmail.com/
2. https://lore.kernel.org/git/211201.86r1aw9gbd.gmgdl@evledraar.gmail.com/

Han Xin (5):
  object-file: refactor write_loose_object() to read buffer from stream
  object-file.c: handle undetermined oid in write_loose_object()
  object-file.c: read stream in a loop in write_loose_object()
  unpack-objects.c: add dry_run mode for get_data()
  unpack-objects: unpack_non_delta_entry() read data in a stream

 builtin/unpack-objects.c            |  93 +++++++++++++++++++++++--
 object-file.c                       | 102 ++++++++++++++++++++++++----
 object-store.h                      |  10 +++
 t/t5590-unpack-non-delta-objects.sh |  76 +++++++++++++++++++++
 4 files changed, 262 insertions(+), 19 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v3:
1:  8640b04f6d ! 1:  af707ef304 object-file: refactor write_loose_object() to read buffer from stream
    @@ object-file.c: int write_object_file_flags(const void *buf, unsigned long len,
     +			.buf = buf,
     +			.len = len,
     +		},
    ++		.size = len,
     +	};
      
      	/* Normally if we have it in the pack then we do not bother writing
    @@ object-file.c: int hash_object_file_literally(const void *buf, unsigned long len
     +			.buf = buf,
     +			.len = len,
     +		},
    ++		.size = len,
     +	};
      
      	/* type string, SP, %lu of the length plus NUL must fit this */
    @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime)
      	if (has_loose_object(oid))
      		return 0;
      	buf = read_object(the_repository, oid, &type, &len);
    ++	in_stream.size = len;
      	if (!buf)
      		return error(_("cannot read object for %s"), oid_to_hex(oid));
     +	data.buf = buf;
    @@ object-store.h: struct object_directory {
     +struct input_stream {
     +	const void *(*read)(struct input_stream *, unsigned long *len);
     +	void *data;
    ++	size_t size;
     +};
     +
      KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
2:  d4a2caf2bd = 2:  321ad90d8e object-file.c: handle undetermined oid in write_loose_object()
3:  2575900449 ! 3:  1992ac39af object-file.c: read stream in a loop in write_loose_object()
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -		ret = git_deflate(&stream, Z_FINISH);
     +		if (!stream.avail_in) {
     +			buf = in_stream->read(in_stream, &stream.avail_in);
    -+			if (buf) {
    -+				stream.next_in = (void *)buf;
    -+				in0 = (unsigned char *)buf;
    -+			} else {
    ++			stream.next_in = (void *)buf;
    ++			in0 = (unsigned char *)buf;
    ++			/* All data has been read. */
    ++			if (in_stream->size + hdrlen == stream.total_in + stream.avail_in)
     +				flush = Z_FINISH;
    -+			}
     +		}
     +		ret = git_deflate(&stream, flush);
      		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
      		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
      			die(_("unable to write loose object file"));
    + 		stream.next_out = compressed;
    + 		stream.avail_out = sizeof(compressed);
    +-	} while (ret == Z_OK);
    ++	} while (ret == Z_OK || ret == Z_BUF_ERROR);
    + 
    + 	if (ret != Z_STREAM_END)
    + 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
4:  ca93ecc780 = 4:  c41eb06533 unpack-objects.c: add dry_run mode for get_data()
5:  39a072ee2a ! 5:  9427775bdc unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ Commit message
         larger than the "big_file_threshold" in zstream. See the following
         benchmarks:
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -C dest.git unpack-objects <binary_320M.pack'
    -        Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
    -          Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
    -          Range (min … max):    9.786 s … 10.603 s    10 runs
    +        hyperfine \
    +          --setup \
    +          'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
    +          --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +          -n 'old' 'git -C dest.git unpack-objects <small.pack' \
    +          -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
    +          -n 'new (small threshold)' \
    +          'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
    +        Benchmark 1: old
    +          Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
    +          Range (min … max):    6.018 s …  6.189 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    -        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
    -          Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
    -          Range (min … max):    9.884 s … 12.192 s    10 runs
    +        Benchmark 2: new
    +          Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
    +          Range (min … max):    6.030 s …  6.142 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -C dest.git unpack-objects <binary_96M.pack'
    -        Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
    -          Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
    -          Range (min … max):    2.639 s …  2.743 s    10 runs
    +        Benchmark 3: new (small threshold)
    +          Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
    +          Range (min … max):    6.711 s …  6.809 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    -        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
    -          Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
    -          Range (min … max):    2.679 s …  3.125 s    10 runs
    +        Summary
    +          'old' ran
    +            1.00 ± 0.01 times faster than 'new'
    +            1.11 ± 0.01 times faster than 'new (small threshold)'
     
    +    Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      
     +struct input_zstream_data {
     +	git_zstream *zstream;
    -+	unsigned char buf[4096];
    ++	unsigned char buf[8192];
     +	int status;
     +};
     +
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	struct input_stream in_stream = {
     +		.read = feed_input_zstream,
     +		.data = &data,
    ++		.size = size,
     +	};
     +	struct object_id *oid = &obj_list[nr].oid;
     +	int ret;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
  2021-11-29  7:01   ` Han Xin
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all available memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface. As a first
step, we provide a simple implementation that feeds the entire buffer
through the "stream" to "write_loose_object()", as a pure refactoring.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  6 ++++++
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb972cdccd..82656f7428 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct simple_input_stream_data {
+	const void *buf;
+	unsigned long len;
+};
+
+static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
+{
+	struct simple_input_stream_data *data = in_stream->data;
+
+	if (data->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = data->len;
+	data->len = 0;
+	return data->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const void *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,14 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+		.size = len,
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1997,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+		.size = len,
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct simple_input_stream_data data;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
+	in_stream.size = len;
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..a84d891d60 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	size_t size;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.0


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (2 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 3/5] object-file.c: read stream in a loop " Han Xin
                     ` (2 subsequent siblings)
  6 siblings, 2 replies; 57+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance, so "write_loose_object()" needs to handle an undetermined oid.

In the original implementation, we know the oid and can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in the ".git/objects/"
directory instead.
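
Concretely, the two locations look like this (an illustration, not part
of the patch; the "tmp_obj_XXXXXX" pattern is the one create_tmpfile()
uses, and the oid shown is the well-known empty blob):

    .git/objects/tmp_obj_XXXXXX                               # oid still unknown
    .git/objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391    # after hashing and rename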

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 82656f7428..1c41587bfb 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const void *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else {
+		loose_object_path(the_repository, &filename, oid);
+	}
 
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
@@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		oidcpy((struct object_id *)oid, &parano_oid);
+		loose_object_path(the_repository, &filename, oid);
+
+		/* We finally know the object path, and create the missing dir. */
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+			if (adjust_shared_perm(dir.buf))
+				return -1;
+			strbuf_release(&dir);
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v4 3/5] object-file.c: read stream in a loop in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (3 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In order to prepare the stream version of "write_loose_object()", read
the input stream in a loop in "write_loose_object()", so that we can
feed the contents of a large blob object to "write_loose_object()"
using a small fixed buffer.

Note that with chunked input, git_deflate() may return Z_BUF_ERROR
while the input window has run dry; zlib treats this as "no progress
was possible", not as a hard error, so the loop continues on it and
only a missing Z_STREAM_END at the end is fatal.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index 1c41587bfb..fa54e39c2c 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1890,7 +1890,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const void *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1925,18 +1925,23 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			buf = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)buf;
+			in0 = (unsigned char *)buf;
+			/* All data has been read. */
+			if (in_stream->size + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
 		stream.next_out = compressed;
 		stream.avail_out = sizeof(compressed);
-	} while (ret == Z_OK);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-- 
2.34.0


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (4 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 3/5] object-file.c: read stream in a loop " Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 1 reply; 57+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used only to verify that the data
inflates correctly; the returned buffer is not used at all and is freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, allocate only a
small fixed-size buffer when calling "get_data()" in dry_run mode.
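
In terms of peak allocation per object this means (the sizes here are
the ones used elsewhere in this thread, for illustration):

    object size    normal mode    dry_run mode
    1.5 MB         1.5 MB         4 KiB
    320 MB         320 MB         4 KiB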

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8d68acd662 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,16 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run ? 4096 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +125,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +329,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +363,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +402,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (5 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
                       ` (2 more replies)
  6 siblings, 3 replies; 57+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all available memory and cause OOM.

By implementing a zstream version of the input_stream interface, we can
use a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an
entire buffer incurs a 10% performance penalty. Therefore, only unpack
objects larger than the "big_file_threshold" in zstream. See the following
benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      -n 'old' 'git -C dest.git unpack-objects <small.pack' \
      -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
      -n 'new (small threshold)' \
      'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
    Benchmark 1: old
      Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
      Range (min … max):    6.018 s …  6.189 s    10 runs

    Benchmark 2: new
      Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
      Range (min … max):    6.030 s …  6.142 s    10 runs

    Benchmark 3: new (small threshold)
      Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
      Range (min … max):    6.711 s …  6.809 s    10 runs

    Summary
      'old' ran
        1.00 ± 0.01 times faster than 'new'
        1.11 ± 0.01 times faster than 'new (small threshold)'
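
As a rough sketch, the pipeline for an over-threshold blob does the
same inflate-then-re-deflate work as before, but through small fixed
windows instead of one whole-object buffer, which is where the
streaming penalty comes from:

    pack data -> fill() -> git_inflate() -> 8 KiB chunks
              -> input_stream.read() -> write_loose_object()
              -> hash + git_deflate() -> tmpfile -> rename to loose object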

Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c            | 77 ++++++++++++++++++++++++++++-
 object-file.c                       |  6 +--
 object-store.h                      |  4 ++
 t/t5590-unpack-non-delta-objects.sh | 76 ++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+), 4 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 8d68acd662..bedc494e2d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -326,11 +326,86 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+		.size = size,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index fa54e39c2c..71d510614b 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,9 +1878,9 @@ static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
 	return data->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index a84d891d60..ac5b11ec16 100644
--- a/object-store.h
+++ b/object-store.h
@@ -229,6 +229,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..01d950d119
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receiving packs'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --progress --revs test)
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	test_i18ngrep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	(
+		cd unpack-test.git &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.0


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
  2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
  2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2 siblings, 0 replies; 57+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:07 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all available memory and cause OOM.
>
> By implementing a zstream version of the input_stream interface, we can
> use a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpacking non-delta objects from a stream instead of from an
> entire buffer incurs a 10% performance penalty. Therefore, only unpack
> objects larger than the "big_file_threshold" in zstream. See the following
> benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git' \
>       -n 'old' 'git -C dest.git unpack-objects <small.pack' \
>       -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
>       -n 'new (small threshold)' \
>       'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
>     Benchmark 1: old
>       Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
>       Range (min … max):    6.018 s …  6.189 s    10 runs
>
>     Benchmark 2: new
>       Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
>       Range (min … max):    6.030 s …  6.142 s    10 runs
>
>     Benchmark 3: new (small threshold)
>       Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
>       Range (min … max):    6.711 s …  6.809 s    10 runs
>
>     Summary
>       'old' ran
>         1.00 ± 0.01 times faster than 'new'
>         1.11 ± 0.01 times faster than 'new (small threshold)'

So before, we used core.bigfilethreshold for two things (or more?):
whether we show a diff for it (we mark it "binary") and whether it's
split into a loose object.

Now it's three things: we've added "this is the threshold at which
we'll stream the object" to that.

Might it make sense to squash something like this in, so we can have our
cake & eat it too?

With this I get, where HEAD~0 is this change:
    
    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0' ran
        1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.06 ± 0.14 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.20 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

I.e. it's 5% slower, not 20% (I haven't looked into why), but we won't
stream out 16k..128MB objects (maybe the repo has even bigger ones?).

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a1..601b7a2418f 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will then be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold' setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index bedc494e2db..94ce275c807 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -400,7 +400,7 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 	void *buf;
 
 	/* Write large blob in stream without allocating full buffer. */
-	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
 		write_stream_blob(nr, size);
 		return;
 	}
diff --git a/cache.h b/cache.h
index eba12487b99..4037c7fd849 100644
--- a/cache.h
+++ b/cache.h
@@ -964,6 +964,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a706..7b122a142a8 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 9da7f3c1a19..4fcc3de7417 100644
--- a/environment.c
+++ b/environment.c
@@ -46,6 +46,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
  2021-12-06  2:51       ` Han Xin
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 57+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:21 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance, so "write_loose_object()" needs to handle an undetermined oid.
>
> In the original implementation, we know the oid and can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in the ".git/objects/"
> directory instead.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 30 ++++++++++++++++++++++++++++--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 82656f7428..1c41587bfb 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	const void *buf;
>  	unsigned long len;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);
> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');
> +	} else {
> +		loose_object_path(the_repository, &filename, oid);
> +	}
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
> @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>  
>  	close_loose_object(fd);
>  
> +	if (is_null_oid(oid)) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);
> +		loose_object_path(the_repository, &filename, oid);

Why are we breaking the promise that "oid" is constant here? I tested
locally with the below on top, and it seems to work (at least no tests
broke). Isn't it preferable to the cast & the caller having its "oid"
changed?

diff --git a/object-file.c b/object-file.c
index 71d510614b9..d014e6942ea 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1958,10 +1958,11 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 	close_loose_object(fd);
 
 	if (is_null_oid(oid)) {
+		struct object_id oid2;
 		int dirlen;
 
-		oidcpy((struct object_id *)oid, &parano_oid);
-		loose_object_path(the_repository, &filename, oid);
+		oidcpy(&oid2, &parano_oid);
+		loose_object_path(the_repository, &filename, &oid2);
 
 		/* We finally know the object path, and create the missing dir. */
 		dirlen = directory_size(filename.buf);

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
  2021-12-06  2:07       ` Han Xin
  0 siblings, 1 reply; 57+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:28 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "write_loose_object()" in a
> stream. The input stream is implemented as an interface. In the first
> step, we make a simple implementation, feeding the entire buffer in the
> "stream" to "write_loose_object()" as a refactor.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  6 ++++++
>  2 files changed, 55 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb972cdccd..82656f7428 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> +struct simple_input_stream_data {
> +	const void *buf;
> +	unsigned long len;
> +};

I see why you picked "const void *buf" here over, say, "const char *":
it's what "struct input_stream" uses.

But why not use size_t for the length, as input_stream does?
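
I.e. (sketch):

	struct simple_input_stream_data {
		const void *buf;
		size_t len;
	};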

> +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> +{
> +	struct simple_input_stream_data *data = in_stream->data;
> +
> +	if (data->len == 0) {

nit: if (!data->len)...

> +		*len = 0;
> +		return NULL;
> +	}
> +	*len = data->len;
> +	data->len = 0;
> +	return data->buf;

But isn't the body of this function the same as:

        *len = data->len;
        if (!*len)
                return NULL;
        data->len = 0;
        return data->buf;

I.e. you don't need the condition for setting "*len": if data->len is
0, then *len is also 0. You just want to return NULL afterwards, and
not set data->len to 0 (harmless, but no need) or return data->buf.
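
Spelled out, the whole function would then become (untested sketch,
with the "*len" dereference fixed):

	static const void *feed_simple_input_stream(struct input_stream *in_stream,
						    unsigned long *len)
	{
		struct simple_input_stream_data *data = in_stream->data;

		*len = data->len;
		if (!*len)
			return NULL;
		data->len = 0;
		return data->buf;
	}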
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +		.size = len,
> +	};

Maybe it's that I'm unused to it, but I find this a bit more readable:
	
	@@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
	 {
	 	char hdr[MAX_HEADER_LEN];
	 	int hdrlen = sizeof(hdr);
	+	struct simple_input_stream_data tmp = {
	+		.buf = buf,
	+		.len = len,
	+	};
	 	struct input_stream in_stream = {
	 		.read = feed_simple_input_stream,
	-		.data = (void *)&(struct simple_input_stream_data) {
	-			.buf = buf,
	-			.len = len,
	-		},
	+		.data = (void *)&tmp,
	 		.size = len,
	 	};
	
Yes there's a temporary variable, but no denser inline casting. Also
easier to step through in a debugger (which will have the type
information on "tmp").

>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>  	char *header;
>  	int hdrlen, status = 0;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +		.size = len,
> +	};

ditto..

>  	/* type string, SP, %lu of the length plus NUL must fit this */
>  	hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  		goto cleanup;
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		goto cleanup;
> -	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>  
>  cleanup:
>  	free(header);
> @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen;
>  	int ret;
> +	struct simple_input_stream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = &data,
> +	};
>  
>  	if (has_loose_object(oid))
>  		return 0;
>  	buf = read_object(the_repository, oid, &type, &len);
> +	in_stream.size = len;

Why are we setting this here?...

>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));

...Instead of after this point, as we may error and never use it?

> +	data.buf = buf;
> +	data.len = len;

Probably won't matter, just a nit...
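
I.e. something like this (sketch, just reordering the quoted lines):

	buf = read_object(the_repository, oid, &type, &len);
	if (!buf)
		return error(_("cannot read object for %s"), oid_to_hex(oid));
	in_stream.size = len;
	data.buf = buf;
	data.len = len;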

> +struct input_stream {
> +	const void *(*read)(struct input_stream *, unsigned long *len);
> +	void *data;
> +	size_t size;
> +};
> +

Ah, and here's the size_t... :)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  2021-12-06  3:12       ` Han Xin
  1 sibling, 1 reply; 57+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:41 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle undetermined oid in function
> "write_loose_object()".
>
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in ".git/objects/"
> directory instead.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 30 ++++++++++++++++++++++++++++--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 82656f7428..1c41587bfb 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	const void *buf;
>  	unsigned long len;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);

Why re-use this & leak memory? An existing strbuf use in this function
doesn't leak in the same way. Just release it as in the below patch on
top (the "ret" vs. "err" variable naming is a bit confused; maybe it
could do with a prep cleanup step).

> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');

And once we do that this could just become:

	strbuf_addf(&filename, "%s/", ...)

There are existing uses of this pattern, so maybe not worth it, but it
allows you to remove the braces on the if/else.

diff --git a/object-file.c b/object-file.c
index 8bd89e7b7ba..2b52f3fc1cc 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1880,7 +1880,7 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 		       int hdrlen, struct input_stream *in_stream,
 		       time_t mtime, unsigned flags)
 {
-	int fd, ret;
+	int fd, ret, err = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1892,7 +1892,6 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
-		strbuf_reset(&filename);
 		strbuf_addstr(&filename, the_repository->objects->odb->path);
 		strbuf_addch(&filename, '/');
 	} else {
@@ -1902,11 +1901,12 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
 		if (flags & HASH_SILENT)
-			return -1;
+			err = -1;
 		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
 		else
-			return error_errno(_("unable to create temporary file"));
+			err = error_errno(_("unable to create temporary file"));
+		goto cleanup;
 	}
 
 	/* Set it up */
@@ -1968,10 +1968,13 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 			struct strbuf dir = STRBUF_INIT;
 			strbuf_add(&dir, filename.buf, dirlen - 1);
 			if (mkdir(dir.buf, 0777) && errno != EEXIST)
-				return -1;
-			if (adjust_shared_perm(dir.buf))
-				return -1;
-			strbuf_release(&dir);
+				err = -1;
+			else if (adjust_shared_perm(dir.buf))
+				err = -1;
+			else
+				strbuf_release(&dir);
+			if (err < 0)
+				goto cleanup;
 		}
 	}
 
@@ -1984,7 +1987,10 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 			warning_errno(_("failed utime() on %s"), tmp_file.buf);
 	}
 
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&filename);
+	return err;
 }
 
 static int freshen_loose_object(const struct object_id *oid)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
  2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2 siblings, 0 replies; 57+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:54 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> new file mode 100755
> index 0000000000..01d950d119
> --- /dev/null
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -0,0 +1,76 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects when receive pack'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +test_expect_success "create commit with big blobs (1.5 MB)" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	(
> +		cd .git &&
> +		find objects/?? -type f | sort

...are these...

> +	) >expect &&
> +	PACK=$(echo main | git pack-objects --progress --revs test)

Is --progress needed?

> +'
> +
> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'prepare dest repository' '
> +	git init --bare dest.git &&
> +	git -C dest.git config core.bigFileThreshold 2m &&
> +	git -C dest.git config receive.unpacklimit 100

I think it would be better to just (this could be rolled into a
function; see the sketch below):

	test_when_finished "rm -rf dest.git" &&
	git init dest.git &&
	git -C dest.git config ...

Then you can use it with e.g. --run=3-4 and not have it error out
because of skipped setup.

A lot of our tests fail like that, but in this case fixing it seems
trivial.
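
E.g. something like (untested sketch):

	setup_dest () {
		test_when_finished "rm -rf dest.git" &&
		git init --bare dest.git &&
		git -C dest.git config core.bigFileThreshold 2m &&
		git -C dest.git config receive.unpackLimit 100
	}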



> +'
> +
> +test_expect_success 'fail to unpack-objects: cannot allocate' '
> +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> +	test_i18ngrep "fatal: attempting to allocate" err &&

nit: just "grep", not "test_i18ngrep"

> +	(
> +		cd dest.git &&
> +		find objects/?? -type f | sort

..."find" needed over just globbing?:

    obj=$(echo objects/*/*)

?

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
  2021-12-06  3:20       ` Han Xin
  0 siblings, 1 reply; 57+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:59 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> +	unsigned long bufsize = dry_run ? 4096 : size;
> +	void *buf = xmallocz(bufsize);

It's probably nothing, but in your CL you note that you changed another
hardcoding from 4k to 8k, should this one still be 4k?

It's probably fine, just wondering...

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
  2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2 siblings, 0 replies; 57+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 14:05 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [..]
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +	char hdr[32];
> +	int hdrlen;
> +	git_zstream zstream;
> +	struct input_zstream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_input_zstream,
> +		.data = &data,
> +		.size = size,
> +	};
> +	struct object_id *oid = &obj_list[nr].oid;
> +	int ret;
> +
> +	memset(&zstream, 0, sizeof(zstream));
> +	memset(&data, 0, sizeof(data));
> +	data.zstream = &zstream;
> +	git_inflate_init(&zstream);
> +
> +	/* Generate the header */
> +	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> +
> +	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
> +		die(_("failed to write object in stream %d"), ret);
> +
> +	if (zstream.total_out != size || data.status != Z_STREAM_END)
> +		die(_("inflate returned %d"), data.status);
> +	git_inflate_end(&zstream);
> +
> +	if (strict && !dry_run) {
> +		struct blob *blob = lookup_blob(the_repository, oid);
> +		if (blob)
> +			blob->object.flags |= FLAG_WRITTEN;
> +		else
> +			die("invalid blob object from stream");
> +	}
> +	obj_list[nr].obj = NULL;
> +}

Just a side-note, I think (but am not 100% sure) that these existing
occurrences aren't needed due to our use of CALLOC_ARRAY():
    
    diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
    index 4a9466295ba..00b349412c5 100644
    --- a/builtin/unpack-objects.c
    +++ b/builtin/unpack-objects.c
    @@ -248,7 +248,6 @@ static void write_object(unsigned nr, enum object_type type,
                            die("failed to write object");
                    added_object(nr, type, buf, size);
                    free(buf);
    -               obj_list[nr].obj = NULL;
            } else if (type == OBJ_BLOB) {
                    struct blob *blob;
                    if (write_object_file(buf, size, type_name(type),
    @@ -262,7 +261,6 @@ static void write_object(unsigned nr, enum object_type type,
                            blob->object.flags |= FLAG_WRITTEN;
                    else
                            die("invalid blob object");
    -               obj_list[nr].obj = NULL;
            } else {
                    struct object *obj;
                    int eaten;

The reason I'm noting it is that the same seems to be true of your new
addition here. I.e. are these assignments to NULL needed?

Anyway, the reason I started poking at this is that this
write_stream_blob() seems to duplicate much of write_object(). AFAICT
only the writing part is really different, the part where we
lookup_blob() after, set FLAG_WRITTEN etc. is all the same.

Why can't we call write_object() here?

The obvious answer seems to be that the call to write_object_file()
isn't prepared to do the sort of streaming that you want, so instead
you're bypassing it and calling write_loose_object() directly.

I haven't tried this myself, but isn't a better and cleaner approach
here to not add another meaning to what is_null_oid() means, but to just
add a HASH_STREAM flag that'll get passed down as "unsigned flags" to
write_loose_object()? See FLAG_BITS in object.h.

Then the "obj_list[nr].obj" here could also become
"obj_list[nr].obj.flags |= (1u<<12)" or whatever (but that wouldn't
strictly be needed, I think).

But by adding the "HASH_STREAM" flag you could, I think, stop
duplicating the "Generate the header" etc. here and call
write_object_file_flags().

I don't so much care about how it's done within unpack-objects.c, but
not having another meaning to is_null_oid() in play would be really
nice, and in this case it seems entirely avoidable.
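
A rough sketch of that flag approach (untested; the bit value is
illustrative, not taken from the actual headers):

	/* cache.h: alongside the existing HASH_* flags */
	#define HASH_STREAM (1 << 4)

	/* write_loose_object(): instead of overloading is_null_oid(oid) */
	if (flags & HASH_STREAM) {
		/*
		 * The oid is not known yet: write the tmpfile under the
		 * odb root and fill in the oid from parano_oid once the
		 * stream has been deflated and hashed.
		 */
	}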

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  2:07       ` Han Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-12-06  2:07 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:41 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > This can be improved by feeding data to "write_loose_object()" in a
> > stream. The input stream is implemented as an interface. In the first
> > step, we make a simple implementation, feeding the entire buffer in the
> > "stream" to "write_loose_object()" as a refactor.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
> >  object-store.h |  6 ++++++
> >  2 files changed, 55 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index eb972cdccd..82656f7428 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +struct simple_input_stream_data {
> > +     const void *buf;
> > +     unsigned long len;
> > +};
>
> I see why you picked "const void *buf" here, over say const char *, it's
> what "struct input_stream" uses.
>
> But why not use size_t for the length, as input_stream does?
>

Yes, "size_t" will be better here.

> > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> > +{
> > +     struct simple_input_stream_data *data = in_stream->data;
> > +
> > +     if (data->len == 0) {
>
> nit: if (!data->len)...
>

Will apply.

> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +     *len = data->len;
> > +     data->len = 0;
> > +     return data->buf;
>
> But isn't the body of this function the same as:
>
>         *len = data->len;
>         if (!*len)
>                 return NULL;
>         data->len = 0;
>         return data->buf;
>
> I.e. you don't need the condition for setting "*len": if data->len is
> 0, then *len is also 0. You just want to return NULL afterwards, and
> not set data->len to 0 (harmless, but no need) or return data->buf.

Will apply.

> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +             .size = len,
> > +     };
>
> Maybe it's that I'm unused to it, but I find this a bit more readable:
>
>         @@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>          {
>                 char hdr[MAX_HEADER_LEN];
>                 int hdrlen = sizeof(hdr);
>         +       struct simple_input_stream_data tmp = {
>         +               .buf = buf,
>         +               .len = len,
>         +       };
>                 struct input_stream in_stream = {
>                         .read = feed_simple_input_stream,
>         -               .data = (void *)&(struct simple_input_stream_data) {
>         -                       .buf = buf,
>         -                       .len = len,
>         -               },
>         +               .data = (void *)&tmp,
>                         .size = len,
>                 };
>
> Yes there's a temporary variable, but no denser inline casting. Also
> easier to step through in a debugger (which will have the type
> information on "tmp").
>

Will apply.

> >  int hash_object_file_literally(const void *buf, unsigned long len,
> > @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >  {
> >       char *header;
> >       int hdrlen, status = 0;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +             .size = len,
> > +     };
>
> ditto..
>
> >       /* type string, SP, %lu of the length plus NUL must fit this */
> >       hdrlen = strlen(type) + MAX_HEADER_LEN;
> > @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >               goto cleanup;
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               goto cleanup;
> > -     status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> > +     status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> >
> >  cleanup:
> >       free(header);
> > @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >       int ret;
> > +     struct simple_input_stream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = &data,
> > +     };
> >
> >       if (has_loose_object(oid))
> >               return 0;
> >       buf = read_object(the_repository, oid, &type, &len);
> > +     in_stream.size = len;
>
> Why are we setting this here?...
>

Yes, putting "in_stream.size=len;" here was a stupid decision.

> >       if (!buf)
> >               return error(_("cannot read object for %s"), oid_to_hex(oid));
>
> ...Instead of after this point, as we may error and never use it?
>
> > +     data.buf = buf;
> > +     data.len = len;
>
> Probably won't matter, just a nit...
>
> > +struct input_stream {
> > +     const void *(*read)(struct input_stream *, unsigned long *len);
> > +     void *data;
> > +     size_t size;
> > +};
> > +
>
> Ah, and here's the size_t... :)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  2:51       ` Han Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-12-06  2:51 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:27 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When streaming a large blob object to "write_loose_object()", we have no
> > chance to run "write_object_file_prepare()" to calculate the oid in
> > advance. So we need to handle undetermined oid in function
> > "write_loose_object()".
> >
> > In the original implementation, we know the oid and we can write the
> > temporary file in the same directory as the final object, but for an
> > object with an undetermined oid, we don't know the exact directory for
> > the object, so we have to save the temporary file in ".git/objects/"
> > directory instead.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 30 ++++++++++++++++++++++++++++--
> >  1 file changed, 28 insertions(+), 2 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 82656f7428..1c41587bfb 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       const void *buf;
> >       unsigned long len;
> >
> > -     loose_object_path(the_repository, &filename, oid);
> > +     if (is_null_oid(oid)) {
> > +             /* When oid is not determined, save tmp file to odb path. */
> > +             strbuf_reset(&filename);
> > +             strbuf_addstr(&filename, the_repository->objects->odb->path);
> > +             strbuf_addch(&filename, '/');
> > +     } else {
> > +             loose_object_path(the_repository, &filename, oid);
> > +     }
> >
> >       fd = create_tmpfile(&tmp_file, filename.buf);
> >       if (fd < 0) {
> > @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >               die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> >                   ret);
> >       the_hash_algo->final_oid_fn(&parano_oid, &c);
> > -     if (!oideq(oid, &parano_oid))
> > +     if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
> >               die(_("confused by unstable object source data for %s"),
> >                   oid_to_hex(oid));
> >
> >       close_loose_object(fd);
> >
> > +     if (is_null_oid(oid)) {
> > +             int dirlen;
> > +
> > +             oidcpy((struct object_id *)oid, &parano_oid);
> > +             loose_object_path(the_repository, &filename, oid);
>
> Why are we breaking the promise that "oid" is constant here? I tested
> locally with the below on top, and it seems to work (at least no tests
> broke). Isn't it preferable to the cast & the caller having its "oid"
> changed?
>
> diff --git a/object-file.c b/object-file.c
> index 71d510614b9..d014e6942ea 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1958,10 +1958,11 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>         close_loose_object(fd);
>
>         if (is_null_oid(oid)) {
> +               struct object_id oid2;
>                 int dirlen;
>
> -               oidcpy((struct object_id *)oid, &parano_oid);
> -               loose_object_path(the_repository, &filename, oid);
> +               oidcpy(&oid2, &parano_oid);
> +               loose_object_path(the_repository, &filename, &oid2);
>
>                 /* We finally know the object path, and create the missing dir. */
>                 dirlen = directory_size(filename.buf);

Maybe I should change the promise that "oid" is constant in
"write_loose_object()".

The original write_object_file_flags() defines a local "oid", computes
it in "write_object_file_prepare()", and then passes it to
"write_loose_object()".

If the oid is still null after "write_loose_object()" returns,
"--strict" becomes meaningless, even though it does not break any
existing test cases.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  3:12       ` Han Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-12-06  3:12 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:54 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When streaming a large blob object to "write_loose_object()", we have no
> > chance to run "write_object_file_prepare()" to calculate the oid in
> > advance. So we need to handle undetermined oid in function
> > "write_loose_object()".
> >
> > In the original implementation, we know the oid and we can write the
> > temporary file in the same directory as the final object, but for an
> > object with an undetermined oid, we don't know the exact directory for
> > the object, so we have to save the temporary file in ".git/objects/"
> > directory instead.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 30 ++++++++++++++++++++++++++++--
> >  1 file changed, 28 insertions(+), 2 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 82656f7428..1c41587bfb 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       const void *buf;
> >       unsigned long len;
> >
> > -     loose_object_path(the_repository, &filename, oid);
> > +     if (is_null_oid(oid)) {
> > +             /* When oid is not determined, save tmp file to odb path. */
> > +             strbuf_reset(&filename);
>
> Why re-use this & leak memory? An existing strbuf use in this function
> doesn't leak in the same way. Just release it as in the below patch on
> top (the "ret" vs. "err" variable naming is a bit confused; maybe it
> could do with a prep cleanup step).
>
> > +             strbuf_addstr(&filename, the_repository->objects->odb->path);
> > +             strbuf_addch(&filename, '/');
>
> And once we do that this could just become:
>
>         strbuf_addf(&filename, "%s/", ...)
>
> There are existing uses of this pattern, so maybe not worth it, but it
> allows you to remove the braces on the if/else.
>
> diff --git a/object-file.c b/object-file.c
> index 8bd89e7b7ba..2b52f3fc1cc 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1880,7 +1880,7 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                        int hdrlen, struct input_stream *in_stream,
>                        time_t mtime, unsigned flags)
>  {
> -       int fd, ret;
> +       int fd, ret, err = 0;
>         unsigned char compressed[4096];
>         git_zstream stream;
>         git_hash_ctx c;
> @@ -1892,7 +1892,6 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>
>         if (is_null_oid(oid)) {
>                 /* When oid is not determined, save tmp file to odb path. */
> -               strbuf_reset(&filename);
>                 strbuf_addstr(&filename, the_repository->objects->odb->path);
>                 strbuf_addch(&filename, '/');
>         } else {
> @@ -1902,11 +1901,12 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>         fd = create_tmpfile(&tmp_file, filename.buf);
>         if (fd < 0) {
>                 if (flags & HASH_SILENT)
> -                       return -1;
> +                       err = -1;
>                 else if (errno == EACCES)
> -                       return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +                       err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
>                 else
> -                       return error_errno(_("unable to create temporary file"));
> +                       err = error_errno(_("unable to create temporary file"));
> +               goto cleanup;
>         }
>
>         /* Set it up */
> @@ -1968,10 +1968,13 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                         struct strbuf dir = STRBUF_INIT;
>                         strbuf_add(&dir, filename.buf, dirlen - 1);
>                         if (mkdir(dir.buf, 0777) && errno != EEXIST)
> -                               return -1;
> -                       if (adjust_shared_perm(dir.buf))
> -                               return -1;
> -                       strbuf_release(&dir);
> +                               err = -1;
> +                       else if (adjust_shared_perm(dir.buf))
> +                               err = -1;
> +                       else
> +                               strbuf_release(&dir);
> +                       if (err < 0)
> +                               goto cleanup;
>                 }
>         }
>
> @@ -1984,7 +1987,10 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                         warning_errno(_("failed utime() on %s"), tmp_file.buf);
>         }
>
> -       return finalize_object_file(tmp_file.buf, filename.buf);
> +       err = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +       strbuf_release(&filename);
> +       return err;
>  }
>
>  static int freshen_loose_object(const struct object_id *oid)

Yes, this will be much better. Will apply.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  3:20       ` Han Xin
  0 siblings, 0 replies; 57+ messages in thread
From: Han Xin @ 2021-12-06  3:20 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 10:00 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > +     unsigned long bufsize = dry_run ? 4096 : size;
> > +     void *buf = xmallocz(bufsize);
>
> It's probably nothing, but in your CL you note that you changed another
> hardcoding from 4k to 8k, should this one still be 4k?
>
> It's probably fine, just wondering...

Yes, I think this is an omission from my work.

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2021-12-06  3:22 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
2021-10-19  7:37 ` Han Xin
2021-10-20 14:42 ` Philip Oakley
2021-10-21  3:42   ` Han Xin
2021-10-21 22:47     ` Philip Oakley
2021-11-03  1:48 ` Han Xin
2021-11-03 10:07   ` Philip Oakley
2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
2021-11-18  4:59   ` Jiang Xin
2021-11-18  6:45     ` Junio C Hamano
2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
2021-11-18  5:42   ` Jiang Xin
2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
2021-11-18  5:49   ` Jiang Xin
2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
2021-11-18  5:56   ` Jiang Xin
2021-11-12  9:40 ` [PATCH v2 5/6] object-store.h: add write_loose_object() Han Xin
2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
2021-11-18  7:14   ` Jiang Xin
2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
2021-11-29  7:01   ` Han Xin
2021-11-29 19:12     ` Jeff King
2021-11-30  2:57       ` Han Xin
2021-12-03  9:35   ` [PATCH v4 " Han Xin
2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
2021-12-06  2:07       ` Han Xin
2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
2021-12-06  2:51       ` Han Xin
2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
2021-12-06  3:12       ` Han Xin
2021-12-03  9:35   ` [PATCH v4 3/5] object-file.c: read stream in a loop " Han Xin
2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
2021-12-06  3:20       ` Han Xin
2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
2021-11-23 23:24   ` Junio C Hamano
2021-11-24  9:00     ` Han Xin
2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
2021-11-29 15:10   ` Derrick Stolee
2021-11-29 20:44     ` Junio C Hamano
2021-11-29 22:18       ` Derrick Stolee
2021-11-30  3:23         ` Han Xin
2021-11-22  3:32 ` [PATCH v3 3/5] object-file.c: read stream in a loop " Han Xin
2021-11-22  3:32 ` [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-11-29 17:37   ` Derrick Stolee
2021-11-30 13:49     ` Han Xin
2021-11-30 18:38       ` Derrick Stolee
2021-12-01 20:37         ` "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...]) Ævar Arnfjörð Bjarmason
2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-12-02 13:53           ` Derrick Stolee
