git@vger.kernel.org list mirror (unofficial, one of many)
 help / color / mirror / code / Atom feed
* [PATCH] unpack-objects: unpack large object in stream
@ 2021-10-09  8:20 Han Xin
  2021-10-19  7:37 ` Han Xin
                   ` (14 more replies)
  0 siblings, 15 replies; 165+ messages in thread
From: Han Xin @ 2021-10-09  8:20 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When calling "unpack_non_delta_entry()", the program will allocate memory for
the whole size of the unpacked object and write the buffer to a loose file
on disk. This may lead to OOM for the git-unpack-objects process when
unpacking a very large object.

In function "unpack_delta_entry()", the program will also allocate memory to
buffer the whole delta, but since there will be no delta for an object
larger than "core.bigFileThreshold", this issue is moderate.

To resolve the OOM issue in "git-unpack-objects", we can unpack a large
object to a file as a stream, and use the setting of "core.bigFileThreshold"
as the threshold for large objects.

Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c          |  41 +++++++-
 object-file.c                     | 149 +++++++++++++++++++++++++++---
 object-store.h                    |   9 ++
 t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
 4 files changed, 279 insertions(+), 12 deletions(-)
 create mode 100755 t/t5590-receive-unpack-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8ac77e60a8 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+static void fill_stream(struct git_zstream *stream)
+{
+	stream->next_in = fill(1);
+	stream->avail_in = len;
+}
+
+static void use_stream(struct git_zstream *stream)
+{
+	use(len - stream->avail_in);
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	struct git_zstream_reader reader;
+	struct object_id *oid = &obj_list[nr].oid;
+
+	reader.fill = &fill_stream;
+	reader.use = &use_stream;
+
+	if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
+				     oid, dry_run))
+		die("failed to write object in stream");
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index a8be899481..06c1693675 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
+			       int fd, unsigned char *compressed,
+			       int compressed_len, const void *buf,
+			       size_t len, int flush)
+{
+	int ret;
+
+	stream->next_in = (void *)buf;
+	stream->avail_in = len;
+	do {
+		unsigned char *in0 = stream->next_in;
+		ret = git_deflate(stream, flush);
+		the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+		if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream->next_out = compressed;
+		stream->avail_out = compressed_len;
+	} while (ret == Z_OK);
+
+	return ret;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime)
@@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
-	do {
-		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
-	} while (ret == Z_OK);
+	ret = write_object_buffer(&stream, &c, fd, compressed,
+				  sizeof(compressed), buf, len,
+				  Z_FINISH);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
@@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
 	return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
 }
 
+int write_stream_object_file(struct git_zstream_reader *reader,
+			     unsigned long len, const char *type,
+			     struct object_id *oid,
+			     int dry_run)
+{
+	git_zstream istream, ostream;
+	unsigned char buf[8192], compressed[4096];
+	char hdr[MAX_HEADER_LEN];
+	int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
+	int ret = 0;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+
+	/* Write tmpfile in objects dir, because oid is unknown */
+	if (!dry_run) {
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+		fd = create_tmpfile(&tmp_file, filename.buf);
+		if (fd < 0) {
+			if (errno == EACCES)
+				ret = error(_("insufficient permission for adding an object to repository database %s"),
+					get_object_directory());
+			else
+				ret = error_errno(_("unable to create temporary file"));
+			goto cleanup;
+		}
+	}
+
+	memset(&istream, 0, sizeof(istream));
+	istream.next_out = buf;
+	istream.avail_out = sizeof(buf);
+	git_inflate_init(&istream);
+
+	if (!dry_run) {
+		/* Set it up */
+		git_deflate_init(&ostream, zlib_compression_level);
+		ostream.next_out = compressed;
+		ostream.avail_out = sizeof(compressed);
+		the_hash_algo->init_fn(&c);
+
+		/* First header */
+		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
+				(uintmax_t)len) + 1;
+		ostream.next_in = (unsigned char *)hdr;
+		ostream.avail_in = hdrlen;
+		while (git_deflate(&ostream, 0) == Z_OK)
+			; /* nothing */
+		the_hash_algo->update_fn(&c, hdr, hdrlen);
+	}
+
+	/* Then the data itself */
+	do {
+		unsigned char *last_out = istream.next_out;
+		reader->fill(&istream);
+		istatus = git_inflate(&istream, 0);
+		if (istatus == Z_STREAM_END)
+			flush = Z_FINISH;
+		reader->use(&istream);
+		if (!dry_run)
+			ostatus = write_object_buffer(&ostream, &c, fd, compressed,
+						      sizeof(compressed), last_out,
+						      istream.next_out - last_out,
+						      flush);
+		istream.next_out = buf;
+		istream.avail_out = sizeof(buf);
+	} while (istatus == Z_OK);
+
+	if (istream.total_out != len || istatus != Z_STREAM_END)
+		die( _("inflate returned %d"), istatus);
+	git_inflate_end(&istream);
+
+	if (dry_run)
+		goto cleanup;
+
+	if (ostatus != Z_STREAM_END)
+		die(_("unable to deflate new object (%d)"), ostatus);
+	ostatus = git_deflate_end_gently(&ostream);
+	if (ostatus != Z_OK)
+		die(_("deflateEnd on object failed (%d)"), ostatus);
+	the_hash_algo->final_fn(oid->hash, &c);
+	close_loose_object(fd);
+
+	/* We get the oid now */
+	loose_object_path(the_repository, &filename, oid);
+
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		/*
+		 * Make sure the directory exists; note that the contents
+		 * of the buffer are undefined after mkstemp returns an
+		 * error, so we have to rewrite the whole buffer from
+		 * scratch.
+		 */
+		strbuf_add(&dir, filename.buf, dirlen - 1);
+		if (mkdir(dir.buf, 0777) && errno != EEXIST) {
+			unlink_or_warn(tmp_file.buf);
+			strbuf_release(&dir);
+			ret = -1;
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	ret = finalize_object_file(tmp_file.buf, filename.buf);
+
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return ret;
+}
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags)
diff --git a/object-store.h b/object-store.h
index d24915ced1..12b113ef93 100644
--- a/object-store.h
+++ b/object-store.h
@@ -33,6 +33,11 @@ struct object_directory {
 	char *path;
 };
 
+struct git_zstream_reader {
+	void (*fill)(struct git_zstream *);
+	void (*use)(struct git_zstream *);
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 int write_object_file(const void *buf, unsigned long len,
 		      const char *type, struct object_id *oid);
 
+int write_stream_object_file(struct git_zstream_reader *reader,
+			     unsigned long len, const char *type,
+			     struct object_id *oid, int dry_run);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
new file mode 100755
index 0000000000..7e63dfc0db
--- /dev/null
+++ b/t/t5590-receive-unpack-objects.sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	git repack -ad
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to push: cannot allocate' '
+	test_must_fail git push dest.git HEAD 2>err &&
+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git push dest.git HEAD &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	PACK=$(echo main | git pack-objects --progress --revs test) &&
+	unset GIT_ALLOC_LIMIT &&
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run with large threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 2m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_expect_success 'unpack-objects dry-run with small threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 1m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.33.0.1.g09a6bb964f.dirty


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
@ 2021-10-19  7:37 ` Han Xin
  2021-10-20 14:42 ` Philip Oakley
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-10-19  7:37 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

Any suggestions?

Han Xin <chiyutianyi@gmail.com> 于2021年10月9日周六 下午4:21写道:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", will allocate full memory for
> the whole size of the unpacked object and write the buffer to loose file
> on disk. This may lead to OOM for the git-unpack-objects process when
> unpacking a very large object.
>
> In function "unpack_delta_entry()", will also allocate full memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack large
> object to file in stream, and use the setting of "core.bigFileThreshold" as
> the threshold for large object.
>
> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          |  41 +++++++-
>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>  object-store.h                    |   9 ++
>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>  4 files changed, 279 insertions(+), 12 deletions(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..8ac77e60a8 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>         }
>  }
>
> +static void fill_stream(struct git_zstream *stream)
> +{
> +       stream->next_in = fill(1);
> +       stream->avail_in = len;
> +}
> +
> +static void use_stream(struct git_zstream *stream)
> +{
> +       use(len - stream->avail_in);
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +       struct git_zstream_reader reader;
> +       struct object_id *oid = &obj_list[nr].oid;
> +
> +       reader.fill = &fill_stream;
> +       reader.use = &use_stream;
> +
> +       if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
> +                                    oid, dry_run))
> +               die("failed to write object in stream");
> +       if (strict && !dry_run) {
> +               struct blob *blob = lookup_blob(the_repository, oid);
> +               if (blob)
> +                       blob->object.flags |= FLAG_WRITTEN;
> +               else
> +                       die("invalid blob object from stream");
> +       }
> +       obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>                                    unsigned nr)
>  {
> -       void *buf = get_data(size);
> +       void *buf;
> +
> +       /* Write large blob in stream without allocating full buffer. */
> +       if (type == OBJ_BLOB && size > big_file_threshold) {
> +               write_stream_blob(nr, size);
> +               return;
> +       }
>
> +       buf = get_data(size);
>         if (!dry_run && buf)
>                 write_object(nr, type, buf, size);
>         else
> diff --git a/object-file.c b/object-file.c
> index a8be899481..06c1693675 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>         return fd;
>  }
>
> +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
> +                              int fd, unsigned char *compressed,
> +                              int compressed_len, const void *buf,
> +                              size_t len, int flush)
> +{
> +       int ret;
> +
> +       stream->next_in = (void *)buf;
> +       stream->avail_in = len;
> +       do {
> +               unsigned char *in0 = stream->next_in;
> +               ret = git_deflate(stream, flush);
> +               the_hash_algo->update_fn(c, in0, stream->next_in - in0);
> +               if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
> +                       die(_("unable to write loose object file"));
> +               stream->next_out = compressed;
> +               stream->avail_out = compressed_len;
> +       } while (ret == Z_OK);
> +
> +       return ret;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>                               int hdrlen, const void *buf, unsigned long len,
>                               time_t mtime)
> @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> -       stream.next_in = (void *)buf;
> -       stream.avail_in = len;
> -       do {
> -               unsigned char *in0 = stream.next_in;
> -               ret = git_deflate(&stream, Z_FINISH);
> -               the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> -                       die(_("unable to write loose object file"));
> -               stream.next_out = compressed;
> -               stream.avail_out = sizeof(compressed);
> -       } while (ret == Z_OK);
> +       ret = write_object_buffer(&stream, &c, fd, compressed,
> +                                 sizeof(compressed), buf, len,
> +                                 Z_FINISH);
>
>         if (ret != Z_STREAM_END)
>                 die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
>         return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
>  }
>
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +                            unsigned long len, const char *type,
> +                            struct object_id *oid,
> +                            int dry_run)
> +{
> +       git_zstream istream, ostream;
> +       unsigned char buf[8192], compressed[4096];
> +       char hdr[MAX_HEADER_LEN];
> +       int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
> +       int ret = 0;
> +       git_hash_ctx c;
> +       struct strbuf tmp_file = STRBUF_INIT;
> +       struct strbuf filename = STRBUF_INIT;
> +
> +       /* Write tmpfile in objects dir, because oid is unknown */
> +       if (!dry_run) {
> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
> +               strbuf_addch(&filename, '/');
> +               fd = create_tmpfile(&tmp_file, filename.buf);
> +               if (fd < 0) {
> +                       if (errno == EACCES)
> +                               ret = error(_("insufficient permission for adding an object to repository database %s"),
> +                                       get_object_directory());
> +                       else
> +                               ret = error_errno(_("unable to create temporary file"));
> +                       goto cleanup;
> +               }
> +       }
> +
> +       memset(&istream, 0, sizeof(istream));
> +       istream.next_out = buf;
> +       istream.avail_out = sizeof(buf);
> +       git_inflate_init(&istream);
> +
> +       if (!dry_run) {
> +               /* Set it up */
> +               git_deflate_init(&ostream, zlib_compression_level);
> +               ostream.next_out = compressed;
> +               ostream.avail_out = sizeof(compressed);
> +               the_hash_algo->init_fn(&c);
> +
> +               /* First header */
> +               hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
> +                               (uintmax_t)len) + 1;
> +               ostream.next_in = (unsigned char *)hdr;
> +               ostream.avail_in = hdrlen;
> +               while (git_deflate(&ostream, 0) == Z_OK)
> +                       ; /* nothing */
> +               the_hash_algo->update_fn(&c, hdr, hdrlen);
> +       }
> +
> +       /* Then the data itself */
> +       do {
> +               unsigned char *last_out = istream.next_out;
> +               reader->fill(&istream);
> +               istatus = git_inflate(&istream, 0);
> +               if (istatus == Z_STREAM_END)
> +                       flush = Z_FINISH;
> +               reader->use(&istream);
> +               if (!dry_run)
> +                       ostatus = write_object_buffer(&ostream, &c, fd, compressed,
> +                                                     sizeof(compressed), last_out,
> +                                                     istream.next_out - last_out,
> +                                                     flush);
> +               istream.next_out = buf;
> +               istream.avail_out = sizeof(buf);
> +       } while (istatus == Z_OK);
> +
> +       if (istream.total_out != len || istatus != Z_STREAM_END)
> +               die( _("inflate returned %d"), istatus);
> +       git_inflate_end(&istream);
> +
> +       if (dry_run)
> +               goto cleanup;
> +
> +       if (ostatus != Z_STREAM_END)
> +               die(_("unable to deflate new object (%d)"), ostatus);
> +       ostatus = git_deflate_end_gently(&ostream);
> +       if (ostatus != Z_OK)
> +               die(_("deflateEnd on object failed (%d)"), ostatus);
> +       the_hash_algo->final_fn(oid->hash, &c);
> +       close_loose_object(fd);
> +
> +       /* We get the oid now */
> +       loose_object_path(the_repository, &filename, oid);
> +
> +       dirlen = directory_size(filename.buf);
> +       if (dirlen) {
> +               struct strbuf dir = STRBUF_INIT;
> +               /*
> +                * Make sure the directory exists; note that the contents
> +                * of the buffer are undefined after mkstemp returns an
> +                * error, so we have to rewrite the whole buffer from
> +                * scratch.
> +                */
> +               strbuf_add(&dir, filename.buf, dirlen - 1);
> +               if (mkdir(dir.buf, 0777) && errno != EEXIST) {
> +                       unlink_or_warn(tmp_file.buf);
> +                       strbuf_release(&dir);
> +                       ret = -1;
> +                       goto cleanup;
> +               }
> +               strbuf_release(&dir);
> +       }
> +
> +       ret = finalize_object_file(tmp_file.buf, filename.buf);
> +
> +cleanup:
> +       strbuf_release(&tmp_file);
> +       strbuf_release(&filename);
> +       return ret;
> +}
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>                                const char *type, struct object_id *oid,
>                                unsigned flags)
> diff --git a/object-store.h b/object-store.h
> index d24915ced1..12b113ef93 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -33,6 +33,11 @@ struct object_directory {
>         char *path;
>  };
>
> +struct git_zstream_reader {
> +       void (*fill)(struct git_zstream *);
> +       void (*use)(struct git_zstream *);
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>         struct object_directory *, 1, fspathhash, fspatheq)
>
> @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
>  int write_object_file(const void *buf, unsigned long len,
>                       const char *type, struct object_id *oid);
>
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +                            unsigned long len, const char *type,
> +                            struct object_id *oid, int dry_run);
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>                                const char *type, struct object_id *oid,
>                                unsigned flags);
> diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
> new file mode 100755
> index 0000000000..7e63dfc0db
> --- /dev/null
> +++ b/t/t5590-receive-unpack-objects.sh
> @@ -0,0 +1,92 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects when receive pack'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +test_expect_success "create commit with big blobs (1.5 MB)" '
> +       test-tool genrandom foo 1500000 >big-blob &&
> +       test_commit --append foo big-blob &&
> +       test-tool genrandom bar 1500000 >big-blob &&
> +       test_commit --append bar big-blob &&
> +       (
> +               cd .git &&
> +               find objects/?? -type f | sort
> +       ) >expect &&
> +       git repack -ad
> +'
> +
> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> +       GIT_ALLOC_LIMIT=1m &&
> +       export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'prepare dest repository' '
> +       git init --bare dest.git &&
> +       git -C dest.git config core.bigFileThreshold 2m &&
> +       git -C dest.git config receive.unpacklimit 100
> +'
> +
> +test_expect_success 'fail to push: cannot allocate' '
> +       test_must_fail git push dest.git HEAD 2>err &&
> +       test_i18ngrep "remote: fatal: attempting to allocate" err &&
> +       (
> +               cd dest.git &&
> +               find objects/?? -type f | sort
> +       ) >actual &&
> +       ! test_cmp expect actual
> +'
> +
> +test_expect_success 'set a lower bigfile threshold' '
> +       git -C dest.git config core.bigFileThreshold 1m
> +'
> +
> +test_expect_success 'unpack big object in stream' '
> +       git push dest.git HEAD &&
> +       git -C dest.git fsck &&
> +       (
> +               cd dest.git &&
> +               find objects/?? -type f | sort
> +       ) >actual &&
> +       test_cmp expect actual
> +'
> +
> +test_expect_success 'setup for unpack-objects dry-run test' '
> +       PACK=$(echo main | git pack-objects --progress --revs test) &&
> +       unset GIT_ALLOC_LIMIT &&
> +       git init --bare unpack-test.git
> +'
> +
> +test_expect_success 'unpack-objects dry-run with large threshold' '
> +       (
> +               cd unpack-test.git &&
> +               git config core.bigFileThreshold 2m &&
> +               git unpack-objects -n <../test-$PACK.pack
> +       ) &&
> +       (
> +               cd unpack-test.git &&
> +               find objects/ -type f
> +       ) >actual &&
> +       test_must_be_empty actual
> +'
> +
> +test_expect_success 'unpack-objects dry-run with small threshold' '
> +       (
> +               cd unpack-test.git &&
> +               git config core.bigFileThreshold 1m &&
> +               git unpack-objects -n <../test-$PACK.pack
> +       ) &&
> +       (
> +               cd unpack-test.git &&
> +               find objects/ -type f
> +       ) >actual &&
> +       test_must_be_empty actual
> +'
> +
> +test_done
> --
> 2.33.0.1.g09a6bb964f.dirty
>

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
  2021-10-19  7:37 ` Han Xin
@ 2021-10-20 14:42 ` Philip Oakley
  2021-10-21  3:42   ` Han Xin
  2021-11-03  1:48 ` Han Xin
                   ` (12 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Philip Oakley @ 2021-10-20 14:42 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

On 09/10/2021 09:20, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", will allocate full memory for
> the whole size of the unpacked object and write the buffer to loose file
> on disk. This may lead to OOM for the git-unpack-objects process when
> unpacking a very large object.
>
> In function "unpack_delta_entry()", will also allocate full memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack large
> object to file in stream, and use the setting of "core.bigFileThreshold" as
> the threshold for large object.
>
> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          |  41 +++++++-
>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>  object-store.h                    |   9 ++
>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>  4 files changed, 279 insertions(+), 12 deletions(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..8ac77e60a8 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>  	}
>  }
>  
> +static void fill_stream(struct git_zstream *stream)
> +{
> +	stream->next_in = fill(1);
> +	stream->avail_in = len;
> +}
> +
> +static void use_stream(struct git_zstream *stream)
> +{
> +	use(len - stream->avail_in);
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)

Can we use size_t for the `size`, and possibly `nr`, to improve
compatibility with Windows systems where unsigned long is only 32 bits?

There has been some work in the past on providing large file support on
Windows, which requires numerous long -> size_t changes.

Philip
> +{
> +	struct git_zstream_reader reader;
> +	struct object_id *oid = &obj_list[nr].oid;
> +
> +	reader.fill = &fill_stream;
> +	reader.use = &use_stream;
> +
> +	if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
> +				     oid, dry_run))
> +		die("failed to write object in stream");
> +	if (strict && !dry_run) {
> +		struct blob *blob = lookup_blob(the_repository, oid);
> +		if (blob)
> +			blob->object.flags |= FLAG_WRITTEN;
> +		else
> +			die("invalid blob object from stream");
> +	}
> +	obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>  				   unsigned nr)
>  {
> -	void *buf = get_data(size);
> +	void *buf;
> +
> +	/* Write large blob in stream without allocating full buffer. */
> +	if (type == OBJ_BLOB && size > big_file_threshold) {
> +		write_stream_blob(nr, size);
> +		return;
> +	}
>  
> +	buf = get_data(size);
>  	if (!dry_run && buf)
>  		write_object(nr, type, buf, size);
>  	else
> diff --git a/object-file.c b/object-file.c
> index a8be899481..06c1693675 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
> +			       int fd, unsigned char *compressed,
> +			       int compressed_len, const void *buf,
> +			       size_t len, int flush)
> +{
> +	int ret;
> +
> +	stream->next_in = (void *)buf;
> +	stream->avail_in = len;
> +	do {
> +		unsigned char *in0 = stream->next_in;
> +		ret = git_deflate(stream, flush);
> +		the_hash_algo->update_fn(c, in0, stream->next_in - in0);
> +		if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
> +			die(_("unable to write loose object file"));
> +		stream->next_out = compressed;
> +		stream->avail_out = compressed_len;
> +	} while (ret == Z_OK);
> +
> +	return ret;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>  			      int hdrlen, const void *buf, unsigned long len,
>  			      time_t mtime)
> @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	the_hash_algo->update_fn(&c, hdr, hdrlen);
>  
>  	/* Then the data itself.. */
> -	stream.next_in = (void *)buf;
> -	stream.avail_in = len;
> -	do {
> -		unsigned char *in0 = stream.next_in;
> -		ret = git_deflate(&stream, Z_FINISH);
> -		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> -		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> -			die(_("unable to write loose object file"));
> -		stream.next_out = compressed;
> -		stream.avail_out = sizeof(compressed);
> -	} while (ret == Z_OK);
> +	ret = write_object_buffer(&stream, &c, fd, compressed,
> +				  sizeof(compressed), buf, len,
> +				  Z_FINISH);
>  
>  	if (ret != Z_STREAM_END)
>  		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
>  	return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
>  }
>  
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +			     unsigned long len, const char *type,
> +			     struct object_id *oid,
> +			     int dry_run)
> +{
> +	git_zstream istream, ostream;
> +	unsigned char buf[8192], compressed[4096];
> +	char hdr[MAX_HEADER_LEN];
> +	int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
> +	int ret = 0;
> +	git_hash_ctx c;
> +	struct strbuf tmp_file = STRBUF_INIT;
> +	struct strbuf filename = STRBUF_INIT;
> +
> +	/* Write tmpfile in objects dir, because oid is unknown */
> +	if (!dry_run) {
> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');
> +		fd = create_tmpfile(&tmp_file, filename.buf);
> +		if (fd < 0) {
> +			if (errno == EACCES)
> +				ret = error(_("insufficient permission for adding an object to repository database %s"),
> +					get_object_directory());
> +			else
> +				ret = error_errno(_("unable to create temporary file"));
> +			goto cleanup;
> +		}
> +	}
> +
> +	memset(&istream, 0, sizeof(istream));
> +	istream.next_out = buf;
> +	istream.avail_out = sizeof(buf);
> +	git_inflate_init(&istream);
> +
> +	if (!dry_run) {
> +		/* Set it up */
> +		git_deflate_init(&ostream, zlib_compression_level);
> +		ostream.next_out = compressed;
> +		ostream.avail_out = sizeof(compressed);
> +		the_hash_algo->init_fn(&c);
> +
> +		/* First header */
> +		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
> +				(uintmax_t)len) + 1;
> +		ostream.next_in = (unsigned char *)hdr;
> +		ostream.avail_in = hdrlen;
> +		while (git_deflate(&ostream, 0) == Z_OK)
> +			; /* nothing */
> +		the_hash_algo->update_fn(&c, hdr, hdrlen);
> +	}
> +
> +	/* Then the data itself */
> +	do {
> +		unsigned char *last_out = istream.next_out;
> +		reader->fill(&istream);
> +		istatus = git_inflate(&istream, 0);
> +		if (istatus == Z_STREAM_END)
> +			flush = Z_FINISH;
> +		reader->use(&istream);
> +		if (!dry_run)
> +			ostatus = write_object_buffer(&ostream, &c, fd, compressed,
> +						      sizeof(compressed), last_out,
> +						      istream.next_out - last_out,
> +						      flush);
> +		istream.next_out = buf;
> +		istream.avail_out = sizeof(buf);
> +	} while (istatus == Z_OK);
> +
> +	if (istream.total_out != len || istatus != Z_STREAM_END)
> +		die( _("inflate returned %d"), istatus);
> +	git_inflate_end(&istream);
> +
> +	if (dry_run)
> +		goto cleanup;
> +
> +	if (ostatus != Z_STREAM_END)
> +		die(_("unable to deflate new object (%d)"), ostatus);
> +	ostatus = git_deflate_end_gently(&ostream);
> +	if (ostatus != Z_OK)
> +		die(_("deflateEnd on object failed (%d)"), ostatus);
> +	the_hash_algo->final_fn(oid->hash, &c);
> +	close_loose_object(fd);
> +
> +	/* We get the oid now */
> +	loose_object_path(the_repository, &filename, oid);
> +
> +	dirlen = directory_size(filename.buf);
> +	if (dirlen) {
> +		struct strbuf dir = STRBUF_INIT;
> +		/*
> +		 * Make sure the directory exists; note that the contents
> +		 * of the buffer are undefined after mkstemp returns an
> +		 * error, so we have to rewrite the whole buffer from
> +		 * scratch.
> +		 */
> +		strbuf_add(&dir, filename.buf, dirlen - 1);
> +		if (mkdir(dir.buf, 0777) && errno != EEXIST) {
> +			unlink_or_warn(tmp_file.buf);
> +			strbuf_release(&dir);
> +			ret = -1;
> +			goto cleanup;
> +		}
> +		strbuf_release(&dir);
> +	}
> +
> +	ret = finalize_object_file(tmp_file.buf, filename.buf);
> +
> +cleanup:
> +	strbuf_release(&tmp_file);
> +	strbuf_release(&filename);
> +	return ret;
> +}
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>  			       const char *type, struct object_id *oid,
>  			       unsigned flags)
> diff --git a/object-store.h b/object-store.h
> index d24915ced1..12b113ef93 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -33,6 +33,11 @@ struct object_directory {
>  	char *path;
>  };
>  
> +struct git_zstream_reader {
> +	void (*fill)(struct git_zstream *);
> +	void (*use)(struct git_zstream *);
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>  	struct object_directory *, 1, fspathhash, fspatheq)
>  
> @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
>  int write_object_file(const void *buf, unsigned long len,
>  		      const char *type, struct object_id *oid);
>  
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +			     unsigned long len, const char *type,
> +			     struct object_id *oid, int dry_run);
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>  			       const char *type, struct object_id *oid,
>  			       unsigned flags);
> diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
> new file mode 100755
> index 0000000000..7e63dfc0db
> --- /dev/null
> +++ b/t/t5590-receive-unpack-objects.sh
> @@ -0,0 +1,92 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects when receive pack'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +test_expect_success "create commit with big blobs (1.5 MB)" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	(
> +		cd .git &&
> +		find objects/?? -type f | sort
> +	) >expect &&
> +	git repack -ad
> +'
> +
> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'prepare dest repository' '
> +	git init --bare dest.git &&
> +	git -C dest.git config core.bigFileThreshold 2m &&
> +	git -C dest.git config receive.unpacklimit 100
> +'
> +
> +test_expect_success 'fail to push: cannot allocate' '
> +	test_must_fail git push dest.git HEAD 2>err &&
> +	test_i18ngrep "remote: fatal: attempting to allocate" err &&
> +	(
> +		cd dest.git &&
> +		find objects/?? -type f | sort
> +	) >actual &&
> +	! test_cmp expect actual
> +'
> +
> +test_expect_success 'set a lower bigfile threshold' '
> +	git -C dest.git config core.bigFileThreshold 1m
> +'
> +
> +test_expect_success 'unpack big object in stream' '
> +	git push dest.git HEAD &&
> +	git -C dest.git fsck &&
> +	(
> +		cd dest.git &&
> +		find objects/?? -type f | sort
> +	) >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'setup for unpack-objects dry-run test' '
> +	PACK=$(echo main | git pack-objects --progress --revs test) &&
> +	unset GIT_ALLOC_LIMIT &&
> +	git init --bare unpack-test.git
> +'
> +
> +test_expect_success 'unpack-objects dry-run with large threshold' '
> +	(
> +		cd unpack-test.git &&
> +		git config core.bigFileThreshold 2m &&
> +		git unpack-objects -n <../test-$PACK.pack
> +	) &&
> +	(
> +		cd unpack-test.git &&
> +		find objects/ -type f
> +	) >actual &&
> +	test_must_be_empty actual
> +'
> +
> +test_expect_success 'unpack-objects dry-run with small threshold' '
> +	(
> +		cd unpack-test.git &&
> +		git config core.bigFileThreshold 1m &&
> +		git unpack-objects -n <../test-$PACK.pack
> +	) &&
> +	(
> +		cd unpack-test.git &&
> +		find objects/ -type f
> +	) >actual &&
> +	test_must_be_empty actual
> +'
> +
> +test_done


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-20 14:42 ` Philip Oakley
@ 2021-10-21  3:42   ` Han Xin
  2021-10-21 22:47     ` Philip Oakley
  0 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-10-21  3:42 UTC (permalink / raw)
  To: Philip Oakley; +Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Han Xin

Philip Oakley <philipoakley@iee.email> 于2021年10月20日周三 下午10:43写道:
>
> On 09/10/2021 09:20, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When calling "unpack_non_delta_entry()", will allocate full memory for
> > the whole size of the unpacked object and write the buffer to loose file
> > on disk. This may lead to OOM for the git-unpack-objects process when
> > unpacking a very large object.
> >
> > In function "unpack_delta_entry()", will also allocate full memory to
> > buffer the whole delta, but since there will be no delta for an object
> > larger than "core.bigFileThreshold", this issue is moderate.
> >
> > To resolve the OOM issue in "git-unpack-objects", we can unpack large
> > object to file in stream, and use the setting of "core.bigFileThreshold" as
> > the threshold for large object.
> >
> > Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  builtin/unpack-objects.c          |  41 +++++++-
> >  object-file.c                     | 149 +++++++++++++++++++++++++++---
> >  object-store.h                    |   9 ++
> >  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
> >  4 files changed, 279 insertions(+), 12 deletions(-)
> >  create mode 100755 t/t5590-receive-unpack-objects.sh
> >
> > diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> > index 4a9466295b..8ac77e60a8 100644
> > --- a/builtin/unpack-objects.c
> > +++ b/builtin/unpack-objects.c
> > @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
> >       }
> >  }
> >
> > +static void fill_stream(struct git_zstream *stream)
> > +{
> > +     stream->next_in = fill(1);
> > +     stream->avail_in = len;
> > +}
> > +
> > +static void use_stream(struct git_zstream *stream)
> > +{
> > +     use(len - stream->avail_in);
> > +}
> > +
> > +static void write_stream_blob(unsigned nr, unsigned long size)
>
> Can we use size_t for the `size`, and possibly `nr`, to improve
> compatibility with Windows systems where unsigned long is only 32 bits?
>
> There has been some work in the past on providing large file support on
> Windows, which requires numerous long -> size_t changes.
>
> Philip

Thanks for your review. I'm not sure if I should make this change in this patch,
as it would also change the types defined in `unpack_one()`, `unpack_non_delta_entry`,
`write_object()` and many others.

> > +{
> > +     struct git_zstream_reader reader;
> > +     struct object_id *oid = &obj_list[nr].oid;
> > +
> > +     reader.fill = &fill_stream;
> > +     reader.use = &use_stream;
> > +
> > +     if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
> > +                                  oid, dry_run))
> > +             die("failed to write object in stream");
> > +     if (strict && !dry_run) {
> > +             struct blob *blob = lookup_blob(the_repository, oid);
> > +             if (blob)
> > +                     blob->object.flags |= FLAG_WRITTEN;
> > +             else
> > +                     die("invalid blob object from stream");
> > +     }
> > +     obj_list[nr].obj = NULL;
> > +}
> > +
> >  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
> >                                  unsigned nr)
> >  {
> > -     void *buf = get_data(size);
> > +     void *buf;
> > +
> > +     /* Write large blob in stream without allocating full buffer. */
> > +     if (type == OBJ_BLOB && size > big_file_threshold) {
> > +             write_stream_blob(nr, size);
> > +             return;
> > +     }
> >
> > +     buf = get_data(size);
> >       if (!dry_run && buf)
> >               write_object(nr, type, buf, size);
> >       else
> > diff --git a/object-file.c b/object-file.c
> > index a8be899481..06c1693675 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
> > +                            int fd, unsigned char *compressed,
> > +                            int compressed_len, const void *buf,
> > +                            size_t len, int flush)
> > +{
> > +     int ret;
> > +
> > +     stream->next_in = (void *)buf;
> > +     stream->avail_in = len;
> > +     do {
> > +             unsigned char *in0 = stream->next_in;
> > +             ret = git_deflate(stream, flush);
> > +             the_hash_algo->update_fn(c, in0, stream->next_in - in0);
> > +             if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
> > +                     die(_("unable to write loose object file"));
> > +             stream->next_out = compressed;
> > +             stream->avail_out = compressed_len;
> > +     } while (ret == Z_OK);
> > +
> > +     return ret;
> > +}
> > +
> >  static int write_loose_object(const struct object_id *oid, char *hdr,
> >                             int hdrlen, const void *buf, unsigned long len,
> >                             time_t mtime)
> > @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       the_hash_algo->update_fn(&c, hdr, hdrlen);
> >
> >       /* Then the data itself.. */
> > -     stream.next_in = (void *)buf;
> > -     stream.avail_in = len;
> > -     do {
> > -             unsigned char *in0 = stream.next_in;
> > -             ret = git_deflate(&stream, Z_FINISH);
> > -             the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> > -             if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> > -                     die(_("unable to write loose object file"));
> > -             stream.next_out = compressed;
> > -             stream.avail_out = sizeof(compressed);
> > -     } while (ret == Z_OK);
> > +     ret = write_object_buffer(&stream, &c, fd, compressed,
> > +                               sizeof(compressed), buf, len,
> > +                               Z_FINISH);
> >
> >       if (ret != Z_STREAM_END)
> >               die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> > @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
> >       return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
> >  }
> >
> > +int write_stream_object_file(struct git_zstream_reader *reader,
> > +                          unsigned long len, const char *type,
> > +                          struct object_id *oid,
> > +                          int dry_run)
> > +{
> > +     git_zstream istream, ostream;
> > +     unsigned char buf[8192], compressed[4096];
> > +     char hdr[MAX_HEADER_LEN];
> > +     int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
> > +     int ret = 0;
> > +     git_hash_ctx c;
> > +     struct strbuf tmp_file = STRBUF_INIT;
> > +     struct strbuf filename = STRBUF_INIT;
> > +
> > +     /* Write tmpfile in objects dir, because oid is unknown */
> > +     if (!dry_run) {
> > +             strbuf_addstr(&filename, the_repository->objects->odb->path);
> > +             strbuf_addch(&filename, '/');
> > +             fd = create_tmpfile(&tmp_file, filename.buf);
> > +             if (fd < 0) {
> > +                     if (errno == EACCES)
> > +                             ret = error(_("insufficient permission for adding an object to repository database %s"),
> > +                                     get_object_directory());
> > +                     else
> > +                             ret = error_errno(_("unable to create temporary file"));
> > +                     goto cleanup;
> > +             }
> > +     }
> > +
> > +     memset(&istream, 0, sizeof(istream));
> > +     istream.next_out = buf;
> > +     istream.avail_out = sizeof(buf);
> > +     git_inflate_init(&istream);
> > +
> > +     if (!dry_run) {
> > +             /* Set it up */
> > +             git_deflate_init(&ostream, zlib_compression_level);
> > +             ostream.next_out = compressed;
> > +             ostream.avail_out = sizeof(compressed);
> > +             the_hash_algo->init_fn(&c);
> > +
> > +             /* First header */
> > +             hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
> > +                             (uintmax_t)len) + 1;
> > +             ostream.next_in = (unsigned char *)hdr;
> > +             ostream.avail_in = hdrlen;
> > +             while (git_deflate(&ostream, 0) == Z_OK)
> > +                     ; /* nothing */
> > +             the_hash_algo->update_fn(&c, hdr, hdrlen);
> > +     }
> > +
> > +     /* Then the data itself */
> > +     do {
> > +             unsigned char *last_out = istream.next_out;
> > +             reader->fill(&istream);
> > +             istatus = git_inflate(&istream, 0);
> > +             if (istatus == Z_STREAM_END)
> > +                     flush = Z_FINISH;
> > +             reader->use(&istream);
> > +             if (!dry_run)
> > +                     ostatus = write_object_buffer(&ostream, &c, fd, compressed,
> > +                                                   sizeof(compressed), last_out,
> > +                                                   istream.next_out - last_out,
> > +                                                   flush);
> > +             istream.next_out = buf;
> > +             istream.avail_out = sizeof(buf);
> > +     } while (istatus == Z_OK);
> > +
> > +     if (istream.total_out != len || istatus != Z_STREAM_END)
> > +             die( _("inflate returned %d"), istatus);
> > +     git_inflate_end(&istream);
> > +
> > +     if (dry_run)
> > +             goto cleanup;
> > +
> > +     if (ostatus != Z_STREAM_END)
> > +             die(_("unable to deflate new object (%d)"), ostatus);
> > +     ostatus = git_deflate_end_gently(&ostream);
> > +     if (ostatus != Z_OK)
> > +             die(_("deflateEnd on object failed (%d)"), ostatus);
> > +     the_hash_algo->final_fn(oid->hash, &c);
> > +     close_loose_object(fd);
> > +
> > +     /* We get the oid now */
> > +     loose_object_path(the_repository, &filename, oid);
> > +
> > +     dirlen = directory_size(filename.buf);
> > +     if (dirlen) {
> > +             struct strbuf dir = STRBUF_INIT;
> > +             /*
> > +              * Make sure the directory exists; note that the contents
> > +              * of the buffer are undefined after mkstemp returns an
> > +              * error, so we have to rewrite the whole buffer from
> > +              * scratch.
> > +              */
> > +             strbuf_add(&dir, filename.buf, dirlen - 1);
> > +             if (mkdir(dir.buf, 0777) && errno != EEXIST) {
> > +                     unlink_or_warn(tmp_file.buf);
> > +                     strbuf_release(&dir);
> > +                     ret = -1;
> > +                     goto cleanup;
> > +             }
> > +             strbuf_release(&dir);
> > +     }
> > +
> > +     ret = finalize_object_file(tmp_file.buf, filename.buf);
> > +
> > +cleanup:
> > +     strbuf_release(&tmp_file);
> > +     strbuf_release(&filename);
> > +     return ret;
> > +}
> > +
> >  int hash_object_file_literally(const void *buf, unsigned long len,
> >                              const char *type, struct object_id *oid,
> >                              unsigned flags)
> > diff --git a/object-store.h b/object-store.h
> > index d24915ced1..12b113ef93 100644
> > --- a/object-store.h
> > +++ b/object-store.h
> > @@ -33,6 +33,11 @@ struct object_directory {
> >       char *path;
> >  };
> >
> > +struct git_zstream_reader {
> > +     void (*fill)(struct git_zstream *);
> > +     void (*use)(struct git_zstream *);
> > +};
> > +
> >  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
> >       struct object_directory *, 1, fspathhash, fspatheq)
> >
> > @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
> >  int write_object_file(const void *buf, unsigned long len,
> >                     const char *type, struct object_id *oid);
> >
> > +int write_stream_object_file(struct git_zstream_reader *reader,
> > +                          unsigned long len, const char *type,
> > +                          struct object_id *oid, int dry_run);
> > +
> >  int hash_object_file_literally(const void *buf, unsigned long len,
> >                              const char *type, struct object_id *oid,
> >                              unsigned flags);
> > diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
> > new file mode 100755
> > index 0000000000..7e63dfc0db
> > --- /dev/null
> > +++ b/t/t5590-receive-unpack-objects.sh
> > @@ -0,0 +1,92 @@
> > +#!/bin/sh
> > +#
> > +# Copyright (c) 2021 Han Xin
> > +#
> > +
> > +test_description='Test unpack-objects when receive pack'
> > +
> > +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> > +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> > +
> > +. ./test-lib.sh
> > +
> > +test_expect_success "create commit with big blobs (1.5 MB)" '
> > +     test-tool genrandom foo 1500000 >big-blob &&
> > +     test_commit --append foo big-blob &&
> > +     test-tool genrandom bar 1500000 >big-blob &&
> > +     test_commit --append bar big-blob &&
> > +     (
> > +             cd .git &&
> > +             find objects/?? -type f | sort
> > +     ) >expect &&
> > +     git repack -ad
> > +'
> > +
> > +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> > +     GIT_ALLOC_LIMIT=1m &&
> > +     export GIT_ALLOC_LIMIT
> > +'
> > +
> > +test_expect_success 'prepare dest repository' '
> > +     git init --bare dest.git &&
> > +     git -C dest.git config core.bigFileThreshold 2m &&
> > +     git -C dest.git config receive.unpacklimit 100
> > +'
> > +
> > +test_expect_success 'fail to push: cannot allocate' '
> > +     test_must_fail git push dest.git HEAD 2>err &&
> > +     test_i18ngrep "remote: fatal: attempting to allocate" err &&
> > +     (
> > +             cd dest.git &&
> > +             find objects/?? -type f | sort
> > +     ) >actual &&
> > +     ! test_cmp expect actual
> > +'
> > +
> > +test_expect_success 'set a lower bigfile threshold' '
> > +     git -C dest.git config core.bigFileThreshold 1m
> > +'
> > +
> > +test_expect_success 'unpack big object in stream' '
> > +     git push dest.git HEAD &&
> > +     git -C dest.git fsck &&
> > +     (
> > +             cd dest.git &&
> > +             find objects/?? -type f | sort
> > +     ) >actual &&
> > +     test_cmp expect actual
> > +'
> > +
> > +test_expect_success 'setup for unpack-objects dry-run test' '
> > +     PACK=$(echo main | git pack-objects --progress --revs test) &&
> > +     unset GIT_ALLOC_LIMIT &&
> > +     git init --bare unpack-test.git
> > +'
> > +
> > +test_expect_success 'unpack-objects dry-run with large threshold' '
> > +     (
> > +             cd unpack-test.git &&
> > +             git config core.bigFileThreshold 2m &&
> > +             git unpack-objects -n <../test-$PACK.pack
> > +     ) &&
> > +     (
> > +             cd unpack-test.git &&
> > +             find objects/ -type f
> > +     ) >actual &&
> > +     test_must_be_empty actual
> > +'
> > +
> > +test_expect_success 'unpack-objects dry-run with small threshold' '
> > +     (
> > +             cd unpack-test.git &&
> > +             git config core.bigFileThreshold 1m &&
> > +             git unpack-objects -n <../test-$PACK.pack
> > +     ) &&
> > +     (
> > +             cd unpack-test.git &&
> > +             find objects/ -type f
> > +     ) >actual &&
> > +     test_must_be_empty actual
> > +'
> > +
> > +test_done
>

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-21  3:42   ` Han Xin
@ 2021-10-21 22:47     ` Philip Oakley
  0 siblings, 0 replies; 165+ messages in thread
From: Philip Oakley @ 2021-10-21 22:47 UTC (permalink / raw)
  To: Han Xin; +Cc: Han Xin, Jiang Xin, Git List

On 21/10/2021 04:42, Han Xin wrote:
>>> +static void write_stream_blob(unsigned nr, unsigned long size)
>> Can we use size_t for the `size`, and possibly `nr`, to improve
>> compatibility with Windows systems where unsigned long is only 32 bits?
>>
>> There has been some work in the past on providing large file support on
>> Windows, which requires numerous long -> size_t changes.
>>
>> Philip
> Thanks for your review. I'm not sure if I should do this change in this patch,
> it will also change the type defined in `unpack_one()`,`unpack_non_delta_entry`,
> `write_object()` and many others.
>
I was mainly raising the issue regarding the 4GB (sometime 2GB)
limitations on Windows which has been a problem for many years.

I had been thinking of not changing the `nr` (number of objects limit)
as 2G objects is hopefully already sufficient, even for the largest of
repos (though IIUC their index file size did break the 32bit size limit).

Staying with the existing types won't make the situation any worse, so
from that perspective the change isn't needed.
--
Philip

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
  2021-10-19  7:37 ` Han Xin
  2021-10-20 14:42 ` Philip Oakley
@ 2021-11-03  1:48 ` Han Xin
  2021-11-03 10:07   ` Philip Oakley
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
                   ` (11 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-03  1:48 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

Any more suggestions?

Han Xin <chiyutianyi@gmail.com> 于2021年10月9日周六 下午4:21写道:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", will allocate full memory for
> the whole size of the unpacked object and write the buffer to loose file
> on disk. This may lead to OOM for the git-unpack-objects process when
> unpacking a very large object.
>
> In function "unpack_delta_entry()", will also allocate full memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack large
> object to file in stream, and use the setting of "core.bigFileThreshold" as
> the threshold for large object.
>
> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          |  41 +++++++-
>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>  object-store.h                    |   9 ++
>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>  4 files changed, 279 insertions(+), 12 deletions(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..8ac77e60a8 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>         }
>  }
>
> +static void fill_stream(struct git_zstream *stream)
> +{
> +       stream->next_in = fill(1);
> +       stream->avail_in = len;
> +}
> +
> +static void use_stream(struct git_zstream *stream)
> +{
> +       use(len - stream->avail_in);
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +       struct git_zstream_reader reader;
> +       struct object_id *oid = &obj_list[nr].oid;
> +
> +       reader.fill = &fill_stream;
> +       reader.use = &use_stream;
> +
> +       if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
> +                                    oid, dry_run))
> +               die("failed to write object in stream");
> +       if (strict && !dry_run) {
> +               struct blob *blob = lookup_blob(the_repository, oid);
> +               if (blob)
> +                       blob->object.flags |= FLAG_WRITTEN;
> +               else
> +                       die("invalid blob object from stream");
> +       }
> +       obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>                                    unsigned nr)
>  {
> -       void *buf = get_data(size);
> +       void *buf;
> +
> +       /* Write large blob in stream without allocating full buffer. */
> +       if (type == OBJ_BLOB && size > big_file_threshold) {
> +               write_stream_blob(nr, size);
> +               return;
> +       }
>
> +       buf = get_data(size);
>         if (!dry_run && buf)
>                 write_object(nr, type, buf, size);
>         else
> diff --git a/object-file.c b/object-file.c
> index a8be899481..06c1693675 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>         return fd;
>  }
>
> +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
> +                              int fd, unsigned char *compressed,
> +                              int compressed_len, const void *buf,
> +                              size_t len, int flush)
> +{
> +       int ret;
> +
> +       stream->next_in = (void *)buf;
> +       stream->avail_in = len;
> +       do {
> +               unsigned char *in0 = stream->next_in;
> +               ret = git_deflate(stream, flush);
> +               the_hash_algo->update_fn(c, in0, stream->next_in - in0);
> +               if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
> +                       die(_("unable to write loose object file"));
> +               stream->next_out = compressed;
> +               stream->avail_out = compressed_len;
> +       } while (ret == Z_OK);
> +
> +       return ret;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>                               int hdrlen, const void *buf, unsigned long len,
>                               time_t mtime)
> @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> -       stream.next_in = (void *)buf;
> -       stream.avail_in = len;
> -       do {
> -               unsigned char *in0 = stream.next_in;
> -               ret = git_deflate(&stream, Z_FINISH);
> -               the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> -                       die(_("unable to write loose object file"));
> -               stream.next_out = compressed;
> -               stream.avail_out = sizeof(compressed);
> -       } while (ret == Z_OK);
> +       ret = write_object_buffer(&stream, &c, fd, compressed,
> +                                 sizeof(compressed), buf, len,
> +                                 Z_FINISH);
>
>         if (ret != Z_STREAM_END)
>                 die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
>         return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
>  }
>
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +                            unsigned long len, const char *type,
> +                            struct object_id *oid,
> +                            int dry_run)
> +{
> +       git_zstream istream, ostream;
> +       unsigned char buf[8192], compressed[4096];
> +       char hdr[MAX_HEADER_LEN];
> +       int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
> +       int ret = 0;
> +       git_hash_ctx c;
> +       struct strbuf tmp_file = STRBUF_INIT;
> +       struct strbuf filename = STRBUF_INIT;
> +
> +       /* Write tmpfile in objects dir, because oid is unknown */
> +       if (!dry_run) {
> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
> +               strbuf_addch(&filename, '/');
> +               fd = create_tmpfile(&tmp_file, filename.buf);
> +               if (fd < 0) {
> +                       if (errno == EACCES)
> +                               ret = error(_("insufficient permission for adding an object to repository database %s"),
> +                                       get_object_directory());
> +                       else
> +                               ret = error_errno(_("unable to create temporary file"));
> +                       goto cleanup;
> +               }
> +       }
> +
> +       memset(&istream, 0, sizeof(istream));
> +       istream.next_out = buf;
> +       istream.avail_out = sizeof(buf);
> +       git_inflate_init(&istream);
> +
> +       if (!dry_run) {
> +               /* Set it up */
> +               git_deflate_init(&ostream, zlib_compression_level);
> +               ostream.next_out = compressed;
> +               ostream.avail_out = sizeof(compressed);
> +               the_hash_algo->init_fn(&c);
> +
> +               /* First header */
> +               hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
> +                               (uintmax_t)len) + 1;
> +               ostream.next_in = (unsigned char *)hdr;
> +               ostream.avail_in = hdrlen;
> +               while (git_deflate(&ostream, 0) == Z_OK)
> +                       ; /* nothing */
> +               the_hash_algo->update_fn(&c, hdr, hdrlen);
> +       }
> +
> +       /* Then the data itself */
> +       do {
> +               unsigned char *last_out = istream.next_out;
> +               reader->fill(&istream);
> +               istatus = git_inflate(&istream, 0);
> +               if (istatus == Z_STREAM_END)
> +                       flush = Z_FINISH;
> +               reader->use(&istream);
> +               if (!dry_run)
> +                       ostatus = write_object_buffer(&ostream, &c, fd, compressed,
> +                                                     sizeof(compressed), last_out,
> +                                                     istream.next_out - last_out,
> +                                                     flush);
> +               istream.next_out = buf;
> +               istream.avail_out = sizeof(buf);
> +       } while (istatus == Z_OK);
> +
> +       if (istream.total_out != len || istatus != Z_STREAM_END)
> +               die( _("inflate returned %d"), istatus);
> +       git_inflate_end(&istream);
> +
> +       if (dry_run)
> +               goto cleanup;
> +
> +       if (ostatus != Z_STREAM_END)
> +               die(_("unable to deflate new object (%d)"), ostatus);
> +       ostatus = git_deflate_end_gently(&ostream);
> +       if (ostatus != Z_OK)
> +               die(_("deflateEnd on object failed (%d)"), ostatus);
> +       the_hash_algo->final_fn(oid->hash, &c);
> +       close_loose_object(fd);
> +
> +       /* We get the oid now */
> +       loose_object_path(the_repository, &filename, oid);
> +
> +       dirlen = directory_size(filename.buf);
> +       if (dirlen) {
> +               struct strbuf dir = STRBUF_INIT;
> +               /*
> +                * Make sure the directory exists; note that the contents
> +                * of the buffer are undefined after mkstemp returns an
> +                * error, so we have to rewrite the whole buffer from
> +                * scratch.
> +                */
> +               strbuf_add(&dir, filename.buf, dirlen - 1);
> +               if (mkdir(dir.buf, 0777) && errno != EEXIST) {
> +                       unlink_or_warn(tmp_file.buf);
> +                       strbuf_release(&dir);
> +                       ret = -1;
> +                       goto cleanup;
> +               }
> +               strbuf_release(&dir);
> +       }
> +
> +       ret = finalize_object_file(tmp_file.buf, filename.buf);
> +
> +cleanup:
> +       strbuf_release(&tmp_file);
> +       strbuf_release(&filename);
> +       return ret;
> +}
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>                                const char *type, struct object_id *oid,
>                                unsigned flags)
> diff --git a/object-store.h b/object-store.h
> index d24915ced1..12b113ef93 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -33,6 +33,11 @@ struct object_directory {
>         char *path;
>  };
>
> +struct git_zstream_reader {
> +       void (*fill)(struct git_zstream *);
> +       void (*use)(struct git_zstream *);
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>         struct object_directory *, 1, fspathhash, fspatheq)
>
> @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
>  int write_object_file(const void *buf, unsigned long len,
>                       const char *type, struct object_id *oid);
>
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +                            unsigned long len, const char *type,
> +                            struct object_id *oid, int dry_run);
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>                                const char *type, struct object_id *oid,
>                                unsigned flags);
> diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
> new file mode 100755
> index 0000000000..7e63dfc0db
> --- /dev/null
> +++ b/t/t5590-receive-unpack-objects.sh
> @@ -0,0 +1,92 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects when receive pack'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +test_expect_success "create commit with big blobs (1.5 MB)" '
> +       test-tool genrandom foo 1500000 >big-blob &&
> +       test_commit --append foo big-blob &&
> +       test-tool genrandom bar 1500000 >big-blob &&
> +       test_commit --append bar big-blob &&
> +       (
> +               cd .git &&
> +               find objects/?? -type f | sort
> +       ) >expect &&
> +       git repack -ad
> +'
> +
> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> +       GIT_ALLOC_LIMIT=1m &&
> +       export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'prepare dest repository' '
> +       git init --bare dest.git &&
> +       git -C dest.git config core.bigFileThreshold 2m &&
> +       git -C dest.git config receive.unpacklimit 100
> +'
> +
> +test_expect_success 'fail to push: cannot allocate' '
> +       test_must_fail git push dest.git HEAD 2>err &&
> +       test_i18ngrep "remote: fatal: attempting to allocate" err &&
> +       (
> +               cd dest.git &&
> +               find objects/?? -type f | sort
> +       ) >actual &&
> +       ! test_cmp expect actual
> +'
> +
> +test_expect_success 'set a lower bigfile threshold' '
> +       git -C dest.git config core.bigFileThreshold 1m
> +'
> +
> +test_expect_success 'unpack big object in stream' '
> +       git push dest.git HEAD &&
> +       git -C dest.git fsck &&
> +       (
> +               cd dest.git &&
> +               find objects/?? -type f | sort
> +       ) >actual &&
> +       test_cmp expect actual
> +'
> +
> +test_expect_success 'setup for unpack-objects dry-run test' '
> +       PACK=$(echo main | git pack-objects --progress --revs test) &&
> +       unset GIT_ALLOC_LIMIT &&
> +       git init --bare unpack-test.git
> +'
> +
> +test_expect_success 'unpack-objects dry-run with large threshold' '
> +       (
> +               cd unpack-test.git &&
> +               git config core.bigFileThreshold 2m &&
> +               git unpack-objects -n <../test-$PACK.pack
> +       ) &&
> +       (
> +               cd unpack-test.git &&
> +               find objects/ -type f
> +       ) >actual &&
> +       test_must_be_empty actual
> +'
> +
> +test_expect_success 'unpack-objects dry-run with small threshold' '
> +       (
> +               cd unpack-test.git &&
> +               git config core.bigFileThreshold 1m &&
> +               git unpack-objects -n <../test-$PACK.pack
> +       ) &&
> +       (
> +               cd unpack-test.git &&
> +               find objects/ -type f
> +       ) >actual &&
> +       test_must_be_empty actual
> +'
> +
> +test_done
> --
> 2.33.0.1.g09a6bb964f.dirty
>

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-11-03  1:48 ` Han Xin
@ 2021-11-03 10:07   ` Philip Oakley
  0 siblings, 0 replies; 165+ messages in thread
From: Philip Oakley @ 2021-11-03 10:07 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

(replies to the alibaba-inc.com aren't getting through for me)

On 03/11/2021 01:48, Han Xin wrote:
> Any more suggestions?
>
> Han Xin <chiyutianyi@gmail.com> 于2021年10月9日周六 下午4:21写道:
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>
>> When calling "unpack_non_delta_entry()", will allocate full memory for
>> the whole size of the unpacked object and write the buffer to loose file
>> on disk. This may lead to OOM for the git-unpack-objects process when
>> unpacking a very large object.

Is it possible to split the patch into smaller pieces, taking each item
separately?

For large files (as above), it should be possible to stream the
unpacking direct to disk, in the same way that the zlib reading is
chunked. However having the same 'code' in two places would need to be
addressed (the DRY principle).

At the moment on LLP64 systems (Windows) there is already a long (32bit)
vs size_t (64bit) problem there (zlib stream), and the size_t problem
then permeates the wider codebase.

The normal Git file operations does tend to memory map whole files, but
here it looks like you can bypass that.
>>
>> In function "unpack_delta_entry()", will also allocate full memory to
>> buffer the whole delta, but since there will be no delta for an object
>> larger than "core.bigFileThreshold", this issue is moderate.

What does 'moderate' mean here? Does it mean there is a simple test that
allows you to side step the whole problem?

>>
>> To resolve the OOM issue in "git-unpack-objects", we can unpack large
>> object to file in stream, and use the setting of "core.bigFileThreshold" as
>> the threshold for large object.

Is this "core.bigFileThreshold" the core element? If so, it is too far
down the commit message. The readers have already (potentially) misread
the message and reacted too soon.  Perhaps: "use `core.bigFileThreshold`
to avoid mmap OOM limits when unpacking".

--
Philip
>>
>> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>> ---
>>  builtin/unpack-objects.c          |  41 +++++++-
>>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>>  object-store.h                    |   9 ++
>>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>>  4 files changed, 279 insertions(+), 12 deletions(-)
>>  create mode 100755 t/t5590-receive-unpack-objects.sh
>>
>> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>> index 4a9466295b..8ac77e60a8 100644
>> --- a/builtin/unpack-objects.c
>> +++ b/builtin/unpack-objects.c
>> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>>         }
>>  }
>>
>> +static void fill_stream(struct git_zstream *stream)
>> +{
>> +       stream->next_in = fill(1);
>> +       stream->avail_in = len;
>> +}
>> +
>> +static void use_stream(struct git_zstream *stream)
>> +{
>> +       use(len - stream->avail_in);
>> +}
>> +
>> +static void write_stream_blob(unsigned nr, unsigned long size)
>> +{
>> +       struct git_zstream_reader reader;
>> +       struct object_id *oid = &obj_list[nr].oid;
>> +
>> +       reader.fill = &fill_stream;
>> +       reader.use = &use_stream;
>> +
>> +       if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
>> +                                    oid, dry_run))
>> +               die("failed to write object in stream");
>> +       if (strict && !dry_run) {
>> +               struct blob *blob = lookup_blob(the_repository, oid);
>> +               if (blob)
>> +                       blob->object.flags |= FLAG_WRITTEN;
>> +               else
>> +                       die("invalid blob object from stream");
>> +       }
>> +       obj_list[nr].obj = NULL;
>> +}
>> +
>>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>>                                    unsigned nr)
>>  {
>> -       void *buf = get_data(size);
>> +       void *buf;
>> +
>> +       /* Write large blob in stream without allocating full buffer. */
>> +       if (type == OBJ_BLOB && size > big_file_threshold) {
>> +               write_stream_blob(nr, size);
>> +               return;
>> +       }
>>
>> +       buf = get_data(size);
>>         if (!dry_run && buf)
>>                 write_object(nr, type, buf, size);
>>         else
>> diff --git a/object-file.c b/object-file.c
>> index a8be899481..06c1693675 100644
>> --- a/object-file.c
>> +++ b/object-file.c
>> @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>>         return fd;
>>  }
>>
>> +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
>> +                              int fd, unsigned char *compressed,
>> +                              int compressed_len, const void *buf,
>> +                              size_t len, int flush)
>> +{
>> +       int ret;
>> +
>> +       stream->next_in = (void *)buf;
>> +       stream->avail_in = len;
>> +       do {
>> +               unsigned char *in0 = stream->next_in;
>> +               ret = git_deflate(stream, flush);
>> +               the_hash_algo->update_fn(c, in0, stream->next_in - in0);
>> +               if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
>> +                       die(_("unable to write loose object file"));
>> +               stream->next_out = compressed;
>> +               stream->avail_out = compressed_len;
>> +       } while (ret == Z_OK);
>> +
>> +       return ret;
>> +}
>> +
>>  static int write_loose_object(const struct object_id *oid, char *hdr,
>>                               int hdrlen, const void *buf, unsigned long len,
>>                               time_t mtime)
>> @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>>
>>         /* Then the data itself.. */
>> -       stream.next_in = (void *)buf;
>> -       stream.avail_in = len;
>> -       do {
>> -               unsigned char *in0 = stream.next_in;
>> -               ret = git_deflate(&stream, Z_FINISH);
>> -               the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
>> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>> -                       die(_("unable to write loose object file"));
>> -               stream.next_out = compressed;
>> -               stream.avail_out = sizeof(compressed);
>> -       } while (ret == Z_OK);
>> +       ret = write_object_buffer(&stream, &c, fd, compressed,
>> +                                 sizeof(compressed), buf, len,
>> +                                 Z_FINISH);
>>
>>         if (ret != Z_STREAM_END)
>>                 die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
>> @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
>>         return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
>>  }
>>
>> +int write_stream_object_file(struct git_zstream_reader *reader,
>> +                            unsigned long len, const char *type,
>> +                            struct object_id *oid,
>> +                            int dry_run)
>> +{
>> +       git_zstream istream, ostream;
>> +       unsigned char buf[8192], compressed[4096];
>> +       char hdr[MAX_HEADER_LEN];
>> +       int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
>> +       int ret = 0;
>> +       git_hash_ctx c;
>> +       struct strbuf tmp_file = STRBUF_INIT;
>> +       struct strbuf filename = STRBUF_INIT;
>> +
>> +       /* Write tmpfile in objects dir, because oid is unknown */
>> +       if (!dry_run) {
>> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
>> +               strbuf_addch(&filename, '/');
>> +               fd = create_tmpfile(&tmp_file, filename.buf);
>> +               if (fd < 0) {
>> +                       if (errno == EACCES)
>> +                               ret = error(_("insufficient permission for adding an object to repository database %s"),
>> +                                       get_object_directory());
>> +                       else
>> +                               ret = error_errno(_("unable to create temporary file"));
>> +                       goto cleanup;
>> +               }
>> +       }
>> +
>> +       memset(&istream, 0, sizeof(istream));
>> +       istream.next_out = buf;
>> +       istream.avail_out = sizeof(buf);
>> +       git_inflate_init(&istream);
>> +
>> +       if (!dry_run) {
>> +               /* Set it up */
>> +               git_deflate_init(&ostream, zlib_compression_level);
>> +               ostream.next_out = compressed;
>> +               ostream.avail_out = sizeof(compressed);
>> +               the_hash_algo->init_fn(&c);
>> +
>> +               /* First header */
>> +               hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
>> +                               (uintmax_t)len) + 1;
>> +               ostream.next_in = (unsigned char *)hdr;
>> +               ostream.avail_in = hdrlen;
>> +               while (git_deflate(&ostream, 0) == Z_OK)
>> +                       ; /* nothing */
>> +               the_hash_algo->update_fn(&c, hdr, hdrlen);
>> +       }
>> +
>> +       /* Then the data itself */
>> +       do {
>> +               unsigned char *last_out = istream.next_out;
>> +               reader->fill(&istream);
>> +               istatus = git_inflate(&istream, 0);
>> +               if (istatus == Z_STREAM_END)
>> +                       flush = Z_FINISH;
>> +               reader->use(&istream);
>> +               if (!dry_run)
>> +                       ostatus = write_object_buffer(&ostream, &c, fd, compressed,
>> +                                                     sizeof(compressed), last_out,
>> +                                                     istream.next_out - last_out,
>> +                                                     flush);
>> +               istream.next_out = buf;
>> +               istream.avail_out = sizeof(buf);
>> +       } while (istatus == Z_OK);
>> +
>> +       if (istream.total_out != len || istatus != Z_STREAM_END)
>> +               die( _("inflate returned %d"), istatus);
>> +       git_inflate_end(&istream);
>> +
>> +       if (dry_run)
>> +               goto cleanup;
>> +
>> +       if (ostatus != Z_STREAM_END)
>> +               die(_("unable to deflate new object (%d)"), ostatus);
>> +       ostatus = git_deflate_end_gently(&ostream);
>> +       if (ostatus != Z_OK)
>> +               die(_("deflateEnd on object failed (%d)"), ostatus);
>> +       the_hash_algo->final_fn(oid->hash, &c);
>> +       close_loose_object(fd);
>> +
>> +       /* We get the oid now */
>> +       loose_object_path(the_repository, &filename, oid);
>> +
>> +       dirlen = directory_size(filename.buf);
>> +       if (dirlen) {
>> +               struct strbuf dir = STRBUF_INIT;
>> +               /*
>> +                * Make sure the directory exists; note that the contents
>> +                * of the buffer are undefined after mkstemp returns an
>> +                * error, so we have to rewrite the whole buffer from
>> +                * scratch.
>> +                */
>> +               strbuf_add(&dir, filename.buf, dirlen - 1);
>> +               if (mkdir(dir.buf, 0777) && errno != EEXIST) {
>> +                       unlink_or_warn(tmp_file.buf);
>> +                       strbuf_release(&dir);
>> +                       ret = -1;
>> +                       goto cleanup;
>> +               }
>> +               strbuf_release(&dir);
>> +       }
>> +
>> +       ret = finalize_object_file(tmp_file.buf, filename.buf);
>> +
>> +cleanup:
>> +       strbuf_release(&tmp_file);
>> +       strbuf_release(&filename);
>> +       return ret;
>> +}
>> +
>>  int hash_object_file_literally(const void *buf, unsigned long len,
>>                                const char *type, struct object_id *oid,
>>                                unsigned flags)
>> diff --git a/object-store.h b/object-store.h
>> index d24915ced1..12b113ef93 100644
>> --- a/object-store.h
>> +++ b/object-store.h
>> @@ -33,6 +33,11 @@ struct object_directory {
>>         char *path;
>>  };
>>
>> +struct git_zstream_reader {
>> +       void (*fill)(struct git_zstream *);
>> +       void (*use)(struct git_zstream *);
>> +};
>> +
>>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>>         struct object_directory *, 1, fspathhash, fspatheq)
>>
>> @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
>>  int write_object_file(const void *buf, unsigned long len,
>>                       const char *type, struct object_id *oid);
>>
>> +int write_stream_object_file(struct git_zstream_reader *reader,
>> +                            unsigned long len, const char *type,
>> +                            struct object_id *oid, int dry_run);
>> +
>>  int hash_object_file_literally(const void *buf, unsigned long len,
>>                                const char *type, struct object_id *oid,
>>                                unsigned flags);
>> diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
>> new file mode 100755
>> index 0000000000..7e63dfc0db
>> --- /dev/null
>> +++ b/t/t5590-receive-unpack-objects.sh
>> @@ -0,0 +1,92 @@
>> +#!/bin/sh
>> +#
>> +# Copyright (c) 2021 Han Xin
>> +#
>> +
>> +test_description='Test unpack-objects when receive pack'
>> +
>> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
>> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
>> +
>> +. ./test-lib.sh
>> +
>> +test_expect_success "create commit with big blobs (1.5 MB)" '
>> +       test-tool genrandom foo 1500000 >big-blob &&
>> +       test_commit --append foo big-blob &&
>> +       test-tool genrandom bar 1500000 >big-blob &&
>> +       test_commit --append bar big-blob &&
>> +       (
>> +               cd .git &&
>> +               find objects/?? -type f | sort
>> +       ) >expect &&
>> +       git repack -ad
>> +'
>> +
>> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
>> +       GIT_ALLOC_LIMIT=1m &&
>> +       export GIT_ALLOC_LIMIT
>> +'
>> +
>> +test_expect_success 'prepare dest repository' '
>> +       git init --bare dest.git &&
>> +       git -C dest.git config core.bigFileThreshold 2m &&
>> +       git -C dest.git config receive.unpacklimit 100
>> +'
>> +
>> +test_expect_success 'fail to push: cannot allocate' '
>> +       test_must_fail git push dest.git HEAD 2>err &&
>> +       test_i18ngrep "remote: fatal: attempting to allocate" err &&
>> +       (
>> +               cd dest.git &&
>> +               find objects/?? -type f | sort
>> +       ) >actual &&
>> +       ! test_cmp expect actual
>> +'
>> +
>> +test_expect_success 'set a lower bigfile threshold' '
>> +       git -C dest.git config core.bigFileThreshold 1m
>> +'
>> +
>> +test_expect_success 'unpack big object in stream' '
>> +       git push dest.git HEAD &&
>> +       git -C dest.git fsck &&
>> +       (
>> +               cd dest.git &&
>> +               find objects/?? -type f | sort
>> +       ) >actual &&
>> +       test_cmp expect actual
>> +'
>> +
>> +test_expect_success 'setup for unpack-objects dry-run test' '
>> +       PACK=$(echo main | git pack-objects --progress --revs test) &&
>> +       unset GIT_ALLOC_LIMIT &&
>> +       git init --bare unpack-test.git
>> +'
>> +
>> +test_expect_success 'unpack-objects dry-run with large threshold' '
>> +       (
>> +               cd unpack-test.git &&
>> +               git config core.bigFileThreshold 2m &&
>> +               git unpack-objects -n <../test-$PACK.pack
>> +       ) &&
>> +       (
>> +               cd unpack-test.git &&
>> +               find objects/ -type f
>> +       ) >actual &&
>> +       test_must_be_empty actual
>> +'
>> +
>> +test_expect_success 'unpack-objects dry-run with small threshold' '
>> +       (
>> +               cd unpack-test.git &&
>> +               git config core.bigFileThreshold 1m &&
>> +               git unpack-objects -n <../test-$PACK.pack
>> +       ) &&
>> +       (
>> +               cd unpack-test.git &&
>> +               find objects/ -type f
>> +       ) >actual &&
>> +       test_must_be_empty actual
>> +'
>> +
>> +test_done
>> --
>> 2.33.0.1.g09a6bb964f.dirty
>>


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (2 preceding siblings ...)
  2021-11-03  1:48 ` Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  4:59   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
                   ` (10 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Refactor write_loose_object() to support an input stream, in the same
way that zlib reading is chunked.

By using "in_stream" instead of "void *buf", we no longer need to
allocate all the memory in advance; only part of the contents will be
read on each call to "in_stream.read()".

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  5 +++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index 02b7970274..1ad2cb579c 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct input_data_from_buffer {
+	const char *buf;
+	unsigned long len;
+};
+
+static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
+{
+	struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
+
+	if (input->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = input->len;
+	input->len = 0;
+	return input->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const char *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream->data, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = (void *)&(struct input_data_from_buffer) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = (void *)&(struct input_data_from_buffer) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct input_data_from_buffer data;
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..f1b67e9100 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const char *(*read)(void* data, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (3 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:42   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
                   ` (9 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We will use "write_loose_object()" later to handle large blob object,
which needs to work in dry_run mode.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/object-file.c b/object-file.c
index 1ad2cb579c..b0838c847e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1880,9 +1880,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
 
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, struct input_stream *in_stream,
+			      int dry_run,
 			      time_t mtime, unsigned flags)
 {
-	int fd, ret;
+	int fd, ret = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1894,14 +1895,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
+	if (!dry_run) {
+		fd = create_tmpfile(&tmp_file, filename.buf);
+		if (fd < 0) {
+			if (flags & HASH_SILENT)
+				return -1;
+			else if (errno == EACCES)
+				return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			else
+				return error_errno(_("unable to create temporary file"));
+		}
 	}
 
 	/* Set it up */
@@ -1925,7 +1928,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		unsigned char *in0 = stream.next_in;
 		ret = git_deflate(&stream, Z_FINISH);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
 		stream.next_out = compressed;
 		stream.avail_out = sizeof(compressed);
@@ -1943,6 +1946,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
+	if (dry_run)
+		return 0;
+
 	close_loose_object(fd);
 
 	if (mtime) {
@@ -1996,7 +2002,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -2023,7 +2029,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0, 0);
 
 cleanup:
 	free(header);
@@ -2052,7 +2058,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	data.buf = buf;
 	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, mtime, 0);
 	free(buf);
 
 	return ret;
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (4 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:49   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
                   ` (8 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When reading from an input stream, the oid cannot be determined until
all of the data has been read; it will be filled in afterwards.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index b0838c847e..8393659f0d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1893,7 +1893,13 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const char *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else
+		loose_object_path(the_repository, &filename, oid);
 
 	if (!dry_run) {
 		fd = create_tmpfile(&tmp_file, filename.buf);
@@ -1942,7 +1948,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
@@ -1951,6 +1957,30 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		/* copy oid */
+		oidcpy((struct object_id *)oid, &parano_oid);
+		/* We get the oid now */
+		loose_object_path(the_repository, &filename, oid);
+
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			/*
+			 * Make sure the directory exists; note that the
+			 * contents of the buffer are undefined after mkstemp
+			 * returns an error, so we have to rewrite the whole
+			 * buffer from scratch.
+			 */
+			strbuf_reset(&dir);
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v2 4/6] object-file.c: read input stream repeatedly in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (5 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:56   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 5/6] object-store.h: add write_loose_object() Han Xin
                   ` (7 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Read the input stream repeatedly in write_loose_object() until reaching
the end, so that we can divide a large blob write into many small blocks.

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/object-file.c b/object-file.c
index 8393659f0d..e333448c54 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1891,7 +1891,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const char *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1927,12 +1927,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream->data, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {
+				stream.next_in = (void *)buf;
+				in0 = (unsigned char *)buf;
+			} else
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v2 5/6] object-store.h: add write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (6 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

For large loose object files, it should be possible to stream them
directly to disk with "write_loose_object()".
Unlike "write_object_file()", you need to implement an "input_stream"
instead of providing a "void *buf".

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 8 ++++----
 object-store.h | 5 +++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index e333448c54..60eb29db97 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,10 +1878,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
 	return input->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      int dry_run,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       int dry_run,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret = 0;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index f1b67e9100..f6faa8d6d3 100644
--- a/object-store.h
+++ b/object-store.h
@@ -228,6 +228,11 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       int dry_run,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v2 6/6] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (7 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 5/6] object-store.h: add write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  7:14   ` Jiang Xin
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                   ` (5 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When calling "unpack_non_delta_entry()", we allocate memory for the
whole size of the unpacked object and write the buffer to a loose file
on disk. This may lead to OOM for the git-unpack-objects process when
unpacking a very large object.

The function "unpack_delta_entry()" also allocates memory to buffer the
whole delta, but since there will be no delta for an object larger than
"core.bigFileThreshold", this issue is moderate.

To resolve the OOM issue in "git-unpack-objects", we can unpack a large
object to a file in a streaming fashion, and use "core.bigFileThreshold"
to avoid hitting memory limits when calling "get_data()".

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c          | 76 ++++++++++++++++++++++++-
 t/t5590-receive-unpack-objects.sh | 92 +++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100755 t/t5590-receive-unpack-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..6c757d823b 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -320,11 +320,85 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_data_from_zstream {
+	git_zstream *zstream;
+	unsigned char buf[4096];
+	int status;
+};
+
+static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
+{
+	struct input_data_from_zstream *input = data;
+	git_zstream *zstream = input->zstream;
+	void *in = fill(1);
+
+	if (!len || input->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = input->buf;
+	zstream->avail_out = sizeof(input->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	input->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(input->buf) - zstream->avail_out;
+
+	return (const char *)input->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_data_from_zstream data;
+	struct input_stream in_stream = {
+		.read = read_inflate_in_stream,
+		.data = &data,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
new file mode 100755
index 0000000000..7e63dfc0db
--- /dev/null
+++ b/t/t5590-receive-unpack-objects.sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	git repack -ad
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to push: cannot allocate' '
+	test_must_fail git push dest.git HEAD 2>err &&
+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git push dest.git HEAD &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	PACK=$(echo main | git pack-objects --progress --revs test) &&
+	unset GIT_ALLOC_LIMIT &&
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run with large threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 2m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_expect_success 'unpack-objects dry-run with small threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 1m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
@ 2021-11-18  4:59   ` Jiang Xin
  2021-11-18  6:45     ` Junio C Hamano
  0 siblings, 1 reply; 165+ messages in thread
From: Jiang Xin @ 2021-11-18  4:59 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>

It would be better to provide a cover letter describing changes in v2, such as:

* Make "write_loose_object()" a public method, so we can
   reuse it in "unpack_non_delta_entry()".
   (But I doubt we can use the "write_object_file_flags()" public
     function without making this change.)

* Add a new interface "input_stream" as an argument for
   "write_loose_object()", so that we can feed data to
   "write_loose_object()" from a buffer or from a zlib stream.

> Refactor write_loose_object() to support inputstream, in the same way
> that zlib reading is chunked.

At the beginning of your commit log, you should describe the problem, for example:

We used to read the full content of a blob into a buffer in
"unpack_non_delta_entry()" by calling:

    void *buf = get_data(size);

This will consume lots of memory for a very big blob object.

> Using "in_stream" instead of "void *buf", we needn't to allocate enough
> memory in advance, and only part of the contents will be read when
> called "in_stream.read()".
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  5 +++++
>  2 files changed, 51 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 02b7970274..1ad2cb579c 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>         return fd;
>  }
>
> +struct input_data_from_buffer {
> +       const char *buf;
> +       unsigned long len;
> +};
> +
> +static const char *read_input_stream_from_buffer(void *data, unsigned long *len)

Use "const void *" for the type of return variable, just like input
argument for write_loose_object()?

> +{
> +       struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
> +
> +       if (input->len == 0) {
> +               *len = 0;
> +               return NULL;
> +       }
> +       *len = input->len;
> +       input->len = 0;
> +       return input->buf;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
> -                             int hdrlen, const void *buf, unsigned long len,
> +                             int hdrlen, struct input_stream *in_stream,
>                               time_t mtime, unsigned flags)
>  {
>         int fd, ret;
> @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         struct object_id parano_oid;
>         static struct strbuf tmp_file = STRBUF_INIT;
>         static struct strbuf filename = STRBUF_INIT;
> +       const char *buf;

Can we use the same prototype as the original:  "const void *buf" ?

> +       unsigned long len;
>
>         loose_object_path(the_repository, &filename, oid);
>
> @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> +       buf = in_stream->read(in_stream->data, &len);
>         stream.next_in = (void *)buf;
>         stream.avail_in = len;
>         do {
> @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  {
>         char hdr[MAX_HEADER_LEN];
>         int hdrlen = sizeof(hdr);
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = (void *)&(struct input_data_from_buffer) {
> +                       .buf = buf,
> +                       .len = len,
> +               },
> +       };
>
>         /* Normally if we have it in the pack then we do not bother writing
>          * it out into .git/objects/??/?{38} file.
> @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>                                   &hdrlen);
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 return 0;
> -       return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> +       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
>  }
>
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>         char *header;
>         int hdrlen, status = 0;
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = (void *)&(struct input_data_from_buffer) {
> +                       .buf = buf,
> +                       .len = len,
> +               },
> +       };
>
>         /* type string, SP, %lu of the length plus NUL must fit this */
>         hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>                 goto cleanup;
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 goto cleanup;
> -       status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>
>  cleanup:
>         free(header);
> @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>         char hdr[MAX_HEADER_LEN];
>         int hdrlen;
>         int ret;
> +       struct input_data_from_buffer data;
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = &data,
> +       };
>
>         if (has_loose_object(oid))
>                 return 0;
>         buf = read_object(the_repository, oid, &type, &len);
>         if (!buf)
>                 return error(_("cannot read object for %s"), oid_to_hex(oid));
> +       data.buf = buf;
> +       data.len = len;
>         hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -       ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
>         free(buf);
>
>         return ret;
> diff --git a/object-store.h b/object-store.h
> index 952efb6a4b..f1b67e9100 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -34,6 +34,11 @@ struct object_directory {
>         char *path;
>  };
>
> +struct input_stream {
> +       const char *(*read)(void* data, unsigned long *len);
> +       void *data;
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>         struct object_directory *, 1, fspathhash, fspatheq)
>
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
@ 2021-11-18  5:42   ` Jiang Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-11-18  5:42 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We will use "write_loose_object()" later to handle large blob object,
> which needs to work in dry_run mode.

The dry_run mode comes from "builtin/unpack-objects.c", which throws
away the buffer read from "get_data()".
So why not add "dry_run" to "get_data()" instead?

If we have a dry_run version of get_data, such as "get_data(size,
dry_run)", we do not have to add a dry_run mode to
"write_loose_object()".

See: git grep -A5 get_data builtin/unpack-objects.c
builtin/unpack-objects.c:       void *buf = get_data(size);
builtin/unpack-objects.c-
builtin/unpack-objects.c-       if (!dry_run && buf)
builtin/unpack-objects.c-               write_object(nr, type, buf, size);
builtin/unpack-objects.c-       else
builtin/unpack-objects.c-               free(buf);
--
builtin/unpack-objects.c:               delta_data = get_data(delta_size);
builtin/unpack-objects.c-               if (dry_run || !delta_data) {
builtin/unpack-objects.c-                       free(delta_data);
builtin/unpack-objects.c-                       return;
builtin/unpack-objects.c-               }
--
builtin/unpack-objects.c:               delta_data = get_data(delta_size);
builtin/unpack-objects.c-               if (dry_run || !delta_data) {
builtin/unpack-objects.c-                       free(delta_data);
builtin/unpack-objects.c-                       return;
builtin/unpack-objects.c-               }


> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 32 +++++++++++++++++++-------------
>  1 file changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 1ad2cb579c..b0838c847e 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1880,9 +1880,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
>
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>                               int hdrlen, struct input_stream *in_stream,
> +                             int dry_run,
>                               time_t mtime, unsigned flags)
>  {
> -       int fd, ret;
> +       int fd, ret = 0;
>         unsigned char compressed[4096];
>         git_zstream stream;
>         git_hash_ctx c;
> @@ -1894,14 +1895,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>         loose_object_path(the_repository, &filename, oid);
>
> -       fd = create_tmpfile(&tmp_file, filename.buf);
> -       if (fd < 0) {
> -               if (flags & HASH_SILENT)
> -                       return -1;
> -               else if (errno == EACCES)
> -                       return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> -               else
> -                       return error_errno(_("unable to create temporary file"));
> +       if (!dry_run) {
> +               fd = create_tmpfile(&tmp_file, filename.buf);
> +               if (fd < 0) {
> +                       if (flags & HASH_SILENT)
> +                               return -1;
> +                       else if (errno == EACCES)
> +                               return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +                       else
> +                               return error_errno(_("unable to create temporary file"));
> +               }
>         }
>
>         /* Set it up */
> @@ -1925,7 +1928,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 unsigned char *in0 = stream.next_in;
>                 ret = git_deflate(&stream, Z_FINISH);
>                 the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> +               if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>                         die(_("unable to write loose object file"));
>                 stream.next_out = compressed;
>                 stream.avail_out = sizeof(compressed);
> @@ -1943,6 +1946,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 die(_("confused by unstable object source data for %s"),
>                     oid_to_hex(oid));
>
> +       if (dry_run)
> +               return 0;
> +
>         close_loose_object(fd);
>
>         if (mtime) {
> @@ -1996,7 +2002,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>                                   &hdrlen);
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 return 0;
> -       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
> +       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0, flags);
>  }
>
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -2023,7 +2029,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>                 goto cleanup;
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 goto cleanup;
> -       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> +       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0, 0);
>
>  cleanup:
>         free(header);
> @@ -2052,7 +2058,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>         data.buf = buf;
>         data.len = len;
>         hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
> +       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, mtime, 0);
>         free(buf);
>
>         return ret;
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
@ 2021-11-18  5:49   ` Jiang Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-11-18  5:49 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When read input stream, oid can't get before reading all, and it will be
> filled after reading.

Under what circumstances is the oid a null oid?  Can we get the oid
from "obj_list[nr].oid"?
See unpack_non_delta_entry() of builtin/unpack-objects.c.

> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 34 ++++++++++++++++++++++++++++++++--
>  1 file changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index b0838c847e..8393659f0d 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1893,7 +1893,13 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         const char *buf;
>         unsigned long len;
>
> -       loose_object_path(the_repository, &filename, oid);
> +       if (is_null_oid(oid)) {
> +               /* When oid is not determined, save tmp file to odb path. */
> +               strbuf_reset(&filename);
> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
> +               strbuf_addch(&filename, '/');
> +       } else
> +               loose_object_path(the_repository, &filename, oid);
>
>         if (!dry_run) {
>                 fd = create_tmpfile(&tmp_file, filename.buf);
> @@ -1942,7 +1948,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>                     ret);
>         the_hash_algo->final_oid_fn(&parano_oid, &c);
> -       if (!oideq(oid, &parano_oid))
> +       if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>                 die(_("confused by unstable object source data for %s"),
>                     oid_to_hex(oid));
>
> @@ -1951,6 +1957,30 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>         close_loose_object(fd);
>
> +       if (is_null_oid(oid)) {
> +               int dirlen;
> +
> +               /* copy oid */
> +               oidcpy((struct object_id *)oid, &parano_oid);
> +               /* We get the oid now */
> +               loose_object_path(the_repository, &filename, oid);
> +
> +               dirlen = directory_size(filename.buf);
> +               if (dirlen) {
> +                       struct strbuf dir = STRBUF_INIT;
> +                       /*
> +                        * Make sure the directory exists; note that the
> +                        * contents of the buffer are undefined after mkstemp
> +                        * returns an error, so we have to rewrite the whole
> +                        * buffer from scratch.
> +                        */
> +                       strbuf_reset(&dir);
> +                       strbuf_add(&dir, filename.buf, dirlen - 1);
> +                       if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +                               return -1;
> +               }
> +       }
> +
>         if (mtime) {
>                 struct utimbuf utb;
>                 utb.actime = mtime;
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v2 4/6] object-file.c: read input stream repeatedly in write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
@ 2021-11-18  5:56   ` Jiang Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-11-18  5:56 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Read input stream repeatedly in write_loose_object() unless reach the
> end, so that we can divide the large blob write into many small blocks.

In order to prepare the stream version of "write_loose_object()", we need ...

>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 8393659f0d..e333448c54 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1891,7 +1891,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         static struct strbuf tmp_file = STRBUF_INIT;
>         static struct strbuf filename = STRBUF_INIT;
>         const char *buf;
> -       unsigned long len;
> +       int flush = 0;
>
>         if (is_null_oid(oid)) {
>                 /* When oid is not determined, save tmp file to odb path. */
> @@ -1927,12 +1927,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> -       buf = in_stream->read(in_stream->data, &len);
> -       stream.next_in = (void *)buf;
> -       stream.avail_in = len;
>         do {
>                 unsigned char *in0 = stream.next_in;
> -               ret = git_deflate(&stream, Z_FINISH);
> +               if (!stream.avail_in) {
> +                       if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {

if ((buf = in_stream->read(in_stream->data, &stream.avail_in)) != NULL) {

Or split this long line into:

    buf = in_stream->read(in_stream->data, &stream.avail_in);
    if (buf) {

> +                               stream.next_in = (void *)buf;
> +                               in0 = (unsigned char *)buf;
> +                       } else
> +                               flush = Z_FINISH;

Add {} around this single line, see:

  https://github.com/git/git/blob/master/Documentation/CodingGuidelines#L279-L289

> +               }
> +               ret = git_deflate(&stream, flush);
>                 the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
>                 if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>                         die(_("unable to write loose object file"));
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-11-18  4:59   ` Jiang Xin
@ 2021-11-18  6:45     ` Junio C Hamano
  0 siblings, 0 replies; 165+ messages in thread
From: Junio C Hamano @ 2021-11-18  6:45 UTC (permalink / raw)
  To: Jiang Xin; +Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Jiang Xin <worldhello.net@gmail.com> writes:

> On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>>
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> It would be better to provide a cover letter describing changes in v2, such as:
>
> * Make "write_loose_object()" a public method, so we can
>    reuse it in "unpack_non_delta_entry()".
>    (But I doubt we can use "write_object_file_flags()" public
>      function, without make this change.)
>
> * Add an new interface "input_stream" as an argument for
>    "write_loose_object()", so that we can feed data to
>    "write_loose_object()" from buffer or from zlib stream.
>
>> Refactor write_loose_object() to support inputstream, in the same way
>> that zlib reading is chunked.
>
> In the beginning of your commit log, you should describe the problem, such as:
>
> We used to read the full content of a blob into buffer in
> "unpack_non_delta_entry()" by calling:
>
>     void *buf = get_data(size);
>
> This will consume lots of memory for a very big blob object.

I was not sure where "in_stream" came from---"use X instead of Y",
when X is what these patches invent and introduce, does not make a
good explanation without explaining what X is, what problem X is
attempting to solve and how.

Thanks for helping to clarify the proposed log message.  

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v2 6/6] unpack-objects: unpack large object in stream
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
@ 2021-11-18  7:14   ` Jiang Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-11-18  7:14 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", will allocate full memory for
> the whole size of the unpacked object and write the buffer to loose file
> on disk. This may lead to OOM for the git-unpack-objects process when
> unpacking a very large object.
>
> In function "unpack_delta_entry()", will also allocate full memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack large
> object to file in stream, and use "core.bigFileThreshold" to avoid OOM
> limits when called "get_data()".
>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          | 76 ++++++++++++++++++++++++-
>  t/t5590-receive-unpack-objects.sh | 92 +++++++++++++++++++++++++++++++
>  2 files changed, 167 insertions(+), 1 deletion(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..6c757d823b 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,85 @@ static void added_object(unsigned nr, enum object_type type,
>         }
>  }
>
> +struct input_data_from_zstream {
> +       git_zstream *zstream;
> +       unsigned char buf[4096];
> +       int status;
> +};
> +
> +static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
> +{
> +       struct input_data_from_zstream *input = data;
> +       git_zstream *zstream = input->zstream;
> +       void *in = fill(1);
> +
> +       if (!len || input->status == Z_STREAM_END) {
> +               *readlen = 0;
> +               return NULL;
> +       }
> +
> +       zstream->next_out = input->buf;
> +       zstream->avail_out = sizeof(input->buf);
> +       zstream->next_in = in;
> +       zstream->avail_in = len;
> +
> +       input->status = git_inflate(zstream, 0);
> +       use(len - zstream->avail_in);
> +       *readlen = sizeof(input->buf) - zstream->avail_out;
> +
> +       return (const char *)input->buf;
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +       char hdr[32];
> +       int hdrlen;
> +       git_zstream zstream;
> +       struct input_data_from_zstream data;
> +       struct input_stream in_stream = {
> +               .read = read_inflate_in_stream,
> +               .data = &data,
> +       };
> +       struct object_id *oid = &obj_list[nr].oid;
> +       int ret;
> +
> +       memset(&zstream, 0, sizeof(zstream));
> +       memset(&data, 0, sizeof(data));
> +       data.zstream = &zstream;
> +       git_inflate_init(&zstream);
> +
> +       /* Generate the header */
> +       hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> +
> +       if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
> +               die(_("failed to write object in stream %d"), ret);
> +
> +       if (zstream.total_out != size || data.status != Z_STREAM_END)
> +               die(_("inflate returned %d"), data.status);
> +       git_inflate_end(&zstream);
> +
> +       if (strict && !dry_run) {
> +               struct blob *blob = lookup_blob(the_repository, oid);
> +               if (blob)
> +                       blob->object.flags |= FLAG_WRITTEN;
> +               else
> +                       die("invalid blob object from stream");
> +       }
> +       obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>                                    unsigned nr)
>  {
> -       void *buf = get_data(size);
> +       void *buf;
> +
> +       /* Write large blob in stream without allocating full buffer. */
> +       if (type == OBJ_BLOB && size > big_file_threshold) {

The default size of big_file_threshold is 512m.  Can we use
"write_stream_blob" for all objects?  Could we determine a more suitable
threshold from some benchmark data?

> +               write_stream_blob(nr, size);
> +               return;
> +       }

--
Jiang Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v3 0/5] unpack large objects in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (8 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29  7:01   ` Han Xin
                     ` (6 more replies)
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
                   ` (4 subsequent siblings)
  14 siblings, 7 replies; 165+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Although we do not recommend users push large binary files to the git repositories, 
it's difficult to prevent them from doing so. Once, we found a problem with a surge 
in memory usage on the server. The source of the problem is that a user submitted 
a single object with a size of 15GB. Once someone initiates a git push, the git 
process will immediately allocate 15G of memory, resulting in an OOM risk.

Through further analysis, we found that when we execute git unpack-objects, in 
unpack_non_delta_entry(), "void *buf = get_data(size);" will directly allocate 
memory equal to the size of the object. This is quite a scary thing, because the 
pre-receive hook has not been executed at this time, so we cannot prevent this with hooks.

I got inspiration from the deflate process of zlib; maybe it would be a good idea 
to change unpack-objects to use streaming deflate.

Changes since v2:
* Rewrite commit messages and make changes suggested by Jiang Xin.
* Remove the commit "object-file.c: add dry_run mode for write_loose_object()" and
  use a new commit "unpack-objects.c: add dry_run mode for get_data()" instead.

Han Xin (5):
  object-file: refactor write_loose_object() to read buffer from stream
  object-file.c: handle undetermined oid in write_loose_object()
  object-file.c: read stream in a loop in write_loose_object()
  unpack-objects.c: add dry_run mode for get_data()
  unpack-objects: unpack_non_delta_entry() read data in a stream

 builtin/unpack-objects.c            | 92 +++++++++++++++++++++++++--
 object-file.c                       | 98 +++++++++++++++++++++++++----
 object-store.h                      |  9 +++
 t/t5590-unpack-non-delta-objects.sh | 76 ++++++++++++++++++++++
 4 files changed, 257 insertions(+), 18 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v2:
1:  01672f50a0 ! 1:  8640b04f6d object-file: refactor write_loose_object() to support inputstream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file: refactor write_loose_object() to support inputstream
    +    object-file: refactor write_loose_object() to read buffer from stream
     
    -    Refactor write_loose_object() to support inputstream, in the same way
    -    that zlib reading is chunked.
    +    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    +    entire contents of a blob object, no matter how big it is. This
    +    implementation may consume all the memory and cause OOM.
     
    -    Using "in_stream" instead of "void *buf", we needn't to allocate enough
    -    memory in advance, and only part of the contents will be read when
    -    called "in_stream.read()".
    +    This can be improved by feeding data to "write_loose_object()" in a
    +    stream. The input stream is implemented as an interface. In the first
    +    step, we make a simple implementation, feeding the entire buffer in the
    +    "stream" to "write_loose_object()" as a refactor.
     
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
      	return fd;
      }
      
    -+struct input_data_from_buffer {
    -+	const char *buf;
    ++struct simple_input_stream_data {
    ++	const void *buf;
     +	unsigned long len;
     +};
     +
    -+static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
    ++static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
     +{
    -+	struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
    ++	struct simple_input_stream_data *data = in_stream->data;
     +
    -+	if (input->len == 0) {
    ++	if (data->len == 0) {
     +		*len = 0;
     +		return NULL;
     +	}
    -+	*len = input->len;
    -+	input->len = 0;
    -+	return input->buf;
    ++	*len = data->len;
    ++	data->len = 0;
    ++	return data->buf;
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	struct object_id parano_oid;
      	static struct strbuf tmp_file = STRBUF_INIT;
      	static struct strbuf filename = STRBUF_INIT;
    -+	const char *buf;
    ++	const void *buf;
     +	unsigned long len;
      
      	loose_object_path(the_repository, &filename, oid);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	the_hash_algo->update_fn(&c, hdr, hdrlen);
      
      	/* Then the data itself.. */
    -+	buf = in_stream->read(in_stream->data, &len);
    ++	buf = in_stream->read(in_stream, &len);
      	stream.next_in = (void *)buf;
      	stream.avail_in = len;
      	do {
    @@ object-file.c: int write_object_file_flags(const void *buf, unsigned long len,
      	char hdr[MAX_HEADER_LEN];
      	int hdrlen = sizeof(hdr);
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    -+		.data = (void *)&(struct input_data_from_buffer) {
    ++		.read = feed_simple_input_stream,
    ++		.data = (void *)&(struct simple_input_stream_data) {
     +			.buf = buf,
     +			.len = len,
     +		},
    @@ object-file.c: int hash_object_file_literally(const void *buf, unsigned long len
      	char *header;
      	int hdrlen, status = 0;
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    -+		.data = (void *)&(struct input_data_from_buffer) {
    ++		.read = feed_simple_input_stream,
    ++		.data = (void *)&(struct simple_input_stream_data) {
     +			.buf = buf,
     +			.len = len,
     +		},
    @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime)
      	char hdr[MAX_HEADER_LEN];
      	int hdrlen;
      	int ret;
    -+	struct input_data_from_buffer data;
    ++	struct simple_input_stream_data data;
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    ++		.read = feed_simple_input_stream,
     +		.data = &data,
     +	};
      
    @@ object-store.h: struct object_directory {
      };
      
     +struct input_stream {
    -+	const char *(*read)(void* data, unsigned long *len);
    ++	const void *(*read)(struct input_stream *, unsigned long *len);
     +	void *data;
     +};
     +
2:  a309b7e391 < -:  ---------- object-file.c: add dry_run mode for write_loose_object()
3:  b0a5b53710 ! 2:  d4a2caf2bd object-file.c: handle nil oid in write_loose_object()
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: handle nil oid in write_loose_object()
    +    object-file.c: handle undetermined oid in write_loose_object()
     
    -    When read input stream, oid can't get before reading all, and it will be
    -    filled after reading.
    +    When streaming a large blob object to "write_loose_object()", we have no
    +    chance to run "write_object_file_prepare()" to calculate the oid in
    +    advance. So we need to handle undetermined oid in function
    +    "write_loose_object()".
    +
    +    In the original implementation, we know the oid and we can write the
    +    temporary file in the same directory as the final object, but for an
    +    object with an undetermined oid, we don't know the exact directory for
    +    the object, so we have to save the temporary file in ".git/objects/"
    +    directory instead.
     
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    - 	const char *buf;
    + 	const void *buf;
      	unsigned long len;
      
     -	loose_object_path(the_repository, &filename, oid);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     +		strbuf_reset(&filename);
     +		strbuf_addstr(&filename, the_repository->objects->odb->path);
     +		strbuf_addch(&filename, '/');
    -+	} else
    ++	} else {
     +		loose_object_path(the_repository, &filename, oid);
    ++	}
      
    - 	if (!dry_run) {
    - 		fd = create_tmpfile(&tmp_file, filename.buf);
    + 	fd = create_tmpfile(&tmp_file, filename.buf);
    + 	if (fd < 0) {
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
      		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
      		    ret);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      		die(_("confused by unstable object source data for %s"),
      		    oid_to_hex(oid));
      
    -@@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    - 
      	close_loose_object(fd);
      
     +	if (is_null_oid(oid)) {
     +		int dirlen;
     +
    -+		/* copy oid */
     +		oidcpy((struct object_id *)oid, &parano_oid);
    -+		/* We get the oid now */
     +		loose_object_path(the_repository, &filename, oid);
     +
    ++		/* We finally know the object path, and create the missing dir. */
     +		dirlen = directory_size(filename.buf);
     +		if (dirlen) {
     +			struct strbuf dir = STRBUF_INIT;
    -+			/*
    -+			 * Make sure the directory exists; note that the
    -+			 * contents of the buffer are undefined after mkstemp
    -+			 * returns an error, so we have to rewrite the whole
    -+			 * buffer from scratch.
    -+			 */
    -+			strbuf_reset(&dir);
     +			strbuf_add(&dir, filename.buf, dirlen - 1);
     +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
     +				return -1;
    ++			if (adjust_shared_perm(dir.buf))
    ++				return -1;
    ++			strbuf_release(&dir);
     +		}
     +	}
     +
4:  09d438b692 ! 3:  2575900449 object-file.c: read input stream repeatedly in write_loose_object()
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: read input stream repeatedly in write_loose_object()
    +    object-file.c: read stream in a loop in write_loose_object()
     
    -    Read input stream repeatedly in write_loose_object() unless reach the
    -    end, so that we can divide the large blob write into many small blocks.
    +    In order to prepare the stream version of "write_loose_object()", read
    +    the input stream in a loop in "write_loose_object()", so that we can
    +    feed the contents of large blob object to "write_loose_object()" using
    +    a small fixed buffer.
     
    +    Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
      	static struct strbuf tmp_file = STRBUF_INIT;
      	static struct strbuf filename = STRBUF_INIT;
    - 	const char *buf;
    + 	const void *buf;
     -	unsigned long len;
     +	int flush = 0;
      
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	the_hash_algo->update_fn(&c, hdr, hdrlen);
      
      	/* Then the data itself.. */
    --	buf = in_stream->read(in_stream->data, &len);
    +-	buf = in_stream->read(in_stream, &len);
     -	stream.next_in = (void *)buf;
     -	stream.avail_in = len;
      	do {
      		unsigned char *in0 = stream.next_in;
     -		ret = git_deflate(&stream, Z_FINISH);
     +		if (!stream.avail_in) {
    -+			if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {
    ++			buf = in_stream->read(in_stream, &stream.avail_in);
    ++			if (buf) {
     +				stream.next_in = (void *)buf;
     +				in0 = (unsigned char *)buf;
    -+			} else
    ++			} else {
     +				flush = Z_FINISH;
    ++			}
     +		}
     +		ret = git_deflate(&stream, flush);
      		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
    - 		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
    + 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
      			die(_("unable to write loose object file"));
5:  9fb188d437 < -:  ---------- object-store.h: add write_loose_object()
-:  ---------- > 4:  ca93ecc780 unpack-objects.c: add dry_run mode for get_data()
6:  80468a6fbc ! 5:  39a072ee2a unpack-objects: unpack large object in stream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    unpack-objects: unpack large object in stream
    +    unpack-objects: unpack_non_delta_entry() read data in a stream
     
    -    When calling "unpack_non_delta_entry()", will allocate full memory for
    -    the whole size of the unpacked object and write the buffer to loose file
    -    on disk. This may lead to OOM for the git-unpack-objects process when
    -    unpacking a very large object.
    +    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    +    entire contents of a blob object, no matter how big it is. This
    +    implementation may consume all the memory and cause OOM.
     
    -    In function "unpack_delta_entry()", will also allocate full memory to
    -    buffer the whole delta, but since there will be no delta for an object
    -    larger than "core.bigFileThreshold", this issue is moderate.
    +    By implementing a zstream version of input_stream interface, we can use
    +    a small fixed buffer for "unpack_non_delta_entry()".
     
    -    To resolve the OOM issue in "git-unpack-objects", we can unpack large
    -    object to file in stream, and use "core.bigFileThreshold" to avoid OOM
    -    limits when called "get_data()".
    +    However, unpack non-delta objects from a stream instead of from an entrie
    +    buffer will have 10% performance penalty. Therefore, only unpack object
    +    larger than the "big_file_threshold" in zstream. See the following
    +    benchmarks:
     
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -C dest.git unpack-objects <binary_320M.pack'
    +        Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
    +          Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
    +          Range (min … max):    9.786 s … 10.603 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    +        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
    +          Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
    +          Range (min … max):    9.884 s … 12.192 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -C dest.git unpack-objects <binary_96M.pack'
    +        Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
    +          Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
    +          Range (min … max):    2.639 s …  2.743 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    +        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
    +          Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
    +          Range (min … max):    2.679 s …  3.125 s    10 runs
    +
    +    Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## builtin/unpack-objects.c ##
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      	}
      }
      
    -+struct input_data_from_zstream {
    ++struct input_zstream_data {
     +	git_zstream *zstream;
     +	unsigned char buf[4096];
     +	int status;
     +};
     +
    -+static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
    ++static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
     +{
    -+	struct input_data_from_zstream *input = data;
    -+	git_zstream *zstream = input->zstream;
    ++	struct input_zstream_data *data = in_stream->data;
    ++	git_zstream *zstream = data->zstream;
     +	void *in = fill(1);
     +
    -+	if (!len || input->status == Z_STREAM_END) {
    ++	if (!len || data->status == Z_STREAM_END) {
     +		*readlen = 0;
     +		return NULL;
     +	}
     +
    -+	zstream->next_out = input->buf;
    -+	zstream->avail_out = sizeof(input->buf);
    ++	zstream->next_out = data->buf;
    ++	zstream->avail_out = sizeof(data->buf);
     +	zstream->next_in = in;
     +	zstream->avail_in = len;
     +
    -+	input->status = git_inflate(zstream, 0);
    ++	data->status = git_inflate(zstream, 0);
     +	use(len - zstream->avail_in);
    -+	*readlen = sizeof(input->buf) - zstream->avail_out;
    ++	*readlen = sizeof(data->buf) - zstream->avail_out;
     +
    -+	return (const char *)input->buf;
    ++	return data->buf;
     +}
     +
     +static void write_stream_blob(unsigned nr, unsigned long size)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	char hdr[32];
     +	int hdrlen;
     +	git_zstream zstream;
    -+	struct input_data_from_zstream data;
    ++	struct input_zstream_data data;
     +	struct input_stream in_stream = {
    -+		.read = read_inflate_in_stream,
    ++		.read = feed_input_zstream,
     +		.data = &data,
     +	};
     +	struct object_id *oid = &obj_list[nr].oid;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	/* Generate the header */
     +	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
     +
    -+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
    ++	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
     +		die(_("failed to write object in stream %d"), ret);
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      static void unpack_non_delta_entry(enum object_type type, unsigned long size,
      				   unsigned nr)
      {
    --	void *buf = get_data(size);
    +-	void *buf = get_data(size, dry_run);
     +	void *buf;
     +
     +	/* Write large blob in stream without allocating full buffer. */
    -+	if (type == OBJ_BLOB && size > big_file_threshold) {
    ++	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
     +		write_stream_blob(nr, size);
     +		return;
     +	}
      
    -+	buf = get_data(size);
    ++	buf = get_data(size, dry_run);
      	if (!dry_run && buf)
      		write_object(nr, type, buf, size);
      	else
     
    - ## t/t5590-receive-unpack-objects.sh (new) ##
    + ## object-file.c ##
    +@@ object-file.c: static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
    + 	return data->buf;
    + }
    + 
    +-static int write_loose_object(const struct object_id *oid, char *hdr,
    +-			      int hdrlen, struct input_stream *in_stream,
    +-			      time_t mtime, unsigned flags)
    ++int write_loose_object(const struct object_id *oid, char *hdr,
    ++		       int hdrlen, struct input_stream *in_stream,
    ++		       time_t mtime, unsigned flags)
    + {
    + 	int fd, ret;
    + 	unsigned char compressed[4096];
    +
    + ## object-store.h ##
    +@@ object-store.h: int hash_object_file(const struct git_hash_algo *algo, const void *buf,
    + 		     unsigned long len, const char *type,
    + 		     struct object_id *oid);
    + 
    ++int write_loose_object(const struct object_id *oid, char *hdr,
    ++		       int hdrlen, struct input_stream *in_stream,
    ++		       time_t mtime, unsigned flags);
    ++
    + int write_object_file_flags(const void *buf, unsigned long len,
    + 			    const char *type, struct object_id *oid,
    + 			    unsigned flags);
    +
    + ## t/t5590-unpack-non-delta-objects.sh (new) ##
     @@
     +#!/bin/sh
     +#
    @@ t/t5590-receive-unpack-objects.sh (new)
     +		cd .git &&
     +		find objects/?? -type f | sort
     +	) >expect &&
    -+	git repack -ad
    ++	PACK=$(echo main | git pack-objects --progress --revs test)
     +'
     +
     +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
    @@ t/t5590-receive-unpack-objects.sh (new)
     +	git -C dest.git config receive.unpacklimit 100
     +'
     +
    -+test_expect_success 'fail to push: cannot allocate' '
    -+	test_must_fail git push dest.git HEAD 2>err &&
    -+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
    ++test_expect_success 'fail to unpack-objects: cannot allocate' '
    ++	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    ++	test_i18ngrep "fatal: attempting to allocate" err &&
     +	(
     +		cd dest.git &&
     +		find objects/?? -type f | sort
    @@ t/t5590-receive-unpack-objects.sh (new)
     +'
     +
     +test_expect_success 'unpack big object in stream' '
    -+	git push dest.git HEAD &&
    ++	git -C dest.git unpack-objects <test-$PACK.pack &&
     +	git -C dest.git fsck &&
     +	(
     +		cd dest.git &&
    @@ t/t5590-receive-unpack-objects.sh (new)
     +'
     +
     +test_expect_success 'setup for unpack-objects dry-run test' '
    -+	PACK=$(echo main | git pack-objects --progress --revs test) &&
    -+	unset GIT_ALLOC_LIMIT &&
     +	git init --bare unpack-test.git
     +'
     +
    -+test_expect_success 'unpack-objects dry-run with large threshold' '
    -+	(
    -+		cd unpack-test.git &&
    -+		git config core.bigFileThreshold 2m &&
    -+		git unpack-objects -n <../test-$PACK.pack
    -+	) &&
    -+	(
    -+		cd unpack-test.git &&
    -+		find objects/ -type f
    -+	) >actual &&
    -+	test_must_be_empty actual
    -+'
    -+
    -+test_expect_success 'unpack-objects dry-run with small threshold' '
    ++test_expect_success 'unpack-objects dry-run' '
     +	(
     +		cd unpack-test.git &&
    -+		git config core.bigFileThreshold 1m &&
     +		git unpack-objects -n <../test-$PACK.pack
     +	) &&
     +	(
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (9 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-23 23:24   ` Junio C Hamano
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
                   ` (3 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface. In the first
step, we make a simple implementation, feeding the entire buffer in the
"stream" to "write_loose_object()" as a refactor.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  5 +++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index c3d866a287..227f53a0de 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct simple_input_stream_data {
+	const void *buf;
+	unsigned long len;
+};
+
+static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
+{
+	struct simple_input_stream_data *data = in_stream->data;
+
+	if (data->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = data->len;
+	data->len = 0;
+	return data->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const void *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct simple_input_stream_data data;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..ccc1fc9c1a 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (10 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29 15:10   ` Derrick Stolee
  2021-11-22  3:32 ` [PATCH v3 3/5] object-file.c: read stream in a loop " Han Xin
                   ` (2 subsequent siblings)
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance. So we need to handle an undetermined oid in the function
"write_loose_object()".

In the original implementation, we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in ".git/objects/"
directory instead.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 227f53a0de..78fd2a5d39 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const void *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else {
+		loose_object_path(the_repository, &filename, oid);
+	}
 
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
@@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		oidcpy((struct object_id *)oid, &parano_oid);
+		loose_object_path(the_repository, &filename, oid);
+
+		/* We finally know the object path, and create the missing dir. */
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+			if (adjust_shared_perm(dir.buf))
+				return -1;
+			strbuf_release(&dir);
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v3 3/5] object-file.c: read stream in a loop in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (11 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-22  3:32 ` [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  14 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In order to prepare the stream version of "write_loose_object()", read
the input stream in a loop in "write_loose_object()", so that we can
feed the contents of large blob object to "write_loose_object()" using
a small fixed buffer.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/object-file.c b/object-file.c
index 78fd2a5d39..93bcfaca50 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1890,7 +1890,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const void *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1925,12 +1925,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			buf = in_stream->read(in_stream, &stream.avail_in);
+			if (buf) {
+				stream.next_in = (void *)buf;
+				in0 = (unsigned char *)buf;
+			} else {
+				flush = Z_FINISH;
+			}
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (12 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 3/5] object-file.c: read stream in a loop " Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  14 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used to verify the inflation of data,
and the returned buffer will not be used at all and will be freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, allocate only a
small buffer when calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8d68acd662 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,16 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run ? 4096 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +125,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +329,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +363,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +402,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (13 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29 17:37   ` Derrick Stolee
  14 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an entire
buffer will have a 10% performance penalty. Therefore, only unpack objects
larger than the "big_file_threshold" in zstream. See the following
benchmarks:

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -C dest.git unpack-objects <binary_320M.pack'
    Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
      Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
      Range (min … max):    9.786 s … 10.603 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
      Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
      Range (min … max):    9.884 s … 12.192 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -C dest.git unpack-objects <binary_96M.pack'
    Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
      Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
      Range (min … max):    2.639 s …  2.743 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
      Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
      Range (min … max):    2.679 s …  3.125 s    10 runs

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c            | 76 ++++++++++++++++++++++++++++-
 object-file.c                       |  6 +--
 object-store.h                      |  4 ++
 t/t5590-unpack-non-delta-objects.sh | 76 +++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+), 4 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 8d68acd662..bfc254a236 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -326,11 +326,85 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[4096];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index 93bcfaca50..bd7631f7ef 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,9 +1878,9 @@ static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
 	return data->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index ccc1fc9c1a..cbd95c47e2 100644
--- a/object-store.h
+++ b/object-store.h
@@ -228,6 +228,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..01d950d119
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --progress --revs test)
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	test_i18ngrep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	(
+		cd unpack-test.git &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-11-23 23:24   ` Junio C Hamano
  2021-11-24  9:00     ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: Junio C Hamano @ 2021-11-23 23:24 UTC (permalink / raw)
  To: Han Xin; +Cc: Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Han Xin <chiyutianyi@gmail.com> writes:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "write_loose_object()" in a
> stream. The input stream is implemented as an interface. In the first
> step, we make a simple implementation, feeding the entire buffer in the
> "stream" to "write_loose_object()" as a refactor.

Possibly a stupid question (not a review).

How does this compare with "struct git_istream" implemented for a
few existing codepaths?  It seems that the existing users are
pack-objects, index-pack and archive and all of them use the
interface to obtain data given an object name without having to grab
everything in core at once.

If we are adding a new streaming interface to go in the opposite
direction, i.e. from the working tree data to object store, I would
understand it as a complementary interface (but then I suspect there
is a half of it already in bulk-checkin API), but I am not sure how
this new thing fits in the larger picture.



> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  5 +++++
>  2 files changed, 51 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index c3d866a287..227f53a0de 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> +struct simple_input_stream_data {
> +	const void *buf;
> +	unsigned long len;
> +};
> +
> +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> +{
> +	struct simple_input_stream_data *data = in_stream->data;
> +
> +	if (data->len == 0) {
> +		*len = 0;
> +		return NULL;
> +	}
> +	*len = data->len;
> +	data->len = 0;
> +	return data->buf;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
> -			      int hdrlen, const void *buf, unsigned long len,
> +			      int hdrlen, struct input_stream *in_stream,
>  			      time_t mtime, unsigned flags)
>  {
>  	int fd, ret;
> @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	struct object_id parano_oid;
>  	static struct strbuf tmp_file = STRBUF_INIT;
>  	static struct strbuf filename = STRBUF_INIT;
> +	const void *buf;
> +	unsigned long len;
>  
>  	loose_object_path(the_repository, &filename, oid);
>  
> @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	the_hash_algo->update_fn(&c, hdr, hdrlen);
>  
>  	/* Then the data itself.. */
> +	buf = in_stream->read(in_stream, &len);
>  	stream.next_in = (void *)buf;
>  	stream.avail_in = len;
>  	do {
> @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  {
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen = sizeof(hdr);
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +	};
>  
>  	/* Normally if we have it in the pack then we do not bother writing
>  	 * it out into .git/objects/??/?{38} file.
> @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  				  &hdrlen);
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		return 0;
> -	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> +	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
>  }
>  
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>  	char *header;
>  	int hdrlen, status = 0;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +	};
>  
>  	/* type string, SP, %lu of the length plus NUL must fit this */
>  	hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  		goto cleanup;
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		goto cleanup;
> -	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>  
>  cleanup:
>  	free(header);
> @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen;
>  	int ret;
> +	struct simple_input_stream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = &data,
> +	};
>  
>  	if (has_loose_object(oid))
>  		return 0;
>  	buf = read_object(the_repository, oid, &type, &len);
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
> +	data.buf = buf;
> +	data.len = len;
>  	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
>  	free(buf);
>  
>  	return ret;
> diff --git a/object-store.h b/object-store.h
> index 952efb6a4b..ccc1fc9c1a 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -34,6 +34,11 @@ struct object_directory {
>  	char *path;
>  };
>  
> +struct input_stream {
> +	const void *(*read)(struct input_stream *, unsigned long *len);
> +	void *data;
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>  	struct object_directory *, 1, fspathhash, fspatheq)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-23 23:24   ` Junio C Hamano
@ 2021-11-24  9:00     ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-11-24  9:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Junio C Hamano <gitster@pobox.com> writes:

>
> Han Xin <chiyutianyi@gmail.com> writes:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > This can be improved by feeding data to "write_loose_object()" in a
> > stream. The input stream is implemented as an interface. In the first
> > step, we make a simple implementation, feeding the entire buffer in the
> > "stream" to "write_loose_object()" as a refactor.
>
> Possibly a stupid question (not a review).
>
> How does this compare with "struct git_istream" implemented for a
> few existing codepaths?  It seems that the existing users are
> pack-objects, index-pack and archive and all of them use the
> interface to obtain data given an object name without having to grab
> everything in core at once.
>
> If we are adding a new streaming interface to go in the opposite
> direction, i.e. from the working tree data to object store, I would
> understand it as a complementary interface (but then I suspect there
> is a half of it already in bulk-checkin API), but I am not sure how
> this new thing fits in the larger picture.
>

Thank you for your reply.

Before starting to make this patch, I did consider whether I should
reuse "struct  git_istream" to solve the problem, but I found that in the
process of git unpack-objects, the data comes from stdin, and we
cannot get an oid in advance until the whole object data is read.
Also, we can't do "lseek()" on stdin to change the data reading position.

I compared the implementation of "bulk-checkin", and they do have
some similarities.
I think the difference in the reverse implementation is that we do not
always clearly know where the boundary of the target data is. For
example, in the process of "unpack-objects", the "buffer" has been
partially read after calling "fill()". And the "buffer" remaining after
reading cannot be discarded because it is the beginning of the next
object.
Perhaps "struct input_stream" can make some improvements to
"index_bulk_checkin()", so that it can read from an inner buffer in
addition to reading from "fd" if necessary.

>
>
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
> >  object-store.h |  5 +++++
> >  2 files changed, 51 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index c3d866a287..227f53a0de 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +struct simple_input_stream_data {
> > +     const void *buf;
> > +     unsigned long len;
> > +};
> > +
> > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> > +{
> > +     struct simple_input_stream_data *data = in_stream->data;
> > +
> > +     if (data->len == 0) {
> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +     *len = data->len;
> > +     data->len = 0;
> > +     return data->buf;
> > +}
> > +
> >  static int write_loose_object(const struct object_id *oid, char *hdr,
> > -                           int hdrlen, const void *buf, unsigned long len,
> > +                           int hdrlen, struct input_stream *in_stream,
> >                             time_t mtime, unsigned flags)
> >  {
> >       int fd, ret;
> > @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       struct object_id parano_oid;
> >       static struct strbuf tmp_file = STRBUF_INIT;
> >       static struct strbuf filename = STRBUF_INIT;
> > +     const void *buf;
> > +     unsigned long len;
> >
> >       loose_object_path(the_repository, &filename, oid);
> >
> > @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       the_hash_algo->update_fn(&c, hdr, hdrlen);
> >
> >       /* Then the data itself.. */
> > +     buf = in_stream->read(in_stream, &len);
> >       stream.next_in = (void *)buf;
> >       stream.avail_in = len;
> >       do {
> > @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
> >  {
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen = sizeof(hdr);
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +     };
> >
> >       /* Normally if we have it in the pack then we do not bother writing
> >        * it out into .git/objects/??/?{38} file.
> > @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
> >                                 &hdrlen);
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               return 0;
> > -     return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> > +     return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
> >  }
> >
> >  int hash_object_file_literally(const void *buf, unsigned long len,
> > @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >  {
> >       char *header;
> >       int hdrlen, status = 0;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +     };
> >
> >       /* type string, SP, %lu of the length plus NUL must fit this */
> >       hdrlen = strlen(type) + MAX_HEADER_LEN;
> > @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >               goto cleanup;
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               goto cleanup;
> > -     status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> > +     status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> >
> >  cleanup:
> >       free(header);
> > @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >       int ret;
> > +     struct simple_input_stream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = &data,
> > +     };
> >
> >       if (has_loose_object(oid))
> >               return 0;
> >       buf = read_object(the_repository, oid, &type, &len);
> >       if (!buf)
> >               return error(_("cannot read object for %s"), oid_to_hex(oid));
> > +     data.buf = buf;
> > +     data.len = len;
> >       hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> > -     ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> > +     ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
> >       free(buf);
> >
> >       return ret;
> > diff --git a/object-store.h b/object-store.h
> > index 952efb6a4b..ccc1fc9c1a 100644
> > --- a/object-store.h
> > +++ b/object-store.h
> > @@ -34,6 +34,11 @@ struct object_directory {
> >       char *path;
> >  };
> >
> > +struct input_stream {
> > +     const void *(*read)(struct input_stream *, unsigned long *len);
> > +     void *data;
> > +};
> > +
> >  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
> >       struct object_directory *, 1, fspathhash, fspatheq)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
@ 2021-11-29  7:01   ` Han Xin
  2021-11-29 19:12     ` Jeff King
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-29  7:01 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

Han Xin <chiyutianyi@gmail.com> writes:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Although we do not recommend users push large binary files to the git repositories,
> it's difficult to prevent them from doing so. Once, we found a problem with a surge
> in memory usage on the server. The source of the problem is that a user submitted
> a single object with a size of 15GB. Once someone initiates a git push, the git
> process will immediately allocate 15G of memory, resulting in an OOM risk.
>
> Through further analysis, we found that when we execute git unpack-objects, in
> unpack_non_delta_entry(), "void *buf = get_data(size);" will directly allocate
> memory equal to the size of the object. This is quite a scary thing, because the
> pre-receive hook has not been executed at this time, and we cannot avoid this by hooks.
>
> I got inspiration from the deflate process of zlib, maybe it would be a good idea
> to change unpack-objects to stream deflate.
>

Hi, Jeff.

I hope you can share with me how Github solves this problem.

As you said in your reply at:
https://lore.kernel.org/git/YVaw6agcPNclhws8@coredump.intra.peff.net/
"we don't have a match in unpack-objects, but we always run index-pack
on incoming packs".

In the original implementation of "index-pack", for objects larger than
big_file_threshold, "fixed_buf" with a size of 8192 will be used to
complete the calculation of "oid".

I tried the implementation in jk/no-more-unpack-objects, as you noted:
  /* XXX This will expand too-large objects! */
  if (!data)
          data = new_data = get_data_from_pack(obj_entry);
If the conditions of --unpack are given, there will be risks here.
When I create an object larger than 1GB and execute index-pack, the
result is as follows:
  $ GIT_ALLOC_LIMIT=1024m git index-pack --unpack --stdin <large.pack
  fatal: attempting to allocate 1228800001 over limit 1073741824

Looking forward to your reply.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-11-29 15:10   ` Derrick Stolee
  2021-11-29 20:44     ` Junio C Hamano
  0 siblings, 1 reply; 165+ messages in thread
From: Derrick Stolee @ 2021-11-29 15:10 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley
  Cc: Han Xin

On 11/21/2021 10:32 PM, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle undetermined oid in function
> "write_loose_object()".
> 
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in ".git/objects/"
> directory instead.

My first reaction is to not write into .git/objects/ directly, but
instead make a .git/objects/tmp/ directory and write within that
directory. The idea is to prevent leaving stale files in the
.git/objects/ directory if the process terminates strangely (say,
a power outage or segfault).

If this was an interesting idea to pursue, it does leave a question:
should we clean up the tmp/ directory when it is empty? That would
require adding a check in finalize_object_file() that is probably
best left unchecked (the lstat() would add a cost per loose object
write that is probably too costly). I would rather leave an empty
tmp/ directory than add that cost per loose object write.

I suppose another way to do it would be to register the check as
an event at the end of the process, so we only check once, and
that only happens if we created a loose object with this streaming
method.

With all of these complications in mind, I think cleaning up the
stale tmp/ directory could (at the very least) be delayed to another
commit or patch series. Hopefully adding the directory is not too
much complication to add here.

> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);
> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');

Here, you could instead of the strbuf_addch() do

	strbuf_add(&filename, "/tmp/", 5);
	if (safe_create_leading_directories(filename.buf)) {
		error(_("failed to create '%s'"));
		strbuf_release(&filename);
		return -1;
	}		

> +	} else {
> +		loose_object_path(the_repository, &filename, oid);
> +	}
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
> @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>  
>  	close_loose_object(fd);
>  
> +	if (is_null_oid(oid)) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);
> +		loose_object_path(the_repository, &filename, oid);
> +
> +		/* We finally know the object path, and create the missing dir. */
> +		dirlen = directory_size(filename.buf);
> +		if (dirlen) {
> +			struct strbuf dir = STRBUF_INIT;
> +			strbuf_add(&dir, filename.buf, dirlen - 1);
> +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +				return -1;
> +			if (adjust_shared_perm(dir.buf))
> +				return -1;
> +			strbuf_release(&dir);
> +		}
> +	}
> +

Upon first reading I was asking "where is the file rename?" but
it is part of finalize_object_file() which is called further down.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-11-29 17:37   ` Derrick Stolee
  2021-11-30 13:49     ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: Derrick Stolee @ 2021-11-29 17:37 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley
  Cc: Han Xin

On 11/21/2021 10:32 PM, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
> 
> By implementing a zstream version of input_stream interface, we can use
> a small fixed buffer for "unpack_non_delta_entry()".
> 
> However, unpack non-delta objects from a stream instead of from an entire
> buffer will have 10% performance penalty. Therefore, only unpack object
> larger than the "big_file_threshold" in zstream. See the following
> benchmarks:
> 
>     $ hyperfine \
>     --prepare 'rm -rf dest.git && git init --bare dest.git' \
>     'git -C dest.git unpack-objects <binary_320M.pack'
>     Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
>       Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
>       Range (min … max):    9.786 s … 10.603 s    10 runs
> 
>     $ hyperfine \
>     --prepare 'rm -rf dest.git && git init --bare dest.git' \
>     'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
>     Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
>       Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
>       Range (min … max):    9.884 s … 12.192 s    10 runs

It seems that you want us to compare this pair of results, and
hyperfine can assist with that by including multiple benchmarks
(with labels, using '-n') as follows:

$ hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
        -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
        -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'

Benchmark 1: old
  Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
  Range (min … max):   20.741 s … 20.909 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
  Range (min … max):   26.419 s … 26.611 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
  Range (min … max):   26.416 s … 26.739 s    10 runs
 
Summary
  'old' ran
    1.27 ± 0.00 times faster than 'new'
    1.27 ± 0.01 times faster than 'new (small threshold)'

(Here, 'old' is testing a compiled version of the latest 'master'
branch, while 'new' has your patches applied on top.)

Notice from this example I had a pack with many small objects (mostly
commits and trees) and I see that this change introduces significant
overhead to this case.

It would be nice to understand this overhead and fix it before taking
this change any further.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-29  7:01   ` Han Xin
@ 2021-11-29 19:12     ` Jeff King
  2021-11-30  2:57       ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: Jeff King @ 2021-11-29 19:12 UTC (permalink / raw)
  To: Han Xin; +Cc: Junio C Hamano, Git List, Jiang Xin, Philip Oakley, Han Xin

On Mon, Nov 29, 2021 at 03:01:47PM +0800, Han Xin wrote:

> Han Xin <chiyutianyi@gmail.com> writes:
> >
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > Although we do not recommend users push large binary files to the git repositories,
> > it's difficult to prevent them from doing so. Once, we found a problem with a surge
> > in memory usage on the server. The source of the problem is that a user submitted
> > a single object with a size of 15GB. Once someone initiates a git push, the git
> > process will immediately allocate 15G of memory, resulting in an OOM risk.
> >
> > Through further analysis, we found that when we execute git unpack-objects, in
> > unpack_non_delta_entry(), "void *buf = get_data(size);" will directly allocate
> > memory equal to the size of the object. This is quite a scary thing, because the
> > pre-receive hook has not been executed at this time, and we cannot avoid this by hooks.
> >
> > I got inspiration from the deflate process of zlib, maybe it would be a good idea
> > to change unpack-objects to stream deflate.
> >
> 
> Hi, Jeff.
> 
> I hope you can share with me how Github solves this problem.
> 
> As you said in your reply at:
> https://lore.kernel.org/git/YVaw6agcPNclhws8@coredump.intra.peff.net/
> "we don't have a match in unpack-objects, but we always run index-pack
> on incoming packs".
> 
> In the original implementation of "index-pack", for objects larger than
> big_file_threshold, "fixed_buf" with a size of 8192 will be used to
> complete the calculation of "oid".

We set transfer.unpackLimit to "1", so we never run unpack-objects at
all. We always run index-pack, and every push, no matter how small,
results in a pack.

We also set GIT_ALLOC_LIMIT to limit any single allocation. We also have
custom code in index-pack to detect large objects (where our definition
of "large" is 100MB by default):

  - for large blobs, we do index it as normal, writing the oid out to a
    file which is then processed by a pre-receive hook (since people
    often push up large files accidentally, the hook generates a nice
    error message, including finding the path at which the blob is
    referenced)

  - for other large objects, we die immediately (with an error message).
    100MB commit messages aren't a common user error, and it closes off
    a whole set of possible integer-overflow parsing attacks (e.g.,
    index-pack in strict-mode will run every tree through fsck_tree(),
    so there's otherwise nothing stopping you from having a 4GB filename
    in a tree).

> I tried the implementation in jk/no-more-unpack-objects, as you noted:
>   /* XXX This will expand too-large objects! */
>   if (!data)
>   data = new_data = get_data_from_pack(obj_entry);
> If the conditions of --unpack are given, there will be risks here.
> When I create an object larger than 1GB and execute index-pack, the
> result is as follows:
>   $GIT_ALLOC_LIMIT=1024m git index-pack --unpack --stdin <large.pack
>   fatal: attempting to allocate 1228800001 over limit 1073741824

Yeah, that issue was one of the reasons I never sent the "index-pack
--unpack" code to the list. We don't actually use those patches at
GitHub. It was something I was working on for upstream but never
finished.

-Peff

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 15:10   ` Derrick Stolee
@ 2021-11-29 20:44     ` Junio C Hamano
  2021-11-29 22:18       ` Derrick Stolee
  0 siblings, 1 reply; 165+ messages in thread
From: Junio C Hamano @ 2021-11-29 20:44 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Derrick Stolee <stolee@gmail.com> writes:

> My first reaction is to not write into .git/objects/ directly, but
> instead make a .git/objects/tmp/ directory and write within that
> directory. The idea is to prevent leaving stale files in the
> .git/objects/ directory if the process terminates strangely (say,
> a power outage or segfault).

Even if we know the name of the object we are writing beforehand, I
do not think it is a good idea to open-write-close the final object
file.  The approach we already use everywhere is to write into a
tmpfile/lockfile and rename it to the final name.

object-file.c::write_loose_object() uses create_tmpfile() to prepare
a temporary file whose name begins with "tmp_obj_", so that "gc" can
recognize stale ones and remove them.

> If this was an interesting idea to pursue, it does leave a question:
> should we clean up the tmp/ directory when it is empty? That would
> require adding a check in finalize_object_file() that is probably
> best left unchecked (the lstat() would add a cost per loose object
> write that is probably too costly). I would rather leave an empty
> tmp/ directory than add that cost per loose object write.

I am not sure why we want a new tmp/ directory.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 20:44     ` Junio C Hamano
@ 2021-11-29 22:18       ` Derrick Stolee
  2021-11-30  3:23         ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: Derrick Stolee @ 2021-11-29 22:18 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On 11/29/2021 3:44 PM, Junio C Hamano wrote:
> Derrick Stolee <stolee@gmail.com> writes:
> 
>> My first reaction is to not write into .git/objects/ directly, but
>> instead make a .git/objects/tmp/ directory and write within that
>> directory. The idea is to prevent leaving stale files in the
>> .git/objects/ directory if the process terminates strangely (say,
>> a power outage or segfault).
> 
> Even if we know the name of the object we are writing beforehand, I
> do not think it is a good idea to open-write-close the final object
> file.  The approach we already use everywhere is to write into a
> tmpfile/lockfile and rename it to the final name 
> 
> object-file.c::write_loose_object() uses create_tmpfile() to prepare
> a temporary file whose name begins with "tmp_obj_", so that "gc" can
> recognize stale ones and remove them.

The only difference is that the tmp_obj_* file would go into the
loose object directory corresponding to the first two hex characters
of the OID, but that no longer happens now.
 
>> If this was an interesting idea to pursue, it does leave a question:
>> should we clean up the tmp/ directory when it is empty? That would
>> require adding a check in finalize_object_file() that is probably
>> best left unchecked (the lstat() would add a cost per loose object
>> write that is probably too costly). I would rather leave an empty
>> tmp/ directory than add that cost per loose object write.
> 
> I am not sure why we want a new tmp/ directory.

I'm just thinking of a case where this fails repeatedly I would
rather have those failed tmp_obj_* files isolated in their own
directory. It's an extremely minor point, so I'm fine to drop
the recommendation.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-29 19:12     ` Jeff King
@ 2021-11-30  2:57       ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-11-30  2:57 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Git List, Jiang Xin, Philip Oakley, Han Xin

On Tue, Nov 30, 2021 at 3:12 AM Jeff King <peff@peff.net> wrote:
> We set transfer.unpackLimit to "1", so we never run unpack-objects at
> all. We always run index-pack, and every push, no matter how small,
> results in a pack.
>
> We also set GIT_ALLOC_LIMIT to limit any single allocation. We also have
> custom code in index-pack to detect large objects (where our definition
> of "large" is 100MB by default):
>
>   - for large blobs, we do index it as normal, writing the oid out to a
>     file which is then processed by a pre-receive hook (since people
>     often push up large files accidentally, the hook generates a nice
>     error message, including finding the path at which the blob is
>     referenced)
>
>   - for other large objects, we die immediately (with an error message).
>     100MB commit messages aren't a common user error, and it closes off
>     a whole set of possible integer-overflow parsing attacks (e.g.,
>     index-pack in strict-mode will run every tree through fsck_tree(),
>     so there's otherwise nothing stopping you from having a 4GB filename
>     in a tree).

Thank you very much for sharing.

The way Github handles it reminds me of what Shawn Pearce introduced in
"Scaling up JGit". I guess "multi-pack-index" and "bitmap" must play an
important role in this.

I will seriously consider this solution, thanks a lot.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 22:18       ` Derrick Stolee
@ 2021-11-30  3:23         ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-11-30  3:23 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Tue, Nov 30, 2021 at 6:18 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 11/29/2021 3:44 PM, Junio C Hamano wrote:
> > Derrick Stolee <stolee@gmail.com> writes:
> >
> >> My first reaction is to not write into .git/objects/ directly, but
> >> instead make a .git/objects/tmp/ directory and write within that
> >> directory. The idea is to prevent leaving stale files in the
> >> .git/objects/ directory if the process terminates strangely (say,
> >> a power outage or segfault).
> >
> > Even if we know the name of the object we are writing beforehand, I
> > do not think it is a good idea to open-write-close the final object
> > file.  The approach we already use everywhere is to write into a
> > tmpfile/lockfile and rename it to the final name
> >
> > object-file.c::write_loose_object() uses create_tmpfile() to prepare
> > a temporary file whose name begins with "tmp_obj_", so that "gc" can
> > recognize stale ones and remove them.
>
> The only difference is that the tmp_obj_* file would go into the
> loose object directory corresponding to the first two hex characters
> of the OID, but that no longer happens now.
>

At the beginning of this patch, I did save the temporary object in a
two hex characters directory of "null_oid", but this is also a very
strange behavior. "Gc" will indeed clean up these tmp_obj_* files, no
matter if they are in .git/objects/ or .git/objects/xx.

Thanks,
-Han Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-29 17:37   ` Derrick Stolee
@ 2021-11-30 13:49     ` Han Xin
  2021-11-30 18:38       ` Derrick Stolee
  0 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-11-30 13:49 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Tue, Nov 30, 2021 at 1:37 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 11/21/2021 10:32 PM, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > By implementing a zstream version of input_stream interface, we can use
> > a small fixed buffer for "unpack_non_delta_entry()".
> >
> > However, unpack non-delta objects from a stream instead of from an entire
> > buffer will have 10% performance penalty. Therefore, only unpack object
> > larger than the "big_file_threshold" in zstream. See the following
> > benchmarks:
> >
> >     $ hyperfine \
> >     --prepare 'rm -rf dest.git && git init --bare dest.git' \
> >     'git -C dest.git unpack-objects <binary_320M.pack'
> >     Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
> >       Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
> >       Range (min … max):    9.786 s … 10.603 s    10 runs
> >
> >     $ hyperfine \
> >     --prepare 'rm -rf dest.git && git init --bare dest.git' \
> >     'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
> >     Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
> >       Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
> >       Range (min … max):    9.884 s … 12.192 s    10 runs
>
> It seems that you want us to compare this pair of results, and
> hyperfine can assist with that by including multiple benchmarks
> (with labels, using '-n') as follows:
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
>   Range (min … max):   20.741 s … 20.909 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
>   Range (min … max):   26.419 s … 26.611 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
>   Range (min … max):   26.416 s … 26.739 s    10 runs
>
> Summary
>   'old' ran
>     1.27 ± 0.00 times faster than 'new'
>     1.27 ± 0.01 times faster than 'new (small threshold)'
>
> (Here, 'old' is testing a compiled version of the latest 'master'
> branch, while 'new' has your patches applied on top.)
>
> Notice from this example I had a pack with many small objects (mostly
> commits and trees) and I see that this change introduces significant
> overhead to this case.
>
> It would be nice to understand this overhead and fix it before taking
> this change any further.
>
> Thanks,
> -Stolee

Can you show me the specific information of the repository you
tested, so that I can analyze it further.

I tested this repository, but did not encounter the problem:

 Unpacking objects: 100% (18345/18345), 43.15 MiB

hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' 'git -C dest.git unpack-objects <big.pack' \
        -n 'new' 'new/git -C dest.git unpack-objects <big.pack' \
        -n 'new (small threshold)' 'new/git -c
core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
Benchmark 1: old
  Time (mean ± σ):     17.403 s ±  0.880 s    [User: 4.996 s, System: 11.803 s]
  Range (min … max):   15.911 s … 19.368 s    10 runs

Benchmark 2: new
  Time (mean ± σ):     17.788 s ±  0.199 s    [User: 5.054 s, System: 12.257 s]
  Range (min … max):   17.420 s … 18.195 s    10 runs

Benchmark 3: new (small threshold)
  Time (mean ± σ):     18.433 s ±  0.711 s    [User: 4.982 s, System: 12.338 s]
  Range (min … max):   17.518 s … 19.775 s    10 runs

Summary
  'old' ran
    1.02 ± 0.05 times faster than 'new'
    1.06 ± 0.07 times faster than 'new (small threshold)'

Thanks,
- Han Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-30 13:49     ` Han Xin
@ 2021-11-30 18:38       ` Derrick Stolee
  2021-12-01 20:37         ` "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...]) Ævar Arnfjörð Bjarmason
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  0 siblings, 2 replies; 165+ messages in thread
From: Derrick Stolee @ 2021-11-30 18:38 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On 11/30/2021 8:49 AM, Han Xin wrote:
> On Tue, Nov 30, 2021 at 1:37 AM Derrick Stolee <stolee@gmail.com> wrote:
>> $ hyperfine \
>>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>>
>> Benchmark 1: old
>>   Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
>>   Range (min … max):   20.741 s … 20.909 s    10 runs
>>
>> Benchmark 2: new
>>   Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
>>   Range (min … max):   26.419 s … 26.611 s    10 runs
>>
>> Benchmark 3: new (small threshold)
>>   Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
>>   Range (min … max):   26.416 s … 26.739 s    10 runs
>>
>> Summary
>>   'old' ran
>>     1.27 ± 0.00 times faster than 'new'
>>     1.27 ± 0.01 times faster than 'new (small threshold)'
>>
>> (Here, 'old' is testing a compiled version of the latest 'master'
>> branch, while 'new' has your patches applied on top.)
>>
>> Notice from this example I had a pack with many small objects (mostly
>> commits and trees) and I see that this change introduces significant
>> overhead to this case.
>>
>> It would be nice to understand this overhead and fix it before taking
>> this change any further.
>>
>> Thanks,
>> -Stolee
> 
> Can you show me the specific information of the repository you
> tested, so that I can analyze it further.

I used a pack-file from an internal repo. It happened to be using
partial clone, so here is a repro with the git/git repository
after cloning this way:

$ git clone --no-checkout --filter=blob:none https://github.com/git/git

(copy the large .pack from git/.git/objects/pack/ to big.pack)

$ hyperfine \
	--prepare 'rm -rf dest.git && git init --bare dest.git' \
	-n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
	-n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
	-n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'

Benchmark 1: old
  Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
  Range (min … max):   82.042 s … 83.587 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
  Range (min … max):   100.866 s … 102.633 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
  Range (min … max):   100.639 s … 101.375 s    10 runs
 
Summary
  'old' ran
    1.22 ± 0.01 times faster than 'new (small threshold)'
    1.23 ± 0.01 times faster than 'new'

I'm also able to repro this with a smaller repo (microsoft/scalar)
so the tests complete much faster:

$ hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <small.pack' \
        -n 'new' '~/_git/git/git -C dest.git unpack-objects <small.pack' \
        -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <small.pack'

Benchmark 1: old
  Time (mean ± σ):      3.295 s ±  0.023 s    [User: 1.063 s, System: 2.228 s]
  Range (min … max):    3.269 s …  3.351 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):      3.592 s ±  0.105 s    [User: 1.261 s, System: 2.328 s]
  Range (min … max):    3.378 s …  3.679 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):      3.584 s ±  0.144 s    [User: 1.241 s, System: 2.339 s]
  Range (min … max):    3.359 s …  3.747 s    10 runs
 
Summary
  'old' ran
    1.09 ± 0.04 times faster than 'new (small threshold)'
    1.09 ± 0.03 times faster than 'new'

It's not the same relative overhead, but still significant.

These pack-files contain (mostly) small objects, no large blobs.
I know that's not the target of your efforts, but it would be
good to avoid a regression here.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 165+ messages in thread

* "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...])
  2021-11-30 18:38       ` Derrick Stolee
@ 2021-12-01 20:37         ` Ævar Arnfjörð Bjarmason
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  1 sibling, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-01 20:37 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Han Xin, David Peter


I hadn't sent a shameless plug for my "git hyperfine" script to the
list, perhaps this is a good time. It's just a thin shellscript wrapper
around "hyperfine" that I wrote the other day, which...

On Tue, Nov 30 2021, Derrick Stolee wrote:

> [...]
> I used a pack-file from an internal repo. It happened to be using
> partial clone, so here is a repro with the git/git repository
> after cloning this way:
>
> $ git clone --no-checkout --filter=blob:none https://github.com/git/git
>
> (copy the large .pack from git/.git/objects/pack/ to big.pack)
>
> $ hyperfine \
> 	--prepare 'rm -rf dest.git && git init --bare dest.git' \
> 	-n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
> 	-n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
> 	-n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
>   Range (min … max):   82.042 s … 83.587 s    10 runs
>  
> Benchmark 2: new
>   Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
>   Range (min … max):   100.866 s … 102.633 s    10 runs
>  
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
>   Range (min … max):   100.639 s … 101.375 s    10 runs
>  
> Summary
>   'old' ran
>     1.22 ± 0.01 times faster than 'new (small threshold)'
>     1.23 ± 0.01 times faster than 'new'

...adds enough sugar around "hyperfine" itself to do this as e.g. (the
"-s" is a feature I submitted to hyperfine itself, it's not in a release
yet[1], but in this case you could also use "-p"):

    git hyperfine -L rev v2.20.0,origin/master \
        -s 'if ! test -d redis.git; then git clone --bare --filter=blob:none https://github.com/redis/redis; fi && make' \
        -p 'rm -rf dest.git; git init --bare dest.git' \
        './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)'

The sugar being that for each named "rev" parameter it'll set up "git
worktree" for you, so under the hood each of those is chdir-ing to the
respective revision of:
    
    $ git worktree list
    [...]
    /run/user/1001/git-hyperfine/origin/master  abe6bb39053 (detached HEAD)
    /run/user/1001/git-hyperfine/v2.33.0        225bc32a989 (detached HEAD)

That they're named revisions and not git-rev-parse'd is intentional,
since you'll benefit from faster incremental "make" (even if using
"ccache"). I'm typically benchmarking HEAD~1,HEAD~0.

The output will then use those "rev" parameters, and be e.g.:
    
    Benchmark 1: ./git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'v2.20.0
      Time (mean ± σ):      6.678 s ±  0.046 s    [User: 4.525 s, System: 2.117 s]
      Range (min … max):    6.619 s …  6.765 s    10 runs
     
    Benchmark 2: ./git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'origin/master
      Time (mean ± σ):      6.756 s ±  0.074 s    [User: 4.586 s, System: 2.134 s]
      Range (min … max):    6.691 s …  6.941 s    10 runs
     
    Summary
      './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'v2.20.0' ran
        1.01 ± 0.01 times faster than './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'origin/master'

I think if you're routinely benchmarking N different git versions you'll
find it handy, it also has configurable hook support (using git config),
so e.g. it's easy to copy your config.mak in-place in the
worktrees. E.g. my config is:

    $ git -P config --get-regexp '^hyperfine'
    hyperfine.run-dir $XDG_RUNTIME_DIR/git-hyperfine
    hyperfine.xargs-options -r
    hyperfine.hook.setup ~/g/git.meta/config.mak.sh

It's hosted at https://github.com/avar/git-hyperfine/ and
https://gitlab.com/avar/git-hyperfine/; It's implemented in (portable)
POSIX shell script.

There's surely some bugs in it, one known one is that unlike hyperfine
it doesn't accept there being spaces in the parameters to -L, because
I'm screwing up some quoting-within-quoting in the (shellscript)
implementation (suggestions for that particular one most welcome).

I hacked it up after this suggestion from Jeff King[2] of moving t/perf
over to it.

I haven't done any of that legwork, but I think a wrapper like
"git-hyperfine" that prepares worktrees for the N revisions we're
benchmarking is a good direction to go in.

We don't use git-worktrees in t/perf, but probably could for most/all
tests. In any case it would be easy to have the script setup the revs to
be benchmarked in some hookable custom manner to have it do exactly what
t/perf/run is doing now.

1. https://github.com/sharkdp/hyperfine/commit/017d55a
2. https://lore.kernel.org/git/YV+zFqi4VmBVJYex@coredump.intra.peff.net/

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-30 18:38       ` Derrick Stolee
  2021-12-01 20:37         ` "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...]) Ævar Arnfjörð Bjarmason
@ 2021-12-02  7:33         ` Han Xin
  2021-12-02 13:53           ` Derrick Stolee
  1 sibling, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-02  7:33 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On Wed, Dec 1, 2021 at 2:38 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> I used a pack-file from an internal repo. It happened to be using
> partial clone, so here is a repro with the git/git repository
> after cloning this way:
>
> $ git clone --no-checkout --filter=blob:none https://github.com/git/git
>
> (copy the large .pack from git/.git/objects/pack/ to big.pack)
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
>   Range (min … max):   82.042 s … 83.587 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
>   Range (min … max):   100.866 s … 102.633 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
>   Range (min … max):   100.639 s … 101.375 s    10 runs
>
> Summary
>   'old' ran
>     1.22 ± 0.01 times faster than 'new (small threshold)'
>     1.23 ± 0.01 times faster than 'new'
>
> I'm also able to repro this with a smaller repo (microsoft/scalar)
> so the tests complete much faster:
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <small.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <small.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <small.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):      3.295 s ±  0.023 s    [User: 1.063 s, System: 2.228 s]
>   Range (min … max):    3.269 s …  3.351 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):      3.592 s ±  0.105 s    [User: 1.261 s, System: 2.328 s]
>   Range (min … max):    3.378 s …  3.679 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):      3.584 s ±  0.144 s    [User: 1.241 s, System: 2.339 s]
>   Range (min … max):    3.359 s …  3.747 s    10 runs
>
> Summary
>   'old' ran
>     1.09 ± 0.04 times faster than 'new (small threshold)'
>     1.09 ± 0.03 times faster than 'new'
>
> It's not the same relative overhead, but still significant.
>
> These pack-files contain (mostly) small objects, no large blobs.
> I know that's not the target of your efforts, but it would be
> good to avoid a regression here.
>
> Thanks,
> -Stolee

With your help, I did catch this performance problem, which was
introduced in this patch:
https://lore.kernel.org/git/20211122033220.32883-4-chiyutianyi@gmail.com/

This patch changes the original data reading into stream reading, but
its problem is that even for the original reading of the whole object data,
it still generates an additional git_deflate() and subsequent transfer.

I will fix it in a follow-up patch.

Thanks,
-Han Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-02 13:53           ` Derrick Stolee
  0 siblings, 0 replies; 165+ messages in thread
From: Derrick Stolee @ 2021-12-02 13:53 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On 12/2/2021 2:33 AM, Han Xin wrote:
> On Wed, Dec 1, 2021 at 2:38 AM Derrick Stolee <stolee@gmail.com> wrote:
>> These pack-files contain (mostly) small objects, no large blobs.
>> I know that's not the target of your efforts, but it would be
>> good to avoid a regression here.
>>
>> Thanks,
>> -Stolee
> 
> With your help, I did catch this performance problem, which was
> introduced in this patch:
> https://lore.kernel.org/git/20211122033220.32883-4-chiyutianyi@gmail.com/
> 
> This patch changes the original data reading into stream reading, but
> its problem is that even for the original reading of the whole object data,
> it still generates an additional git_deflate() and subsequent transfer.

I'm glad you found it!

> I will fix it in a follow-up patch.

Looking forward to it.

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v4 0/5] unpack large objects in stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
  2021-11-29  7:01   ` Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-07 16:18     ` Derrick Stolee
                       ` (7 more replies)
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
                     ` (4 subsequent siblings)
  6 siblings, 8 replies; 165+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v3:
* Add "size" to "struct input_stream" which is used by the following commits.

* Increase the buffer size of "struct input_zstream_data" from 4096 to
  8192, which is consistent with the "fixed_buf" in the "index-pack.c".

* Refactor "read stream in a loop in write_loose_object()" which
  introduced a performance problem reported by Derrick Stolee[1].

* Rewrite benchmarks in "unpack-objects: unpack_non_delta_entry() read
  data in a stream" with suggestions by Derrick Stolee[1] and
  Ævar Arnfjörð Bjarmason[2]. 
  Now use "scalar.git" to benchmark, which contains more than 28000
  objects and 96 objects larger than 16kB.

1. https://lore.kernel.org/git/8ff89e50-1b80-7932-f0e2-af401ee04bb1@gmail.com/
2. https://lore.kernel.org/git/211201.86r1aw9gbd.gmgdl@evledraar.gmail.com/

Han Xin (5):
  object-file: refactor write_loose_object() to read buffer from stream
  object-file.c: handle undetermined oid in write_loose_object()
  object-file.c: read stream in a loop in write_loose_object()
  unpack-objects.c: add dry_run mode for get_data()
  unpack-objects: unpack_non_delta_entry() read data in a stream

 builtin/unpack-objects.c            |  93 +++++++++++++++++++++++--
 object-file.c                       | 102 ++++++++++++++++++++++++----
 object-store.h                      |  10 +++
 t/t5590-unpack-non-delta-objects.sh |  76 +++++++++++++++++++++
 4 files changed, 262 insertions(+), 19 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v3:
1:  8640b04f6d ! 1:  af707ef304 object-file: refactor write_loose_object() to read buffer from stream
    @@ object-file.c: int write_object_file_flags(const void *buf, unsigned long len,
     +			.buf = buf,
     +			.len = len,
     +		},
    ++		.size = len,
     +	};
      
      	/* Normally if we have it in the pack then we do not bother writing
    @@ object-file.c: int hash_object_file_literally(const void *buf, unsigned long len
     +			.buf = buf,
     +			.len = len,
     +		},
    ++		.size = len,
     +	};
      
      	/* type string, SP, %lu of the length plus NUL must fit this */
    @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime)
      	if (has_loose_object(oid))
      		return 0;
      	buf = read_object(the_repository, oid, &type, &len);
    ++	in_stream.size = len;
      	if (!buf)
      		return error(_("cannot read object for %s"), oid_to_hex(oid));
     +	data.buf = buf;
    @@ object-store.h: struct object_directory {
     +struct input_stream {
     +	const void *(*read)(struct input_stream *, unsigned long *len);
     +	void *data;
    ++	size_t size;
     +};
     +
      KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
2:  d4a2caf2bd = 2:  321ad90d8e object-file.c: handle undetermined oid in write_loose_object()
3:  2575900449 ! 3:  1992ac39af object-file.c: read stream in a loop in write_loose_object()
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -		ret = git_deflate(&stream, Z_FINISH);
     +		if (!stream.avail_in) {
     +			buf = in_stream->read(in_stream, &stream.avail_in);
    -+			if (buf) {
    -+				stream.next_in = (void *)buf;
    -+				in0 = (unsigned char *)buf;
    -+			} else {
    ++			stream.next_in = (void *)buf;
    ++			in0 = (unsigned char *)buf;
    ++			/* All data has been read. */
    ++			if (in_stream->size + hdrlen == stream.total_in + stream.avail_in)
     +				flush = Z_FINISH;
    -+			}
     +		}
     +		ret = git_deflate(&stream, flush);
      		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
      		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
      			die(_("unable to write loose object file"));
    + 		stream.next_out = compressed;
    + 		stream.avail_out = sizeof(compressed);
    +-	} while (ret == Z_OK);
    ++	} while (ret == Z_OK || ret == Z_BUF_ERROR);
    + 
    + 	if (ret != Z_STREAM_END)
    + 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
4:  ca93ecc780 = 4:  c41eb06533 unpack-objects.c: add dry_run mode for get_data()
5:  39a072ee2a ! 5:  9427775bdc unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ Commit message
         larger than the "big_file_threshold" in zstream. See the following
         benchmarks:
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -C dest.git unpack-objects <binary_320M.pack'
    -        Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
    -          Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
    -          Range (min … max):    9.786 s … 10.603 s    10 runs
    +        hyperfine \
    +          --setup \
    +          'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
    +          --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +          -n 'old' 'git -C dest.git unpack-objects <small.pack' \
    +          -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
    +          -n 'new (small threshold)' \
    +          'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
    +        Benchmark 1: old
    +          Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
    +          Range (min … max):    6.018 s …  6.189 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    -        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
    -          Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
    -          Range (min … max):    9.884 s … 12.192 s    10 runs
    +        Benchmark 2: new
    +          Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
    +          Range (min … max):    6.030 s …  6.142 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -C dest.git unpack-objects <binary_96M.pack'
    -        Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
    -          Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
    -          Range (min … max):    2.639 s …  2.743 s    10 runs
    +        Benchmark 3: new (small threshold)
    +          Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
    +          Range (min … max):    6.711 s …  6.809 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    -        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
    -          Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
    -          Range (min … max):    2.679 s …  3.125 s    10 runs
    +        Summary
    +          'old' ran
    +            1.00 ± 0.01 times faster than 'new'
    +            1.11 ± 0.01 times faster than 'new (small threshold)'
     
    +    Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      
     +struct input_zstream_data {
     +	git_zstream *zstream;
    -+	unsigned char buf[4096];
    ++	unsigned char buf[8192];
     +	int status;
     +};
     +
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	struct input_stream in_stream = {
     +		.read = feed_input_zstream,
     +		.data = &data,
    ++		.size = size,
     +	};
     +	struct object_id *oid = &obj_list[nr].oid;
     +	int ret;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
  2021-11-29  7:01   ` Han Xin
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface. In the first
step, we make a simple implementation, feeding the entire buffer in the
"stream" to "write_loose_object()" as a refactor.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  6 ++++++
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb972cdccd..82656f7428 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct simple_input_stream_data {
+	const void *buf;
+	unsigned long len;
+};
+
+static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
+{
+	struct simple_input_stream_data *data = in_stream->data;
+
+	if (data->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = data->len;
+	data->len = 0;
+	return data->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const void *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,14 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+		.size = len,
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1997,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+		.size = len,
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct simple_input_stream_data data;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
+	in_stream.size = len;
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..a84d891d60 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	size_t size;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (2 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 3/5] object-file.c: read stream in a loop " Han Xin
                     ` (2 subsequent siblings)
  6 siblings, 2 replies; 165+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance. So we need to handle undetermined oid in function
"write_loose_object()".

In the original implementation, we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in the ".git/objects/"
directory instead.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 82656f7428..1c41587bfb 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const void *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else {
+		loose_object_path(the_repository, &filename, oid);
+	}
 
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
@@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		oidcpy((struct object_id *)oid, &parano_oid);
+		loose_object_path(the_repository, &filename, oid);
+
+		/* We finally know the object path, and create the missing dir. */
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+			if (adjust_shared_perm(dir.buf))
+				return -1;
+			strbuf_release(&dir);
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v4 3/5] object-file.c: read stream in a loop in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (3 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In order to prepare the stream version of "write_loose_object()", read
the input stream in a loop in "write_loose_object()", so that we can
feed the contents of large blob object to "write_loose_object()" using
a small fixed buffer.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index 1c41587bfb..fa54e39c2c 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1890,7 +1890,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const void *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1925,18 +1925,23 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			buf = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)buf;
+			in0 = (unsigned char *)buf;
+			/* All data has been read. */
+			if (in_stream->size + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
 		stream.next_out = compressed;
 		stream.avail_out = sizeof(compressed);
-	} while (ret == Z_OK);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (4 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 3/5] object-file.c: read stream in a loop " Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used to verify the inflation of data,
and the returned buffer will not be used at all and will be freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, only allocate a
small fixed-size buffer when calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8d68acd662 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,16 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run ? 4096 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +125,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +329,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +363,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +402,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (5 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
                       ` (2 more replies)
  6 siblings, 3 replies; 165+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an entire
buffer will have a 10% performance penalty. Therefore, only unpack objects
larger than the "big_file_threshold" in zstream. See the following
benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      -n 'old' 'git -C dest.git unpack-objects <small.pack' \
      -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
      -n 'new (small threshold)' \
      'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
    Benchmark 1: old
      Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
      Range (min … max):    6.018 s …  6.189 s    10 runs

    Benchmark 2: new
      Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
      Range (min … max):    6.030 s …  6.142 s    10 runs

    Benchmark 3: new (small threshold)
      Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
      Range (min … max):    6.711 s …  6.809 s    10 runs

    Summary
      'old' ran
        1.00 ± 0.01 times faster than 'new'
        1.11 ± 0.01 times faster than 'new (small threshold)'

Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c            | 77 ++++++++++++++++++++++++++++-
 object-file.c                       |  6 +--
 object-store.h                      |  4 ++
 t/t5590-unpack-non-delta-objects.sh | 76 ++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+), 4 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 8d68acd662..bedc494e2d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -326,11 +326,86 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+		.size = size,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index fa54e39c2c..71d510614b 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,9 +1878,9 @@ static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
 	return data->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index a84d891d60..ac5b11ec16 100644
--- a/object-store.h
+++ b/object-store.h
@@ -229,6 +229,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..01d950d119
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --progress --revs test)
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	test_i18ngrep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	(
+		cd unpack-test.git &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
  2021-12-07  6:42       ` Han Xin
  2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
  2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2 siblings, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:07 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> By implementing a zstream version of input_stream interface, we can use
> a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpack non-delta objects from a stream instead of from an entrie
> buffer will have 10% performance penalty. Therefore, only unpack object
> larger than the "big_file_threshold" in zstream. See the following
> benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git' \
>       -n 'old' 'git -C dest.git unpack-objects <small.pack' \
>       -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
>       -n 'new (small threshold)' \
>       'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
>     Benchmark 1: old
>       Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
>       Range (min … max):    6.018 s …  6.189 s    10 runs
>
>     Benchmark 2: new
>       Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
>       Range (min … max):    6.030 s …  6.142 s    10 runs
>
>     Benchmark 3: new (small threshold)
>       Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
>       Range (min … max):    6.711 s …  6.809 s    10 runs
>
>     Summary
>       'old' ran
>         1.00 ± 0.01 times faster than 'new'
>         1.11 ± 0.01 times faster than 'new (small threshold)'

So before, we used core.bigfilethreshold for two things (or more?):
whether we show a diff for it (we mark it "binary") and whether it's
split into a loose object.

Now it's three things, we've added a "this is a threshold when we'll
stream the object" to that.

Might it make sense to squash something like this in, so we can have our
cake & eat it too?

With this I get, where HEAD~0 is this change:
    
    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0' ran
        1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.06 ± 0.14 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.20 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

I.e. it's 5% slower, not 20% (haven't looked into why), but we'll not
stream out 16k..128MB objects (maybe the repo has even bigger ones?)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a1..601b7a2418f 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will then be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold` setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index bedc494e2db..94ce275c807 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -400,7 +400,7 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 	void *buf;
 
 	/* Write large blob in stream without allocating full buffer. */
-	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
 		write_stream_blob(nr, size);
 		return;
 	}
diff --git a/cache.h b/cache.h
index eba12487b99..4037c7fd849 100644
--- a/cache.h
+++ b/cache.h
@@ -964,6 +964,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a706..7b122a142a8 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 9da7f3c1a19..4fcc3de7417 100644
--- a/environment.c
+++ b/environment.c
@@ -46,6 +46,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
  2021-12-06  2:51       ` Han Xin
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:21 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle undetermined oid in function
> "write_loose_object()".
>
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in ".git/objects/"
> directory instead.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 30 ++++++++++++++++++++++++++++--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 82656f7428..1c41587bfb 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	const void *buf;
>  	unsigned long len;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);
> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');
> +	} else {
> +		loose_object_path(the_repository, &filename, oid);
> +	}
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
> @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>  
>  	close_loose_object(fd);
>  
> +	if (is_null_oid(oid)) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);
> +		loose_object_path(the_repository, &filename, oid);

Why are we breaking the promise that "oid" is constant here? I tested
locally with the below on top, and it seems to work (at least no tests
broke). Isn't it preferable to the cast & the caller having its "oid"
changed?

diff --git a/object-file.c b/object-file.c
index 71d510614b9..d014e6942ea 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1958,10 +1958,11 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 	close_loose_object(fd);
 
 	if (is_null_oid(oid)) {
+		struct object_id oid2;
 		int dirlen;
 
-		oidcpy((struct object_id *)oid, &parano_oid);
-		loose_object_path(the_repository, &filename, oid);
+		oidcpy(&oid2, &parano_oid);
+		loose_object_path(the_repository, &filename, &oid2);
 
 		/* We finally know the object path, and create the missing dir. */
 		dirlen = directory_size(filename.buf);

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
  2021-12-06  2:07       ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:28 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "write_loose_object()" in a
> stream. The input stream is implemented as an interface. In the first
> step, we make a simple implementation, feeding the entire buffer in the
> "stream" to "write_loose_object()" as a refactor.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  6 ++++++
>  2 files changed, 55 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb972cdccd..82656f7428 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> +struct simple_input_stream_data {
> +	const void *buf;
> +	unsigned long len;
> +};

I see why you picked "const void *buf" here, over say const char *, it's
what "struct input_stream" uses.

But why not use size_t for the length, as input_stream does?

> +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> +{
> +	struct simple_input_stream_data *data = in_stream->data;
> +
> +	if (data->len == 0) {

nit: if (!data->len)...

> +		*len = 0;
> +		return NULL;
> +	}
> +	*len = data->len;
> +	data->len = 0;
> +	return data->buf;

But isn't the body of this function the same as:

        *len = data->len;
        if (!len)
                return NULL;
        data->len = 0;
        return data->buf;

I.e. you don't need the condition for setting "*len" if it's 0, then
data->len is also 0. You just want to return NULL afterwards, and not
set (harmless, but no need) data->len to 0), or return data->buf.
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +		.size = len,
> +	};

Maybe it's that I'm unused to it, but I find this a bit more readable:
	
	@@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
	 {
	 	char hdr[MAX_HEADER_LEN];
	 	int hdrlen = sizeof(hdr);
	+	struct simple_input_stream_data tmp = {
	+		.buf = buf,
	+		.len = len,
	+	};
	 	struct input_stream in_stream = {
	 		.read = feed_simple_input_stream,
	-		.data = (void *)&(struct simple_input_stream_data) {
	-			.buf = buf,
	-			.len = len,
	-		},
	+		.data = (void *)&tmp,
	 		.size = len,
	 	};
	
Yes there's a temporary variable, but no denser inline casting. Also
easier to step through in a debugger (which will have the type
information on "tmp").

>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>  	char *header;
>  	int hdrlen, status = 0;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +		.size = len,
> +	};

ditto..

>  	/* type string, SP, %lu of the length plus NUL must fit this */
>  	hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  		goto cleanup;
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		goto cleanup;
> -	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>  
>  cleanup:
>  	free(header);
> @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen;
>  	int ret;
> +	struct simple_input_stream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = &data,
> +	};
>  
>  	if (has_loose_object(oid))
>  		return 0;
>  	buf = read_object(the_repository, oid, &type, &len);
> +	in_stream.size = len;

Why are we setting this here?...

>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));

...Instead of after this point, as we may error and never use it?

> +	data.buf = buf;
> +	data.len = len;

Probably won't matter,  just a nit...

> +struct input_stream {
> +	const void *(*read)(struct input_stream *, unsigned long *len);
> +	void *data;
> +	size_t size;
> +};
> +

Ah, and here's the size_t... :)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  2021-12-06  3:12       ` Han Xin
  1 sibling, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:41 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle undetermined oid in function
> "write_loose_object()".
>
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in ".git/objects/"
> directory instead.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 30 ++++++++++++++++++++++++++++--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 82656f7428..1c41587bfb 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	const void *buf;
>  	unsigned long len;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);

Why re-use this & leak memory? An existing strbuf use in this function
doesn't leak in the same way. Just release it as in the below patch on
top (the ret v.s. err variable naming is a bit confused, maybe could do
with a prep cleanup step.).

> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');

And once we do that this could just become:

	strbuf_addf($filename, "%s/", ...)

There are existing uses of this pattern, so maybe not worth it, but it
allows you to remove the braces on the if/else.

diff --git a/object-file.c b/object-file.c
index 8bd89e7b7ba..2b52f3fc1cc 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1880,7 +1880,7 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 		       int hdrlen, struct input_stream *in_stream,
 		       time_t mtime, unsigned flags)
 {
-	int fd, ret;
+	int fd, ret, err = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1892,7 +1892,6 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
-		strbuf_reset(&filename);
 		strbuf_addstr(&filename, the_repository->objects->odb->path);
 		strbuf_addch(&filename, '/');
 	} else {
@@ -1902,11 +1901,12 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
 		if (flags & HASH_SILENT)
-			return -1;
+			err = -1;
 		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
 		else
-			return error_errno(_("unable to create temporary file"));
+			err = error_errno(_("unable to create temporary file"));
+		goto cleanup;
 	}
 
 	/* Set it up */
@@ -1968,10 +1968,13 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 			struct strbuf dir = STRBUF_INIT;
 			strbuf_add(&dir, filename.buf, dirlen - 1);
 			if (mkdir(dir.buf, 0777) && errno != EEXIST)
-				return -1;
-			if (adjust_shared_perm(dir.buf))
-				return -1;
-			strbuf_release(&dir);
+				err = -1;
+			else if (adjust_shared_perm(dir.buf))
+				err = -1;
+			else
+				strbuf_release(&dir);
+			if (err < 0)
+				goto cleanup;
 		}
 	}
 
@@ -1984,7 +1987,10 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 			warning_errno(_("failed utime() on %s"), tmp_file.buf);
 	}
 
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&filename);
+	return err;
 }
 
 static int freshen_loose_object(const struct object_id *oid)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
  2021-12-07  6:17       ` Han Xin
  2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2 siblings, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:54 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> new file mode 100755
> index 0000000000..01d950d119
> --- /dev/null
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -0,0 +1,76 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects when receive pack'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +test_expect_success "create commit with big blobs (1.5 MB)" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	(
> +		cd .git &&
> +		find objects/?? -type f | sort

...are these...

> +	) >expect &&
> +	PACK=$(echo main | git pack-objects --progress --revs test)

Is --progress needed?

> +'
> +
> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'prepare dest repository' '
> +	git init --bare dest.git &&
> +	git -C dest.git config core.bigFileThreshold 2m &&
> +	git -C dest.git config receive.unpacklimit 100

I think it would be better to just (could roll this into a function):

	test_when_finished "rm -rf dest.git" &&
	git init dest.git &&
	git -C dest.git config ...

Then you can use it with e.g. --run=3-4 and not have it error out
because of skipped setup.

A lot of our tests fail like that, but in this case fixing it seems
trivial.



> +'
> +
> +test_expect_success 'fail to unpack-objects: cannot allocate' '
> +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> +	test_i18ngrep "fatal: attempting to allocate" err &&

nit: just "grep", not "test_i18ngrep"

> +	(
> +		cd dest.git &&
> +		find objects/?? -type f | sort

..."find" needed over just globbing?:

    obj=$(echo objects/*/*)

?

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
  2021-12-06  3:20       ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:59 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> +	unsigned long bufsize = dry_run ? 4096 : size;
> +	void *buf = xmallocz(bufsize);

It's probably nothing, but in your CL you note that you changed another
hardcoding from 4k to 8k, should this one still be 4k?

It's probably fine, just wondering...

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
  2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2021-12-07  6:48       ` Han Xin
  2 siblings, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 14:05 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [..]
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +	char hdr[32];
> +	int hdrlen;
> +	git_zstream zstream;
> +	struct input_zstream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_input_zstream,
> +		.data = &data,
> +		.size = size,
> +	};
> +	struct object_id *oid = &obj_list[nr].oid;
> +	int ret;
> +
> +	memset(&zstream, 0, sizeof(zstream));
> +	memset(&data, 0, sizeof(data));
> +	data.zstream = &zstream;
> +	git_inflate_init(&zstream);
> +
> +	/* Generate the header */
> +	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> +
> +	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
> +		die(_("failed to write object in stream %d"), ret);
> +
> +	if (zstream.total_out != size || data.status != Z_STREAM_END)
> +		die(_("inflate returned %d"), data.status);
> +	git_inflate_end(&zstream);
> +
> +	if (strict && !dry_run) {
> +		struct blob *blob = lookup_blob(the_repository, oid);
> +		if (blob)
> +			blob->object.flags |= FLAG_WRITTEN;
> +		else
> +			die("invalid blob object from stream");
> +	}
> +	obj_list[nr].obj = NULL;
> +}

Just a side-note, I think (but am not 100% sure) that these existing
occurrences aren't needed due to our use of CALLOC_ARRAY():
    
    diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
    index 4a9466295ba..00b349412c5 100644
    --- a/builtin/unpack-objects.c
    +++ b/builtin/unpack-objects.c
    @@ -248,7 +248,6 @@ static void write_object(unsigned nr, enum object_type type,
                            die("failed to write object");
                    added_object(nr, type, buf, size);
                    free(buf);
    -               obj_list[nr].obj = NULL;
            } else if (type == OBJ_BLOB) {
                    struct blob *blob;
                    if (write_object_file(buf, size, type_name(type),
    @@ -262,7 +261,6 @@ static void write_object(unsigned nr, enum object_type type,
                            blob->object.flags |= FLAG_WRITTEN;
                    else
                            die("invalid blob object");
    -               obj_list[nr].obj = NULL;
            } else {
                    struct object *obj;
                    int eaten;

The reason I'm noting it is that the same seems to be true of your new
addition here. I.e. are these assignments to NULL needed?

Anyway, the reason I started poking at this it tha this
write_stream_blob() seems to duplicate much of write_object(). AFAICT
only the writing part is really different, the part where we
lookup_blob() after, set FLAG_WRITTEN etc. is all the same.

Why can't we call write_object() here?

The obvious answer seems to be that the call to write_object_file()
isn't prepared to do the sort of streaming that you want, so instead
you're bypassing it and calling write_loose_object() directly.

I haven't tried this myself, but isn't a better and cleaner approach
here to not add another meaning to what is_null_oid() means, but to just
add a HASH_STREAM flag that'll get passed down as "unsigned flags" to
write_loose_object()? See FLAG_BITS in object.h.

Then the "obj_list[nr].obj" here could also become
"obj_list[nr].obj.flags |= (1u<<12)" or whatever (but that wouldn't
strictly be needed, I think).

But by adding the "HASH_STREAM" flag you could I think stop duplicating
the "Generate the header" etc. here and call write_object_file_flags().

I don't so much care about how it's done within unpack-objects.c, but
not having another meaning to is_null_oid() in play would be really
nice, and in this case it seems entirely avoidable.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  2:07       ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-06  2:07 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:41 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > This can be improved by feeding data to "write_loose_object()" in a
> > stream. The input stream is implemented as an interface. In the first
> > step, we make a simple implementation, feeding the entire buffer in the
> > "stream" to "write_loose_object()" as a refactor.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
> >  object-store.h |  6 ++++++
> >  2 files changed, 55 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index eb972cdccd..82656f7428 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +struct simple_input_stream_data {
> > +     const void *buf;
> > +     unsigned long len;
> > +};
>
> I see why you picked "const void *buf" here, over say const char *, it's
> what "struct input_stream" uses.
>
> But why not use size_t for the length, as input_stream does?
>

Yes, "size_t" will be better here.

> > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> > +{
> > +     struct simple_input_stream_data *data = in_stream->data;
> > +
> > +     if (data->len == 0) {
>
> nit: if (!data->len)...
>

Will apply.

> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +     *len = data->len;
> > +     data->len = 0;
> > +     return data->buf;
>
> But isn't the body of this function the same as:
>
>         *len = data->len;
>         if (!len)
>                 return NULL;
>         data->len = 0;
>         return data->buf;
>
> I.e. you don't need the condition for setting "*len" if it's 0, then
> data->len is also 0. You just want to return NULL afterwards, and not
> set (harmless, but no need) data->len to 0), or return data->buf.

Will apply.

> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +             .size = len,
> > +     };
>
> Maybe it's that I'm unused to it, but I find this a bit more readable:
>
>         @@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>          {
>                 char hdr[MAX_HEADER_LEN];
>                 int hdrlen = sizeof(hdr);
>         +       struct simple_input_stream_data tmp = {
>         +               .buf = buf,
>         +               .len = len,
>         +       };
>                 struct input_stream in_stream = {
>                         .read = feed_simple_input_stream,
>         -               .data = (void *)&(struct simple_input_stream_data) {
>         -                       .buf = buf,
>         -                       .len = len,
>         -               },
>         +               .data = (void *)&tmp,
>                         .size = len,
>                 };
>
> Yes there's a temporary variable, but no denser inline casting. Also
> easier to step through in a debugger (which will have the type
> information on "tmp").
>

Will apply.

> >  int hash_object_file_literally(const void *buf, unsigned long len,
> > @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >  {
> >       char *header;
> >       int hdrlen, status = 0;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +             .size = len,
> > +     };
>
> ditto..
>
> >       /* type string, SP, %lu of the length plus NUL must fit this */
> >       hdrlen = strlen(type) + MAX_HEADER_LEN;
> > @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >               goto cleanup;
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               goto cleanup;
> > -     status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> > +     status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> >
> >  cleanup:
> >       free(header);
> > @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >       int ret;
> > +     struct simple_input_stream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = &data,
> > +     };
> >
> >       if (has_loose_object(oid))
> >               return 0;
> >       buf = read_object(the_repository, oid, &type, &len);
> > +     in_stream.size = len;
>
> Why are we setting this here?...
>

Yes, putting "in_stream.size=len;" here was a stupid decision.

> >       if (!buf)
> >               return error(_("cannot read object for %s"), oid_to_hex(oid));
>
> ...Instead of after this point, as we may error and never use it?
>
> > +     data.buf = buf;
> > +     data.len = len;
>
> Probably won't matter,  just a nit...
>
> > +struct input_stream {
> > +     const void *(*read)(struct input_stream *, unsigned long *len);
> > +     void *data;
> > +     size_t size;
> > +};
> > +
>
> Ah, and here's the size_t... :)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  2:51       ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-06  2:51 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:27 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When streaming a large blob object to "write_loose_object()", we have no
> > chance to run "write_object_file_prepare()" to calculate the oid in
> > advance. So we need to handle undetermined oid in function
> > "write_loose_object()".
> >
> > In the original implementation, we know the oid and we can write the
> > temporary file in the same directory as the final object, but for an
> > object with an undetermined oid, we don't know the exact directory for
> > the object, so we have to save the temporary file in ".git/objects/"
> > directory instead.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 30 ++++++++++++++++++++++++++++--
> >  1 file changed, 28 insertions(+), 2 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 82656f7428..1c41587bfb 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       const void *buf;
> >       unsigned long len;
> >
> > -     loose_object_path(the_repository, &filename, oid);
> > +     if (is_null_oid(oid)) {
> > +             /* When oid is not determined, save tmp file to odb path. */
> > +             strbuf_reset(&filename);
> > +             strbuf_addstr(&filename, the_repository->objects->odb->path);
> > +             strbuf_addch(&filename, '/');
> > +     } else {
> > +             loose_object_path(the_repository, &filename, oid);
> > +     }
> >
> >       fd = create_tmpfile(&tmp_file, filename.buf);
> >       if (fd < 0) {
> > @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >               die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> >                   ret);
> >       the_hash_algo->final_oid_fn(&parano_oid, &c);
> > -     if (!oideq(oid, &parano_oid))
> > +     if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
> >               die(_("confused by unstable object source data for %s"),
> >                   oid_to_hex(oid));
> >
> >       close_loose_object(fd);
> >
> > +     if (is_null_oid(oid)) {
> > +             int dirlen;
> > +
> > +             oidcpy((struct object_id *)oid, &parano_oid);
> > +             loose_object_path(the_repository, &filename, oid);
>
> Why are we breaking the promise that "oid" is constant here? I tested
> locally with the below on top, and it seems to work (at least no tests
> broke). Isn't it preferable to the cast & the caller having its "oid"
> changed?
>
> diff --git a/object-file.c b/object-file.c
> index 71d510614b9..d014e6942ea 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1958,10 +1958,11 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>         close_loose_object(fd);
>
>         if (is_null_oid(oid)) {
> +               struct object_id oid2;
>                 int dirlen;
>
> -               oidcpy((struct object_id *)oid, &parano_oid);
> -               loose_object_path(the_repository, &filename, oid);
> +               oidcpy(&oid2, &parano_oid);
> +               loose_object_path(the_repository, &filename, &oid2);
>
>                 /* We finally know the object path, and create the missing dir. */
>                 dirlen = directory_size(filename.buf);

Maybe I should change the promise that "oid" is constant in
"write_loose_object()".

The original write_object_file_flags() defines a variable "oid", and
completes the calculation of the "oid" in
"write_object_file_prepare()" which will be passed to
"write_loose_object()".

If a null oid is maintained after calling "write_loose_object()",
"--strict" will become meaningless, although it does not break existing
test cases.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  3:12       ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-06  3:12 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:54 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When streaming a large blob object to "write_loose_object()", we have no
> > chance to run "write_object_file_prepare()" to calculate the oid in
> > advance. So we need to handle undetermined oid in function
> > "write_loose_object()".
> >
> > In the original implementation, we know the oid and we can write the
> > temporary file in the same directory as the final object, but for an
> > object with an undetermined oid, we don't know the exact directory for
> > the object, so we have to save the temporary file in ".git/objects/"
> > directory instead.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 30 ++++++++++++++++++++++++++++--
> >  1 file changed, 28 insertions(+), 2 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 82656f7428..1c41587bfb 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       const void *buf;
> >       unsigned long len;
> >
> > -     loose_object_path(the_repository, &filename, oid);
> > +     if (is_null_oid(oid)) {
> > +             /* When oid is not determined, save tmp file to odb path. */
> > +             strbuf_reset(&filename);
>
> Why re-use this & leak memory? An existing strbuf use in this function
> doesn't leak in the same way. Just release it as in the below patch on
> top (the ret v.s. err variable naming is a bit confused, maybe could do
> with a prep cleanup step.).
>
> > +             strbuf_addstr(&filename, the_repository->objects->odb->path);
> > +             strbuf_addch(&filename, '/');
>
> And once we do that this could just become:
>
>         strbuf_addf($filename, "%s/", ...)
>
> Is there's existing uses of this pattern, so mayb e not worth it, but it
> allows you to remove the braces on the if/else.
>
> diff --git a/object-file.c b/object-file.c
> index 8bd89e7b7ba..2b52f3fc1cc 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1880,7 +1880,7 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                        int hdrlen, struct input_stream *in_stream,
>                        time_t mtime, unsigned flags)
>  {
> -       int fd, ret;
> +       int fd, ret, err = 0;
>         unsigned char compressed[4096];
>         git_zstream stream;
>         git_hash_ctx c;
> @@ -1892,7 +1892,6 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>
>         if (is_null_oid(oid)) {
>                 /* When oid is not determined, save tmp file to odb path. */
> -               strbuf_reset(&filename);
>                 strbuf_addstr(&filename, the_repository->objects->odb->path);
>                 strbuf_addch(&filename, '/');
>         } else {
> @@ -1902,11 +1901,12 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>         fd = create_tmpfile(&tmp_file, filename.buf);
>         if (fd < 0) {
>                 if (flags & HASH_SILENT)
> -                       return -1;
> +                       err = -1;
>                 else if (errno == EACCES)
> -                       return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +                       err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
>                 else
> -                       return error_errno(_("unable to create temporary file"));
> +                       err = error_errno(_("unable to create temporary file"));
> +               goto cleanup;
>         }
>
>         /* Set it up */
> @@ -1968,10 +1968,13 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                         struct strbuf dir = STRBUF_INIT;
>                         strbuf_add(&dir, filename.buf, dirlen - 1);
>                         if (mkdir(dir.buf, 0777) && errno != EEXIST)
> -                               return -1;
> -                       if (adjust_shared_perm(dir.buf))
> -                               return -1;
> -                       strbuf_release(&dir);
> +                               err = -1;
> +                       else if (adjust_shared_perm(dir.buf))
> +                               err = -1;
> +                       else
> +                               strbuf_release(&dir);
> +                       if (err < 0)
> +                               goto cleanup;
>                 }
>         }
>
> @@ -1984,7 +1987,10 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                         warning_errno(_("failed utime() on %s"), tmp_file.buf);
>         }
>
> -       return finalize_object_file(tmp_file.buf, filename.buf);
> +       err = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +       strbuf_release(&filename);
> +       return err;
>  }
>
>  static int freshen_loose_object(const struct object_id *oid)

Yes, this will be much better. Will apply.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  3:20       ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-06  3:20 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 10:00 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > +     unsigned long bufsize = dry_run ? 4096 : size;
> > +     void *buf = xmallocz(bufsize);
>
> It's probably nothing, but in your CL you note that you changed another
> hardcoding from 4k to 8k, should this one still be 4k?
>
> It's probably fine, just wondering...

Yes, I think this is an omission from my work.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
@ 2021-12-07  6:17       ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-07  6:17 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:59 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> > new file mode 100755
> > index 0000000000..01d950d119
> > --- /dev/null
> > +++ b/t/t5590-unpack-non-delta-objects.sh
> > @@ -0,0 +1,76 @@
> > +#!/bin/sh
> > +#
> > +# Copyright (c) 2021 Han Xin
> > +#
> > +
> > +test_description='Test unpack-objects when receive pack'
> > +
> > +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> > +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> > +
> > +. ./test-lib.sh
> > +
> > +test_expect_success "create commit with big blobs (1.5 MB)" '
> > +     test-tool genrandom foo 1500000 >big-blob &&
> > +     test_commit --append foo big-blob &&
> > +     test-tool genrandom bar 1500000 >big-blob &&
> > +     test_commit --append bar big-blob &&
> > +     (
> > +             cd .git &&
> > +             find objects/?? -type f | sort
>
> ...are thse...
>
> > +     ) >expect &&
> > +     PACK=$(echo main | git pack-objects --progress --revs test)
>
> Is --progress needed?
>

"--progress" is not necessary.

> > +'
> > +
> > +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> > +     GIT_ALLOC_LIMIT=1m &&
> > +     export GIT_ALLOC_LIMIT
> > +'
> > +
> > +test_expect_success 'prepare dest repository' '
> > +     git init --bare dest.git &&
> > +     git -C dest.git config core.bigFileThreshold 2m &&
> > +     git -C dest.git config receive.unpacklimit 100
>
> I think it would be better to just (could roll this into a function):
>
>         test_when_finished "rm -rf dest.git" &&
>         git init dest.git &&
>         git -C dest.git config ...
>
> Then you can use it with e.g. --run=3-4 and not have it error out
> because of skipped setup.
>
> A lot of our tests fail like that, but in this case fixing it seems
> trivial.
>
>

OK, I will take it.

>
> > +'
> > +
> > +test_expect_success 'fail to unpack-objects: cannot allocate' '
> > +     test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> > +     test_i18ngrep "fatal: attempting to allocate" err &&
>
> nit: just "grep", not "test_i18ngrep"
>
> > +     (
> > +             cd dest.git &&
> > +             find objects/?? -type f | sort
>
> ..."find" needed over just globbing?:
>
>     obj=$(echo objects/*/*)
>
> ?

I tried to use "echo" instead of "find". It works well on my personal
computer, but fails in CI on GitHub because of the "info/commit-graph"
file that gets generated there.
So it seems that matching ".git/objects/??" is more rigorous?

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
@ 2021-12-07  6:42       ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-07  6:42 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:19 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > By implementing a zstream version of input_stream interface, we can use
> > a small fixed buffer for "unpack_non_delta_entry()".
> >
> > However, unpack non-delta objects from a stream instead of from an entrie
> > buffer will have 10% performance penalty. Therefore, only unpack object
> > larger than the "big_file_threshold" in zstream. See the following
> > benchmarks:
> >
> >     hyperfine \
> >       --setup \
> >       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
> >       --prepare 'rm -rf dest.git && git init --bare dest.git' \
> >       -n 'old' 'git -C dest.git unpack-objects <small.pack' \
> >       -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
> >       -n 'new (small threshold)' \
> >       'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
> >     Benchmark 1: old
> >       Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
> >       Range (min … max):    6.018 s …  6.189 s    10 runs
> >
> >     Benchmark 2: new
> >       Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
> >       Range (min … max):    6.030 s …  6.142 s    10 runs
> >
> >     Benchmark 3: new (small threshold)
> >       Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
> >       Range (min … max):    6.711 s …  6.809 s    10 runs
> >
> >     Summary
> >       'old' ran
> >         1.00 ± 0.01 times faster than 'new'
> >         1.11 ± 0.01 times faster than 'new (small threshold)'
>
> So before we wrote used core.bigfilethreshold for two things (or more?):
> Whether we show a diff for it (we mark it "binary") and whether it's
> split into a loose object.
>
> Now it's three things, we've added a "this is a threshold when we'll
> stream the object" to that.
>
> Might it make sense to squash something like this in, so we can have our
> cake & eat it too?
>
> With this I get, where HEAD~0 is this change:
>
>     Summary
>       './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0' ran
>         1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
>         1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
>         1.01 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
>         1.06 ± 0.14 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
>         1.20 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
>
> I.e. it's 5% slower, not 20% (haven't looked into why), but we'll not
> stream out 16k..128MB objects (maybe the repo has even bigger ones?)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index c04f62a54a1..601b7a2418f 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
>  +
>  Common unit suffixes of 'k', 'm', or 'g' are supported.
>
> +core.bigFileStreamingThreshold::
> +       Files larger than this will be streamed out to a temporary
> +       object file while being hashed, which will when be renamed
> +       in-place to a loose object, particularly if the
> +       `core.bigFileThreshold' setting dictates that they're always
> +       written out as loose objects.
> ++
> +Default is 128 MiB on all platforms.
> ++
> +Common unit suffixes of 'k', 'm', or 'g' are supported.
> +
>  core.excludesFile::
>         Specifies the pathname to the file that contains patterns to
>         describe paths that are not meant to be tracked, in addition
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index bedc494e2db..94ce275c807 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -400,7 +400,7 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>         void *buf;
>
>         /* Write large blob in stream without allocating full buffer. */
> -       if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
> +       if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
>                 write_stream_blob(nr, size);
>                 return;
>         }
> diff --git a/cache.h b/cache.h
> index eba12487b99..4037c7fd849 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -964,6 +964,7 @@ extern size_t packed_git_window_size;
>  extern size_t packed_git_limit;
>  extern size_t delta_base_cache_limit;
>  extern unsigned long big_file_threshold;
> +extern unsigned long big_file_streaming_threshold;
>  extern unsigned long pack_size_limit_cfg;
>
>  /*
> diff --git a/config.c b/config.c
> index c5873f3a706..7b122a142a8 100644
> --- a/config.c
> +++ b/config.c
> @@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>                 return 0;
>         }
>
> +       if (!strcmp(var, "core.bigfilestreamingthreshold")) {
> +               big_file_streaming_threshold = git_config_ulong(var, value);
> +               return 0;
> +       }
> +
>         if (!strcmp(var, "core.packedgitlimit")) {
>                 packed_git_limit = git_config_ulong(var, value);
>                 return 0;
> diff --git a/environment.c b/environment.c
> index 9da7f3c1a19..4fcc3de7417 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -46,6 +46,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
>  size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
>  size_t delta_base_cache_limit = 96 * 1024 * 1024;
>  unsigned long big_file_threshold = 512 * 1024 * 1024;
> +unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
>  int pager_use_color = 1;
>  const char *editor_program;
>  const char *askpass_program;

I'm not sure if we need an additional "core.bigFileStreamingThreshold"
here, because "core.bigFileThreshold" has been widely used in
"index-pack", "read_object" and so on.

In the test case which uses "core.bigFileStreamingThreshold" instead of
"core.bigFileThreshold", I found that the test case failed because
"fsck" tried to allocate 15MB of memory.
In the process of "fsck_loose()", "read_loose_object()" will be called,
which contains the following content:

  if (*oi->typep == OBJ_BLOB && *size> big_file_threshold) {
    if (check_stream_oid(&stream, hdr, *size, path, expected_oid) <0)
    goto out;
  } else {
    /* this will allocate 15MB of memory */
    *contents = unpack_loose_rest(&stream, hdr, *size, expected_oid);
    ...
  }

The same case can be found in "unpack_entry_data()":

  static char fixed_buf[8192];
  ...
  if (type == OBJ_BLOB && size > big_file_threshold)
    buf = fixed_buf;
  else
    buf = xmallocz(size);
 ...

Although I know that setting "core.bigfilethreshold" smaller than the
default value on the server side does not prevent users from creating
large delta objects on the client side, it can still effectively help
me reduce the memory allocation in "receive-pack".

If this is not the correct way to use "core.bigfilethreshold", perhaps
you can share a better solution with me.

Thanks.
-Han Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
@ 2021-12-07  6:48       ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-07  6:48 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 10:29 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> > [..]
> > +static void write_stream_blob(unsigned nr, unsigned long size)
> > +{
> > +     char hdr[32];
> > +     int hdrlen;
> > +     git_zstream zstream;
> > +     struct input_zstream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_input_zstream,
> > +             .data = &data,
> > +             .size = size,
> > +     };
> > +     struct object_id *oid = &obj_list[nr].oid;
> > +     int ret;
> > +
> > +     memset(&zstream, 0, sizeof(zstream));
> > +     memset(&data, 0, sizeof(data));
> > +     data.zstream = &zstream;
> > +     git_inflate_init(&zstream);
> > +
> > +     /* Generate the header */
> > +     hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> > +
> > +     if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
> > +             die(_("failed to write object in stream %d"), ret);
> > +
> > +     if (zstream.total_out != size || data.status != Z_STREAM_END)
> > +             die(_("inflate returned %d"), data.status);
> > +     git_inflate_end(&zstream);
> > +
> > +     if (strict && !dry_run) {
> > +             struct blob *blob = lookup_blob(the_repository, oid);
> > +             if (blob)
> > +                     blob->object.flags |= FLAG_WRITTEN;
> > +             else
> > +                     die("invalid blob object from stream");
> > +     }
> > +     obj_list[nr].obj = NULL;
> > +}
>
> Just a side-note, I think (but am not 100% sure) that these existing
> occurances aren't needed due to our use of CALLOC_ARRAY():
>
>     diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>     index 4a9466295ba..00b349412c5 100644
>     --- a/builtin/unpack-objects.c
>     +++ b/builtin/unpack-objects.c
>     @@ -248,7 +248,6 @@ static void write_object(unsigned nr, enum object_type type,
>                             die("failed to write object");
>                     added_object(nr, type, buf, size);
>                     free(buf);
>     -               obj_list[nr].obj = NULL;
>             } else if (type == OBJ_BLOB) {
>                     struct blob *blob;
>                     if (write_object_file(buf, size, type_name(type),
>     @@ -262,7 +261,6 @@ static void write_object(unsigned nr, enum object_type type,
>                             blob->object.flags |= FLAG_WRITTEN;
>                     else
>                             die("invalid blob object");
>     -               obj_list[nr].obj = NULL;
>             } else {
>                     struct object *obj;
>                     int eaten;
>
> The reason I'm noting it is that the same seems to be true of your new
> addition here. I.e. are these assignments to NULL needed?
>
> Anyway, the reason I started poking at this it tha this
> write_stream_blob() seems to duplicate much of write_object(). AFAICT
> only the writing part is really different, the part where we
> lookup_blob() after, set FLAG_WRITTEN etc. is all the same.
>
> Why can't we call write_object() here?
>
> The obvious answer seems to be that the call to write_object_file()
> isn't prepared to do the sort of streaming that you want, so instead
> you're bypassing it and calling write_loose_object() directly.
>
> I haven't tried this myself, but isn't a better and cleaner approach
> here to not add another meaning to what is_null_oid() means, but to just
> add a HASH_STREAM flag that'll get passed down as "unsigned flags" to
> write_loose_object()? See FLAG_BITS in object.h.
>
> Then the "obj_list[nr].obj" here could also become
> "obj_list[nr].obj.flags |= (1u<<12)" or whatever (but that wouldn't
> strictly be needed I think.
>
> But by adding the "HASH_STREAM" flag you could I think stop duplicating
> the "Generate the header" etc. here and call write_object_file_flags().
>
> I don't so much care about how it's done within unpack-objects.c, but
> not having another meaning to is_null_oid() in play would be really
> nice, and it this case it seems entirely avoidable.

I refactored it according to your suggestions in my next patch version.
Using a HASH_STREAM flag is indeed a better way to deal with it, and it
also reduces the changes to the original code.

Thanks.
-Han Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v4 0/5] unpack large objects in stream
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
@ 2021-12-07 16:18     ` Derrick Stolee
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                       ` (6 subsequent siblings)
  7 siblings, 0 replies; 165+ messages in thread
From: Derrick Stolee @ 2021-12-07 16:18 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason
  Cc: Han Xin

On 12/3/2021 4:35 AM, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> Changes since v3:
> * Add "size" to "struct input_stream" which used by following commits.
> 
> * Increase the buffer size of "struct input_zstream_data" from 4096 to
>   8192, which is consistent with the "fixed_buf" in the "index-pack.c".
> 
> * Refactor "read stream in a loop in write_loose_object()" which
>   introduced a performance problem reported by Derrick Stolee[1].

Thank you for finding the issue. It seems simple enough to add that size
information and regain the performance back to nearly no overhead. Your
hyperfine statistics are within noise, which is great. Thanks!

-Stolee

^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v5 0/6] unpack large blobs in stream
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
  2021-12-07 16:18     ` Derrick Stolee
@ 2021-12-10 10:34     ` Han Xin
  2021-12-17 11:26       ` Han Xin
                         ` (6 more replies)
  2021-12-10 10:34     ` [PATCH v5 1/6] object-file: refactor write_loose_object() to support read from stream Han Xin
                       ` (5 subsequent siblings)
  7 siblings, 7 replies; 165+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v4:
* Refactor the "struct input_stream" implementations so that we can
  reduce the changes to "write_loose_object()", as suggested by
  Ævar Arnfjörð Bjarmason.

* Add a new flag called "HASH_STREAM" to support this feature.

* Add a new config "core.bigFileStreamingThreshold" instead of
  "core.bigFileThreshold", as suggested by Ævar Arnfjörð Bjarmason[1].

* Roll destination repository preparation into a function in
  "t5590-unpack-non-delta-objects.sh", so that we can run test cases
  with --run=setup,3,4.

1. https://lore.kernel.org/git/211203.86zgphsu5a.gmgdl@evledraar.gmail.com/

Han Xin (6):
  object-file: refactor write_loose_object() to support read from stream
  object-file.c: handle undetermined oid in write_loose_object()
  object-file.c: read stream in a loop in write_loose_object()
  unpack-objects.c: add dry_run mode for get_data()
  object-file.c: make "write_object_file_flags()" to support "HASH_STREAM"
  unpack-objects: unpack_non_delta_entry() read data in a stream

 Documentation/config/core.txt       | 11 ++++
 builtin/unpack-objects.c            | 86 +++++++++++++++++++++++++++--
 cache.h                             |  2 +
 config.c                            |  5 ++
 environment.c                       |  1 +
 object-file.c                       | 73 +++++++++++++++++++-----
 object-store.h                      |  5 ++
 t/t5590-unpack-non-delta-objects.sh | 70 +++++++++++++++++++++++
 8 files changed, 234 insertions(+), 19 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v4:
1:  af707ef304 < -:  ---------- object-file: refactor write_loose_object() to read buffer from stream
2:  321ad90d8e < -:  ---------- object-file.c: handle undetermined oid in write_loose_object()
3:  1992ac39af < -:  ---------- object-file.c: read stream in a loop in write_loose_object()
-:  ---------- > 1:  f3595e68cc object-file: refactor write_loose_object() to support read from stream
-:  ---------- > 2:  c25fdd1fe5 object-file.c: handle undetermined oid in write_loose_object()
-:  ---------- > 3:  ed226f2f9f object-file.c: read stream in a loop in write_loose_object()
4:  c41eb06533 ! 4:  2f91e540f6 unpack-objects.c: add dry_run mode for get_data()
    @@ builtin/unpack-objects.c: static void use(int bytes)
      {
      	git_zstream stream;
     -	void *buf = xmallocz(size);
    -+	unsigned long bufsize = dry_run ? 4096 : size;
    ++	unsigned long bufsize = dry_run ? 8192 : size;
     +	void *buf = xmallocz(bufsize);
      
      	memset(&stream, 0, sizeof(stream));
-:  ---------- > 5:  7698938eac object-file.c: make "write_object_file_flags()" to support "HASH_STREAM"
5:  9427775bdc ! 6:  103bb1db06 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ Commit message
     
         However, unpack non-delta objects from a stream instead of from an entrie
         buffer will have 10% performance penalty. Therefore, only unpack object
    -    larger than the "big_file_threshold" in zstream. See the following
    +    larger than the "core.BigFileStreamingThreshold" in zstream. See the following
         benchmarks:
     
             hyperfine \
               --setup \
               'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
    -          --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -          -n 'old' 'git -C dest.git unpack-objects <small.pack' \
    -          -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
    -          -n 'new (small threshold)' \
    -          'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
    -        Benchmark 1: old
    -          Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
    -          Range (min … max):    6.018 s …  6.189 s    10 runs
    -
    -        Benchmark 2: new
    -          Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
    -          Range (min … max):    6.030 s …  6.142 s    10 runs
    -
    -        Benchmark 3: new (small threshold)
    -          Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
    -          Range (min … max):    6.711 s …  6.809 s    10 runs
    +          --prepare 'rm -rf dest.git && git init --bare dest.git'
     
             Summary
    -          'old' ran
    -            1.00 ± 0.01 times faster than 'new'
    -            1.11 ± 0.01 times faster than 'new (small threshold)'
    +          './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
    +            1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
    +            1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
    +            1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
    +            1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
    +            1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
     
    +    Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    + ## Documentation/config/core.txt ##
    +@@ Documentation/config/core.txt: be delta compressed, but larger binary media files won't be.
    + +
    + Common unit suffixes of 'k', 'm', or 'g' are supported.
    + 
    ++core.bigFileStreamingThreshold::
    ++	Files larger than this will be streamed out to a temporary
    ++	object file while being hashed, which will when be renamed
    ++	in-place to a loose object, particularly if the
    ++	`core.bigFileThreshold' setting dictates that they're always
    ++	written out as loose objects.
    +++
    ++Default is 128 MiB on all platforms.
    +++
    ++Common unit suffixes of 'k', 'm', or 'g' are supported.
    ++
    + core.excludesFile::
    + 	Specifies the pathname to the file that contains patterns to
    + 	describe paths that are not meant to be tracked, in addition
    +
      ## builtin/unpack-objects.c ##
     @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type type,
      	}
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +
     +static void write_stream_blob(unsigned nr, unsigned long size)
     +{
    -+	char hdr[32];
    -+	int hdrlen;
     +	git_zstream zstream;
     +	struct input_zstream_data data;
     +	struct input_stream in_stream = {
     +		.read = feed_input_zstream,
     +		.data = &data,
    -+		.size = size,
     +	};
    -+	struct object_id *oid = &obj_list[nr].oid;
     +	int ret;
     +
     +	memset(&zstream, 0, sizeof(zstream));
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	data.zstream = &zstream;
     +	git_inflate_init(&zstream);
     +
    -+	/* Generate the header */
    -+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
    -+
    -+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
    ++	if ((ret = write_object_file_flags(&in_stream, size, type_name(OBJ_BLOB) ,&obj_list[nr].oid, HASH_STREAM)))
     +		die(_("failed to write object in stream %d"), ret);
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	git_inflate_end(&zstream);
     +
     +	if (strict && !dry_run) {
    -+		struct blob *blob = lookup_blob(the_repository, oid);
    ++		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
     +		if (blob)
     +			blob->object.flags |= FLAG_WRITTEN;
     +		else
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	void *buf;
     +
     +	/* Write large blob in stream without allocating full buffer. */
    -+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
    ++	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
     +		write_stream_blob(nr, size);
     +		return;
     +	}
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      		write_object(nr, type, buf, size);
      	else
     
    - ## object-file.c ##
    -@@ object-file.c: static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
    - 	return data->buf;
    - }
    + ## cache.h ##
    +@@ cache.h: extern size_t packed_git_window_size;
    + extern size_t packed_git_limit;
    + extern size_t delta_base_cache_limit;
    + extern unsigned long big_file_threshold;
    ++extern unsigned long big_file_streaming_threshold;
    + extern unsigned long pack_size_limit_cfg;
      
    --static int write_loose_object(const struct object_id *oid, char *hdr,
    --			      int hdrlen, struct input_stream *in_stream,
    --			      time_t mtime, unsigned flags)
    -+int write_loose_object(const struct object_id *oid, char *hdr,
    -+		       int hdrlen, struct input_stream *in_stream,
    -+		       time_t mtime, unsigned flags)
    - {
    - 	int fd, ret;
    - 	unsigned char compressed[4096];
    + /*
     
    - ## object-store.h ##
    -@@ object-store.h: int hash_object_file(const struct git_hash_algo *algo, const void *buf,
    - 		     unsigned long len, const char *type,
    - 		     struct object_id *oid);
    + ## config.c ##
    +@@ config.c: static int git_default_core_config(const char *var, const char *value, void *cb)
    + 		return 0;
    + 	}
      
    -+int write_loose_object(const struct object_id *oid, char *hdr,
    -+		       int hdrlen, struct input_stream *in_stream,
    -+		       time_t mtime, unsigned flags);
    ++	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
    ++		big_file_streaming_threshold = git_config_ulong(var, value);
    ++		return 0;
    ++	}
     +
    - int write_object_file_flags(const void *buf, unsigned long len,
    - 			    const char *type, struct object_id *oid,
    - 			    unsigned flags);
    + 	if (!strcmp(var, "core.packedgitlimit")) {
    + 		packed_git_limit = git_config_ulong(var, value);
    + 		return 0;
    +
    + ## environment.c ##
    +@@ environment.c: size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
    + size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
    + size_t delta_base_cache_limit = 96 * 1024 * 1024;
    + unsigned long big_file_threshold = 512 * 1024 * 1024;
    ++unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
    + int pager_use_color = 1;
    + const char *editor_program;
    + const char *askpass_program;
     
      ## t/t5590-unpack-non-delta-objects.sh (new) ##
     @@
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +
     +. ./test-lib.sh
     +
    -+test_expect_success "create commit with big blobs (1.5 MB)" '
    ++prepare_dest () {
    ++	test_when_finished "rm -rf dest.git" &&
    ++	git init --bare dest.git &&
    ++	git -C dest.git config core.bigFileStreamingThreshold $1
    ++	git -C dest.git config core.bigFileThreshold $1
    ++}
    ++
    ++test_expect_success "setup repo with big blobs (1.5 MB)" '
     +	test-tool genrandom foo 1500000 >big-blob &&
     +	test_commit --append foo big-blob &&
     +	test-tool genrandom bar 1500000 >big-blob &&
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +		cd .git &&
     +		find objects/?? -type f | sort
     +	) >expect &&
    -+	PACK=$(echo main | git pack-objects --progress --revs test)
    ++	PACK=$(echo main | git pack-objects --revs test)
     +'
     +
    -+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
    ++test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
     +	GIT_ALLOC_LIMIT=1m &&
     +	export GIT_ALLOC_LIMIT
     +'
     +
    -+test_expect_success 'prepare dest repository' '
    -+	git init --bare dest.git &&
    -+	git -C dest.git config core.bigFileThreshold 2m &&
    -+	git -C dest.git config receive.unpacklimit 100
    -+'
    -+
     +test_expect_success 'fail to unpack-objects: cannot allocate' '
    ++	prepare_dest 2m &&
     +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    -+	test_i18ngrep "fatal: attempting to allocate" err &&
    ++	grep "fatal: attempting to allocate" err &&
     +	(
     +		cd dest.git &&
     +		find objects/?? -type f | sort
     +	) >actual &&
    ++	test_file_not_empty actual &&
     +	! test_cmp expect actual
     +'
     +
    -+test_expect_success 'set a lower bigfile threshold' '
    -+	git -C dest.git config core.bigFileThreshold 1m
    -+'
    -+
     +test_expect_success 'unpack big object in stream' '
    ++	prepare_dest 1m &&
     +	git -C dest.git unpack-objects <test-$PACK.pack &&
     +	git -C dest.git fsck &&
     +	(
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +	test_cmp expect actual
     +'
     +
    -+test_expect_success 'setup for unpack-objects dry-run test' '
    -+	git init --bare unpack-test.git
    -+'
    -+
     +test_expect_success 'unpack-objects dry-run' '
    ++	prepare_dest 1m &&
    ++	git -C dest.git unpack-objects -n <test-$PACK.pack &&
     +	(
    -+		cd unpack-test.git &&
    -+		git unpack-objects -n <../test-$PACK.pack
    -+	) &&
    -+	(
    -+		cd unpack-test.git &&
    ++		cd dest.git &&
     +		find objects/ -type f
     +	) >actual &&
     +	test_must_be_empty actual
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v5 1/6] object-file: refactor write_loose_object() to support read from stream
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
  2021-12-07 16:18     ` Derrick Stolee
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-10 10:34     ` [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object() Han Xin
                       ` (4 subsequent siblings)
  7 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface.

In the first step, we add a new flag called "HASH_STREAM" and make a
simple implementation, feeding the entire buffer in the stream to
"write_loose_object()" as a refactor.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 cache.h        | 1 +
 object-file.c  | 7 ++++++-
 object-store.h | 5 +++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/cache.h b/cache.h
index eba12487b9..51bd435dea 100644
--- a/cache.h
+++ b/cache.h
@@ -888,6 +888,7 @@ int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
 #define HASH_FORMAT_CHECK 2
 #define HASH_RENORMALIZE  4
 #define HASH_SILENT 8
+#define HASH_STREAM 16
 int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
 int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
 
diff --git a/object-file.c b/object-file.c
index eb972cdccd..06375a90d6 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1898,7 +1898,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	stream.next_in = (void *)buf;
+	if (flags & HASH_STREAM) {
+		struct input_stream *in_stream = (struct input_stream *)buf;
+		stream.next_in = (void *)in_stream->read(in_stream, &len);
+	} else {
+		stream.next_in = (void *)buf;
+	}
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..ccc1fc9c1a 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (2 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 1/6] object-file: refactor write_loose_object() to support read from stream Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-13  7:32       ` Ævar Arnfjörð Bjarmason
  2021-12-10 10:34     ` [PATCH v5 3/6] object-file.c: read stream in a loop " Han Xin
                       ` (3 subsequent siblings)
  7 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance. So we need to handle an undetermined oid in the
"write_loose_object()" function.

In the original implementation, we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in ".git/objects/"
directory instead.

The promise that "oid" is constant in "write_loose_object()" has been
removed because it will be filled after reading all stream data.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 48 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/object-file.c b/object-file.c
index 06375a90d6..41099b137f 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,11 +1860,11 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
+static int write_loose_object(struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
 {
-	int fd, ret;
+	int fd, ret, err = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1872,16 +1872,21 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (flags & HASH_STREAM)
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_addf(&filename, "%s/", get_object_directory());
+	else
+		loose_object_path(the_repository, &filename, oid);
 
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
 		if (flags & HASH_SILENT)
-			return -1;
+			err = -1;
 		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
 		else
-			return error_errno(_("unable to create temporary file"));
+			err = error_errno(_("unable to create temporary file"));
+		goto cleanup;
 	}
 
 	/* Set it up */
@@ -1923,12 +1928,34 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!(flags & HASH_STREAM) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
+	if (flags & HASH_STREAM) {
+		int dirlen;
+
+		oidcpy((struct object_id *)oid, &parano_oid);
+		loose_object_path(the_repository, &filename, oid);
+
+		/* We finally know the object path, and create the missing dir. */
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				err = -1;
+			else if (adjust_shared_perm(dir.buf))
+				err = -1;
+			else
+				strbuf_release(&dir);
+			if (err < 0)
+				goto cleanup;
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
@@ -1938,7 +1965,10 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 			warning_errno(_("failed utime() on %s"), tmp_file.buf);
 	}
 
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&filename);
+	return err;
 }
 
 static int freshen_loose_object(const struct object_id *oid)
@@ -2015,7 +2045,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object((struct object_id*) oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
 	return ret;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v5 3/6] object-file.c: read stream in a loop in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (3 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-10 10:34     ` [PATCH v5 4/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
                       ` (2 subsequent siblings)
  7 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In order to prepare the stream version of "write_loose_object()", read
the input stream in a loop in "write_loose_object()", so that we can
feed the contents of large blob object to "write_loose_object()" using
a small fixed buffer.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/object-file.c b/object-file.c
index 41099b137f..455ab3c06e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1864,7 +1864,7 @@ static int write_loose_object(struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
 {
-	int fd, ret, err = 0;
+	int fd, ret, err = 0, flush = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1903,22 +1903,29 @@ static int write_loose_object(struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	if (flags & HASH_STREAM) {
-		struct input_stream *in_stream = (struct input_stream *)buf;
-		stream.next_in = (void *)in_stream->read(in_stream, &len);
-	} else {
+	if (!(flags & HASH_STREAM)) {
 		stream.next_in = (void *)buf;
+		stream.avail_in = len;
+		flush = Z_FINISH;
 	}
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (flags & HASH_STREAM && !stream.avail_in) {
+			struct input_stream *in_stream = (struct input_stream *)buf;
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (len + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
 		stream.next_out = compressed;
 		stream.avail_out = sizeof(compressed);
-	} while (ret == Z_OK);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v5 4/6] unpack-objects.c: add dry_run mode for get_data()
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (4 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 3/6] object-file.c: read stream in a loop " Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-10 10:34     ` [PATCH v5 5/6] object-file.c: make "write_object_file_flags()" to support "HASH_STREAM" Han Xin
  2021-12-10 10:34     ` [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  7 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used to verify the inflation of data,
and the returned buffer will not be used at all and will be freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, only allocate a
low memory footprint when calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..d878e2f8b4 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,16 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +125,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +329,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +363,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +402,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v5 5/6] object-file.c: make "write_object_file_flags()" to support "HASH_STREAM"
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (5 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 4/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-10 10:34     ` [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  7 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We will use "write_object_file_flags()" in "unpack_non_delta_entry()" to
read the entire data contents in stream. When read in stream, we needn't
prepare "oid" before "write_loose_object()", only generate the header.

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/object-file.c b/object-file.c
index 455ab3c06e..906590dae5 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2002,6 +2002,11 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	if (flags & HASH_STREAM) {
+		/* Generate the header */
+		hdrlen = xsnprintf(hdr, hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+		return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	}
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (6 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 5/6] object-file.c: make "write_object_file_flags()" to support "HASH_STREAM" Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-13  8:05       ` Ævar Arnfjörð Bjarmason
  7 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an entire
buffer will have a 10% performance penalty. Therefore, only unpack objects
larger than the "core.BigFileStreamingThreshold" in zstream. See the following
benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git'

    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 Documentation/config/core.txt       | 11 +++++
 builtin/unpack-objects.c            | 70 ++++++++++++++++++++++++++++-
 cache.h                             |  1 +
 config.c                            |  5 +++
 environment.c                       |  1 +
 t/t5590-unpack-non-delta-objects.sh | 70 +++++++++++++++++++++++++++++
 6 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a..601b7a2418 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will when be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold' setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index d878e2f8b4..0df115ab0d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -326,11 +326,79 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if ((ret = write_object_file_flags(&in_stream, size, type_name(OBJ_BLOB) ,&obj_list[nr].oid, HASH_STREAM)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/cache.h b/cache.h
index 51bd435dea..78548cd67a 100644
--- a/cache.h
+++ b/cache.h
@@ -965,6 +965,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a70..7b122a142a 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 9da7f3c1a1..4fcc3de741 100644
--- a/environment.c
+++ b/environment.c
@@ -46,6 +46,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..ff4c78900b
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,70 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileStreamingThreshold $1
+	git -C dest.git config core.bigFileThreshold $1
+}
+
+test_expect_success "setup repo with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --revs test)
+'
+
+test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	prepare_dest 2m &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_file_not_empty actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.0


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-10 10:34     ` [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-13  7:32       ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-13  7:32 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 10 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle undetermined oid in function
> "write_loose_object()".
>
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in ".git/objects/"
> directory instead.
>
> The promise that "oid" is constant in "write_loose_object()" has been
> removed because it will be filled after reading all stream data.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 48 +++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 39 insertions(+), 9 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 06375a90d6..41099b137f 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,11 +1860,11 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> -static int write_loose_object(const struct object_id *oid, char *hdr,
> +static int write_loose_object(struct object_id *oid, char *hdr,
>  			      int hdrlen, const void *buf, unsigned long len,
>  			      time_t mtime, unsigned flags)
>  {
> -	int fd, ret;
> +	int fd, ret, err = 0;
>  	unsigned char compressed[4096];
>  	git_zstream stream;
>  	git_hash_ctx c;
> @@ -1872,16 +1872,21 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	static struct strbuf tmp_file = STRBUF_INIT;
>  	static struct strbuf filename = STRBUF_INIT;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (flags & HASH_STREAM)
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_addf(&filename, "%s/", get_object_directory());
> +	else
> +		loose_object_path(the_repository, &filename, oid);
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
>  		if (flags & HASH_SILENT)
> -			return -1;
> +			err = -1;
>  		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +			err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
>  		else
> -			return error_errno(_("unable to create temporary file"));
> +			err = error_errno(_("unable to create temporary file"));
> +		goto cleanup;
>  	}
>  
>  	/* Set it up */
> @@ -1923,12 +1928,34 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!(flags & HASH_STREAM) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));

Here we don't have a meaningful "const" OID anymore, but still if we die
we use the "oid". 

>  	close_loose_object(fd);
>  
> +	if (flags & HASH_STREAM) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);

This cast isn't needed anymore now that you stripped the "const" off,
but more on that later...

> +		loose_object_path(the_repository, &filename, oid);
> +
> +		/* We finally know the object path, and create the missing dir. */
> +		dirlen = directory_size(filename.buf);
> +		if (dirlen) {
> +			struct strbuf dir = STRBUF_INIT;
> +			strbuf_add(&dir, filename.buf, dirlen - 1);
> +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +				err = -1;
> +			else if (adjust_shared_perm(dir.buf))
> +				err = -1;
> +			else
> +				strbuf_release(&dir);
> +			if (err < 0)
> +				goto cleanup;

Can't we use one of the existing utility functions for this? Testing
locally I could replace this with:
	
	diff --git a/object-file.c b/object-file.c
	index 7c93db11b2d..05e1fae893d 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1952,14 +1952,11 @@ static int write_loose_object(struct object_id *oid, char *hdr,
	 		if (dirlen) {
	 			struct strbuf dir = STRBUF_INIT;
	 			strbuf_add(&dir, filename.buf, dirlen - 1);
	-			if (mkdir(dir.buf, 0777) && errno != EEXIST)
	+			
	+			if (mkdir_in_gitdir(dir.buf) < 0) {
	 				err = -1;
	-			else if (adjust_shared_perm(dir.buf))
	-				err = -1;
	-			else
	-				strbuf_release(&dir);
	-			if (err < 0)
	 				goto cleanup;
	+			}
	 		}
	 	}

And your tests still pass. Maybe they have a blind spot, or maybe we can
just use the existing function.
	 
> +		}
> +	}
> +
>  	if (mtime) {
>  		struct utimbuf utb;
>  		utb.actime = mtime;
> @@ -1938,7 +1965,10 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  			warning_errno(_("failed utime() on %s"), tmp_file.buf);
>  	}
>  
> -	return finalize_object_file(tmp_file.buf, filename.buf);
> +	err = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +	strbuf_release(&filename);
> +	return err;
>  }

Reading this series is an odd mixture of of things that would really be
much easier to understand if they were combined, e.g. 1/6 adding APIs
that aren't used by anything, but then adding one codepath (also
unused), that we then use later. Could just add it at the same time as
the use and the patch would be easier to read....

...and then this, which *is* something that could be split up into an
earlier cleanup step, i.e. the strbuf leak here exists before this
series, fixing it is good, but splitting that up into its own patch
would make this diff smaller & the actual behavior changes easier to
reason about.

>  static int freshen_loose_object(const struct object_id *oid)
> @@ -2015,7 +2045,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
>  	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +	ret = write_loose_object((struct object_id*) oid, hdr, hdrlen, buf, len, mtime, 0);
>  	free(buf);
>  
>  	return ret;

 ...on the "more on that later", here we're casting the "oid" from const
 for a function that's never going to be involved in the streaming
 codepath.

I know I suggested the HASH_STREAM flag, but what I was really going for
was "let's share more of the code?", looking at this v5 (which is
already much better than v4) I think a better approach is to split up
write_loose_object().

I.e. it already calls close_loose_object() and finalize_object_file() to
do some of its work, but around that we have:

 1. Figuring out a path for the (temp) object file
 2. Creating the tempfile
 3. Setting up zlib
 4. Once zlib is set up inspect its state, die with a message
    about oid_to_hex(oid) if we failed
 5. Optionally, do HASH_STREAM stuff
    Maybe force a loose object if "mtime".

I think if that's split up so that each of those is its own little
function what's now write_loose_object() can call those in sequence, and
a new stream_loose_object() can just do #1 differentl, followed by the
same #2 and #4, but do #4 differently etc.

You'll still be able to re-use the write_object_file_prepare()
etc. logic.

As an example your 5/6 copy/pastes the xsnprintf() formatting of the
object header. It's just one line, but it's also code that's very
central to git, so I think instead of just copy/pasting it a prep step
of factoring it out would make sense, and that would be a prep cleanup
that would help later readability. E.g.:
	
	diff --git a/object-file.c b/object-file.c
	index eac67f6f5f9..a7dcbd929e9 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1009,6 +1009,13 @@ void *xmmap(void *start, size_t length,
	 	return ret;
	 }
	 
	+static int generate_object_header(char *buf, int bufsz, const char *type_name,
	+				  unsigned long size)
	+{
	+	return xsnprintf(buf, bufsz, "%s %"PRIuMAX , type_name,
	+			 (uintmax_t)size) + 1;
	+}
	+
	 /*
	  * With an in-core object data in "map", rehash it to make sure the
	  * object name actually matches "oid" to detect object corruption.
	@@ -1037,7 +1044,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
	 		return -1;
	 
	 	/* Generate the header */
	-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
	+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(obj_type), size);
	 
	 	/* Sha1.. */
	 	r->hash_algo->init_fn(&c);
	@@ -1737,7 +1744,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
	 	git_hash_ctx c;
	 
	 	/* Generate the header */
	-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
	+	*hdrlen = generate_object_header(hdr, *hdrlen, type, len);
	 
	 	/* Sha1.. */
	 	algo->init_fn(&c);
	@@ -2009,7 +2016,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
	 	buf = read_object(the_repository, oid, &type, &len);
	 	if (!buf)
	 		return error(_("cannot read object for %s"), oid_to_hex(oid));
	-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
	+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(type), len);
	 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
	 	free(buf);

Then in your change on top you just call that generate_object_header(),
or better yet your amended write_object_file_flags() can just call a
similarly amended write_object_file_prepare() directly.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-10 10:34     ` [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-13  8:05       ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-13  8:05 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 10 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [...]
> +	if ((ret = write_object_file_flags(&in_stream, size, type_name(OBJ_BLOB) ,&obj_list[nr].oid, HASH_STREAM)))

There's some odd code formatting here, i.e.. ") ,&" not "), &". Could
also use line-wrapping at 79 characters.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v5 0/6] unpack large blobs in stream
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-21 11:51         ` [PATCH v7 0/5] " Han Xin
                           ` (12 more replies)
  2021-12-17 11:26       ` [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object() Han Xin
                         ` (5 subsequent siblings)
  6 siblings, 13 replies; 165+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v5:
* Refactor write_loose_object() to reuse in stream version, suggested by
  Ævar Arnfjörð Bjarmason [1].

* Add a new testcase into t5590-unpack-non-delta-objects to cover the case of
  unpacking existing objects.

* Fix code formatting in unpack-objects.c, suggested by
  Ævar Arnfjörð Bjarmason [2].

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
2. https://lore.kernel.org/git/211213.867dc8ansq.gmgdl@evledraar.gmail.com/

Han Xin (6):
  object-file.c: release strbuf in write_loose_object()
  object-file.c: refactor object header generation into a function
  object-file.c: refactor write_loose_object() to reuse in stream
    version
  object-file.c: make "write_object_file_flags()" to support read in
    stream
  unpack-objects.c: add dry_run mode for get_data()
  unpack-objects: unpack_non_delta_entry() read data in a stream

 Documentation/config/core.txt       |  11 ++
 builtin/unpack-objects.c            |  94 ++++++++++++-
 cache.h                             |   2 +
 config.c                            |   5 +
 environment.c                       |   1 +
 object-file.c                       | 207 +++++++++++++++++++++++-----
 object-store.h                      |   5 +
 t/t5590-unpack-non-delta-objects.sh |  87 ++++++++++++
 8 files changed, 370 insertions(+), 42 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v5:
1:  f3595e68cc < -:  ---------- object-file: refactor write_loose_object() to support read from stream
2:  c25fdd1fe5 < -:  ---------- object-file.c: handle undetermined oid in write_loose_object()
3:  ed226f2f9f < -:  ---------- object-file.c: read stream in a loop in write_loose_object()
-:  ---------- > 1:  59d35dac5f object-file.c: release strbuf in write_loose_object()
-:  ---------- > 2:  2174a6cbad object-file.c: refactor object header generation into a function
-:  ---------- > 3:  8a704ecc59 object-file.c: refactor write_loose_object() to reuse in stream version
-:  ---------- > 4:  96f05632a2 object-file.c: make "write_object_file_flags()" to support read in stream
4:  2f91e540f6 ! 5:  1acbb6e849 unpack-objects.c: add dry_run mode for get_data()
    @@ builtin/unpack-objects.c: static void use(int bytes)
      {
      	git_zstream stream;
     -	void *buf = xmallocz(size);
    -+	unsigned long bufsize = dry_run ? 8192 : size;
    -+	void *buf = xmallocz(bufsize);
    ++	unsigned long bufsize;
    ++	void *buf;
      
      	memset(&stream, 0, sizeof(stream));
    ++	if (dry_run && size > 8192)
    ++		bufsize = 8192;
    ++	else
    ++		bufsize = size;
    ++	buf = xmallocz(bufsize);
      
      	stream.next_out = buf;
     -	stream.avail_out = size;
5:  7698938eac < -:  ---------- object-file.c: make "write_object_file_flags()" to support "HASH_STREAM"
6:  92d69cb84a ! 6:  476aaba527 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	int status;
     +};
     +
    -+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
    ++static const void *feed_input_zstream(const struct input_stream *in_stream,
    ++				      unsigned long *readlen)
     +{
     +	struct input_zstream_data *data = in_stream->data;
     +	git_zstream *zstream = data->zstream;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +		.read = feed_input_zstream,
     +		.data = &data,
     +	};
    -+	int ret;
     +
     +	memset(&zstream, 0, sizeof(zstream));
     +	memset(&data, 0, sizeof(data));
     +	data.zstream = &zstream;
     +	git_inflate_init(&zstream);
     +
    -+	if ((ret = write_object_file_flags(&in_stream, size, type_name(OBJ_BLOB) ,&obj_list[nr].oid, HASH_STREAM)))
    -+		die(_("failed to write object in stream %d"), ret);
    ++	if (write_object_file_flags(&in_stream, size,
    ++				    type_name(OBJ_BLOB),
    ++				    &obj_list[nr].oid,
    ++				    HASH_STREAM))
    ++		die(_("failed to write object in stream"));
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
     +		die(_("inflate returned %d"), data.status);
     +	git_inflate_end(&zstream);
     +
    -+	if (strict && !dry_run) {
    ++	if (strict) {
     +		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
     +		if (blob)
     +			blob->object.flags |= FLAG_WRITTEN;
     +		else
    -+			die("invalid blob object from stream");
    ++			die(_("invalid blob object from stream"));
     +	}
     +	obj_list[nr].obj = NULL;
     +}
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +prepare_dest () {
     +	test_when_finished "rm -rf dest.git" &&
     +	git init --bare dest.git &&
    -+	git -C dest.git config core.bigFileStreamingThreshold $1
    ++	git -C dest.git config core.bigFileStreamingThreshold $1 &&
     +	git -C dest.git config core.bigFileThreshold $1
     +}
     +
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +	test_cmp expect actual
     +'
     +
    ++test_expect_success 'unpack big object in stream with existing oids' '
    ++	prepare_dest 1m &&
    ++	git -C dest.git index-pack --stdin <test-$PACK.pack &&
    ++	(
    ++		cd dest.git &&
    ++		find objects/?? -type f | sort
    ++	) >actual &&
    ++	test_must_be_empty actual &&
    ++	git -C dest.git unpack-objects <test-$PACK.pack &&
    ++	git -C dest.git fsck &&
    ++	(
    ++		cd dest.git &&
    ++		find objects/?? -type f | sort
    ++	) >actual &&
    ++	test_must_be_empty actual
    ++'
    ++
     +test_expect_success 'unpack-objects dry-run' '
     +	prepare_dest 1m &&
     +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object()
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
  2021-12-17 11:26       ` Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-17 19:28         ` René Scharfe
  2021-12-17 11:26       ` [PATCH v6 2/6] object-file.c: refactor object header generation into a function Han Xin
                         ` (4 subsequent siblings)
  6 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Fix a strbuf leak in "write_loose_object()" suggested by
Ævar Arnfjörð Bjarmason.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb1426f98c..32acf1dad6 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1874,11 +1874,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
 		if (flags & HASH_SILENT)
-			return -1;
+			ret = -1;
 		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			ret = error(_("insufficient permission for adding an "
+				      "object to repository database %s"),
+				    get_object_directory());
 		else
-			return error_errno(_("unable to create temporary file"));
+			ret = error_errno(_("unable to create temporary file"));
+		goto cleanup;
 	}
 
 	/* Set it up */
@@ -1930,7 +1933,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 			warning_errno(_("failed utime() on %s"), tmp_file.buf);
 	}
 
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	ret = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&filename);
+	strbuf_release(&tmp_file);
+	return ret;
 }
 
 static int freshen_loose_object(const struct object_id *oid)
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v6 2/6] object-file.c: refactor object header generation into a function
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
  2021-12-17 11:26       ` Han Xin
  2021-12-17 11:26       ` [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object() Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
  2021-12-17 11:26       ` [PATCH v6 3/6] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
                         ` (3 subsequent siblings)
  6 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

There are 3 places where "xsnprintf" is used to generate the object
header, and I originally planned to add a fourth in a later patch.

According to Ævar Arnfjörð Bjarmason’s suggestion, although it's just
one line, it's also code that's very central to git, so refactor them
into a function which will help later readability.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/object-file.c b/object-file.c
index 32acf1dad6..95fcd5435d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+static inline int generate_object_header(char *buf, int bufsz,
+					 const char *type_name,
+					 unsigned long size)
+{
+	return xsnprintf(buf, bufsz, "%s %"PRIuMAX, type_name,
+			 (uintmax_t)size) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(obj_type), size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = generate_object_header(hdr, *hdrlen, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2013,7 +2021,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(type), len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v6 3/6] object-file.c: refactor write_loose_object() to reuse in stream version
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                         ` (2 preceding siblings ...)
  2021-12-17 11:26       ` [PATCH v6 2/6] object-file.c: refactor object header generation into a function Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-17 11:26       ` [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream Han Xin
                         ` (2 subsequent siblings)
  6 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "stream_loose_object()" in a
stream instead of reading it all into a single buffer.

As this new method "stream_loose_object()" has many similarities with
"write_loose_object()", we split up "write_loose_object()" into some
steps:
 1. Figuring out a path for the (temp) object file.
 2. Creating the tempfile.
 3. Setting up zlib and write header.
 4. Write object data and handle errors.
 5. Optionally, do something after write, maybe force a loose object if
"mtime".

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 98 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/object-file.c b/object-file.c
index 95fcd5435d..dd29e5372e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1751,6 +1751,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	algo->final_oid_fn(oid, &c);
 }
 
+/*
+ * Move the just written object with proper mtime into its final resting place.
+ */
+static int finalize_object_file_with_mtime(const char *tmpfile,
+					   const char *filename,
+					   time_t mtime,
+					   unsigned flags)
+{
+	struct utimbuf utb;
+
+	if (mtime) {
+		utb.actime = mtime;
+		utb.modtime = mtime;
+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
+			warning_errno(_("failed utime() on %s"), tmpfile);
+	}
+	return finalize_object_file(tmpfile, filename);
+}
+
 /*
  * Move the just written object into its final resting place.
  */
@@ -1836,7 +1855,8 @@ static inline int directory_size(const char *filename)
  * We want to avoid cross-directory filename renames, because those
  * can have problems on various filesystems (FAT, NFS, Coda).
  */
-static int create_tmpfile(struct strbuf *tmp, const char *filename)
+static int create_tmpfile(struct strbuf *tmp, const char *filename,
+			  unsigned flags)
 {
 	int fd, dirlen = directory_size(filename);
 
@@ -1844,7 +1864,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	strbuf_add(tmp, filename, dirlen);
 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
-	if (fd < 0 && dirlen && errno == ENOENT) {
+	do {
+		if (fd >= 0 || !dirlen || errno != ENOENT)
+			break;
 		/*
 		 * Make sure the directory exists; note that the contents
 		 * of the buffer are undefined after mkstemp returns an
@@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 		strbuf_reset(tmp);
 		strbuf_add(tmp, filename, dirlen - 1);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
-			return -1;
+			break;
 		if (adjust_shared_perm(tmp->buf))
-			return -1;
+			break;
 
 		/* Try again */
 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 		fd = git_mkstemp_mode(tmp->buf, 0444);
+	} while (0);
+
+	if (fd < 0 && !(flags & HASH_SILENT)) {
+		if (errno == EACCES)
+			return error(_("insufficient permission for adding an "
+				       "object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(_("unable to create temporary file"));
 	}
+
 	return fd;
 }
 
+static void setup_stream_and_header(git_zstream *stream,
+				    unsigned char *compressed,
+				    unsigned long compressed_size,
+				    git_hash_ctx *c,
+				    char *hdr,
+				    int hdrlen)
+{
+	/* Set it up */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = compressed;
+	stream->avail_out = compressed_size;
+	the_hash_algo->init_fn(c);
+
+	/* First header.. */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1879,31 +1932,15 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
 	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			ret = -1;
-		else if (errno == EACCES)
-			ret = error(_("insufficient permission for adding an "
-				      "object to repository database %s"),
-				    get_object_directory());
-		else
-			ret = error_errno(_("unable to create temporary file"));
+		ret = -1;
 		goto cleanup;
 	}
 
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1932,16 +1969,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	close_loose_object(fd);
 
-	if (mtime) {
-		struct utimbuf utb;
-		utb.actime = mtime;
-		utb.modtime = mtime;
-		if (utime(tmp_file.buf, &utb) < 0 &&
-		    !(flags & HASH_SILENT))
-			warning_errno(_("failed utime() on %s"), tmp_file.buf);
-	}
-
-	ret = finalize_object_file(tmp_file.buf, filename.buf);
+	ret = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
 cleanup:
 	strbuf_release(&filename);
 	strbuf_release(&tmp_file);
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                         ` (3 preceding siblings ...)
  2021-12-17 11:26       ` [PATCH v6 3/6] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-17 22:52         ` René Scharfe
  2021-12-17 11:26       ` [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-12-17 11:26       ` [PATCH v6 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "stream_loose_object()" in a
stream. The input stream is implemented as an interface.

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance. So we need to handle undetermined oid in a new function called
"stream_loose_object()".

In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in ".git/objects/"
directory instead.

We will reuse "write_object_file_flags()" in "unpack_non_delta_entry()" to
read the entire data contents in stream, so a new flag "HASH_STREAM" is
added. When read in stream, we needn't prepare the "oid" before
"write_loose_object()", only generate the header.
"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 cache.h        |  1 +
 object-file.c  | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |  5 +++
 3 files changed, 98 insertions(+)

diff --git a/cache.h b/cache.h
index cfba463aa9..6d68fd10a3 100644
--- a/cache.h
+++ b/cache.h
@@ -898,6 +898,7 @@ int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
 #define HASH_FORMAT_CHECK 2
 #define HASH_RENORMALIZE  4
 #define HASH_SILENT 8
+#define HASH_STREAM 16
 int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
 int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
 
diff --git a/object-file.c b/object-file.c
index dd29e5372e..2ef1d4fb00 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1994,6 +1994,88 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
+			       const struct input_stream *in_stream,
+			       unsigned long len, time_t mtime, unsigned flags)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct object_id parano_oid;
+	static struct strbuf tmp_file = STRBUF_INIT;
+	static struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+
+	/* When oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (len + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (ret != Z_STREAM_END)
+		die(_("unable to deflate new object streamingly (%d)"), ret);
+	ret = git_deflate_end_gently(&stream);
+	if (ret != Z_OK)
+		die(_("deflateEnd on object streamingly failed (%d)"), ret);
+	the_hash_algo->final_oid_fn(&parano_oid, &c);
+
+	close_loose_object(fd);
+
+	oidcpy(oid, &parano_oid);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen - 1);
+
+		if (mkdir_in_gitdir(dir.buf) < 0) {
+			err = -1;
+			goto cleanup;
+		}
+	}
+
+	err = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags)
@@ -2001,6 +2083,16 @@ int write_object_file_flags(const void *buf, unsigned long len,
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
 
+	/* When streaming a large blob object (marked as HASH_STREAM),
+	 * we have no chance to run "write_object_file_prepare()" to
+	 * calculate the "oid" in advance.  Call "stream_loose_object()"
+	 * to write loose object in stream.
+	 */
+	if (flags & HASH_STREAM) {
+		hdrlen = generate_object_header(hdr, hdrlen, type, len);
+		return stream_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	}
+
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
 	 */
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..4040e2c40a 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(const struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data()
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                         ` (4 preceding siblings ...)
  2021-12-17 11:26       ` [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-17 21:22         ` René Scharfe
  2021-12-17 11:26       ` [PATCH v6 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used to verify the inflation of data,
and the returned buffer will not be used at all and will be freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, only allocate a
low memory footprint when calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..c4a17bdb44 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,21 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +130,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +334,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +368,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +407,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v6 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                         ` (5 preceding siblings ...)
  2021-12-17 11:26       ` [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-17 11:26       ` Han Xin
  6 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an
entire buffer will have a 10% performance penalty. Therefore, only unpack
objects larger than the "core.bigFileStreamingThreshold" in zstream. See
the following benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git'

    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 Documentation/config/core.txt       | 11 ++++
 builtin/unpack-objects.c            | 73 +++++++++++++++++++++++-
 cache.h                             |  1 +
 config.c                            |  5 ++
 environment.c                       |  1 +
 t/t5590-unpack-non-delta-objects.sh | 87 +++++++++++++++++++++++++++++
 6 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a..601b7a2418 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will when be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold' setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index c4a17bdb44..42e1033d85 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -331,11 +331,82 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(const struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (write_object_file_flags(&in_stream, size,
+				    type_name(OBJ_BLOB),
+				    &obj_list[nr].oid,
+				    HASH_STREAM))
+		die(_("failed to write object in stream"));
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/cache.h b/cache.h
index 6d68fd10a3..976f9cf656 100644
--- a/cache.h
+++ b/cache.h
@@ -975,6 +975,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a70..7b122a142a 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 0d06a31024..04bba593de 100644
--- a/environment.c
+++ b/environment.c
@@ -47,6 +47,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..11c70e192c
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,87 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileStreamingThreshold $1 &&
+	git -C dest.git config core.bigFileThreshold $1
+}
+
+test_expect_success "setup repo with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --revs test)
+'
+
+test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	prepare_dest 2m &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_file_not_empty actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream with existing oids' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object()
  2021-12-17 11:26       ` [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object() Han Xin
@ 2021-12-17 19:28         ` René Scharfe
  2021-12-18  0:09           ` Junio C Hamano
  0 siblings, 1 reply; 165+ messages in thread
From: René Scharfe @ 2021-12-17 19:28 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 17.12.21 um 12:26 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Fix a strbuf leak in "write_loose_object()" sugguested by
> Ævar Arnfjörð Bjarmason.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb1426f98c..32acf1dad6 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1874,11 +1874,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,


Relevant context lines:

	static struct strbuf tmp_file = STRBUF_INIT;
	static struct strbuf filename = STRBUF_INIT;

	loose_object_path(the_repository, &filename, oid);

>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
>  		if (flags & HASH_SILENT)
> -			return -1;
> +			ret = -1;
>  		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +			ret = error(_("insufficient permission for adding an "
> +				      "object to repository database %s"),
> +				    get_object_directory());
>  		else
> -			return error_errno(_("unable to create temporary file"));
> +			ret = error_errno(_("unable to create temporary file"));
> +		goto cleanup;
>  	}
>
>  	/* Set it up */
> @@ -1930,7 +1933,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  			warning_errno(_("failed utime() on %s"), tmp_file.buf);
>  	}
>
> -	return finalize_object_file(tmp_file.buf, filename.buf);
> +	ret = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +	strbuf_release(&filename);
> +	strbuf_release(&tmp_file);

There was no leak before.  Both strbufs are static and both functions
they are passed to (loose_object_path() and create_tmpfile()) reset
them first.  So while the allocated memory was not released before,
it was reused.

Not sure if making write_loose_object() allocate and release these
buffers on every call has much of a performance impact.  The only
reason I can think of for wanting such a change is to get rid of the
static buffers, to allow the function to be used by concurrent
threads.

So I think either keeping the code as-is or also making the strbufs
non-static would be better (but then discussing a possible
performance impact in the commit message would be nice).

> +	return ret;
>  }
>
>  static int freshen_loose_object(const struct object_id *oid)


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data()
  2021-12-17 11:26       ` [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-17 21:22         ` René Scharfe
  0 siblings, 0 replies; 165+ messages in thread
From: René Scharfe @ 2021-12-17 21:22 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 17.12.21 um 12:26 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> In dry_run mode, "get_data()" is used to verify the inflation of data,
> and the returned buffer will not be used at all and will be freed
> immediately. Even in dry_run mode, it is dangerous to allocate a
> full-size buffer for a large blob object. Therefore, only allocate a
> low memory footprint when calling "get_data()" in dry_run mode.

Clever.  Looks good to me.

For some reason I was expecting this patch to have some connection to
one of the earlier ones (perhaps because get_data() was mentioned),
but it is technically independent.

>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c | 23 +++++++++++++++++------
>  1 file changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..c4a17bdb44 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,21 @@ static void use(int bytes)
>  	display_throughput(progress, consumed_bytes);
>  }
>
> -static void *get_data(unsigned long size)
> +static void *get_data(unsigned long size, int dry_run)
>  {
>  	git_zstream stream;
> -	void *buf = xmallocz(size);
> +	unsigned long bufsize;
> +	void *buf;
>
>  	memset(&stream, 0, sizeof(stream));
> +	if (dry_run && size > 8192)
> +		bufsize = 8192;
> +	else
> +		bufsize = size;
> +	buf = xmallocz(bufsize);
>
>  	stream.next_out = buf;
> -	stream.avail_out = size;
> +	stream.avail_out = bufsize;
>  	stream.next_in = fill(1);
>  	stream.avail_in = len;
>  	git_inflate_init(&stream);
> @@ -124,6 +130,11 @@ static void *get_data(unsigned long size)
>  		}
>  		stream.next_in = fill(1);
>  		stream.avail_in = len;
> +		if (dry_run) {
> +			/* reuse the buffer in dry_run mode */
> +			stream.next_out = buf;
> +			stream.avail_out = bufsize;
> +		}
>  	}
>  	git_inflate_end(&stream);
>  	return buf;
> @@ -323,7 +334,7 @@ static void added_object(unsigned nr, enum object_type type,
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>  				   unsigned nr)
>  {
> -	void *buf = get_data(size);
> +	void *buf = get_data(size, dry_run);
>
>  	if (!dry_run && buf)
>  		write_object(nr, type, buf, size);
> @@ -357,7 +368,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  	if (type == OBJ_REF_DELTA) {
>  		oidread(&base_oid, fill(the_hash_algo->rawsz));
>  		use(the_hash_algo->rawsz);
> -		delta_data = get_data(delta_size);
> +		delta_data = get_data(delta_size, dry_run);
>  		if (dry_run || !delta_data) {
>  			free(delta_data);
>  			return;
> @@ -396,7 +407,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
>  			die("offset value out of bound for delta base object");
>
> -		delta_data = get_data(delta_size);
> +		delta_data = get_data(delta_size, dry_run);
>  		if (dry_run || !delta_data) {
>  			free(delta_data);
>  			return;


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream
  2021-12-17 11:26       ` [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream Han Xin
@ 2021-12-17 22:52         ` René Scharfe
  0 siblings, 0 replies; 165+ messages in thread
From: René Scharfe @ 2021-12-17 22:52 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 17.12.21 um 12:26 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "stream_loose_object()" in a
> stream. The input stream is implemented as an interface.
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle undetermined oid in a new function called
> "stream_loose_object()".
>
> In "write_loose_object()", we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in ".git/objects/"
> directory instead.
>
> We will reuse "write_object_file_flags()" in "unpack_non_delta_entry()" to
> read the entire data contents in stream, so a new flag "HASH_STREAM" is
> added. When read in stream, we needn't prepare the "oid" before
> "write_loose_object()", only generate the header.
> "freshen_packed_object()" or "freshen_loose_object()" will be called
> inside "stream_loose_object()" after obtaining the "oid".
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  cache.h        |  1 +
>  object-file.c  | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  object-store.h |  5 +++
>  3 files changed, 98 insertions(+)
>
> diff --git a/cache.h b/cache.h
> index cfba463aa9..6d68fd10a3 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -898,6 +898,7 @@ int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
>  #define HASH_FORMAT_CHECK 2
>  #define HASH_RENORMALIZE  4
>  #define HASH_SILENT 8
> +#define HASH_STREAM 16
>  int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
>  int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
>
> diff --git a/object-file.c b/object-file.c
> index dd29e5372e..2ef1d4fb00 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1994,6 +1994,88 @@ static int freshen_packed_object(const struct object_id *oid)
>  	return 1;
>  }
>
> +static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
> +			       const struct input_stream *in_stream,
> +			       unsigned long len, time_t mtime, unsigned flags)
> +{
> +	int fd, ret, err = 0, flush = 0;
> +	unsigned char compressed[4096];
> +	git_zstream stream;
> +	git_hash_ctx c;
> +	struct object_id parano_oid;
> +	static struct strbuf tmp_file = STRBUF_INIT;
> +	static struct strbuf filename = STRBUF_INIT;

Note these static strbufs.

> +	int dirlen;
> +
> +	/* When oid is not determined, save tmp file to odb path. */
> +	strbuf_addf(&filename, "%s/", get_object_directory());
> +
> +	fd = create_tmpfile(&tmp_file, filename.buf, flags);
> +	if (fd < 0) {
> +		err = -1;
> +		goto cleanup;
> +	}
> +
> +	/* Set it up and write header */
> +	setup_stream_and_header(&stream, compressed, sizeof(compressed),
> +				&c, hdr, hdrlen);
> +
> +	/* Then the data itself.. */
> +	do {
> +		unsigned char *in0 = stream.next_in;
> +		if (!stream.avail_in) {
> +			const void *in = in_stream->read(in_stream, &stream.avail_in);
> +			stream.next_in = (void *)in;
> +			in0 = (unsigned char *)in;
> +			/* All data has been read. */
> +			if (len + hdrlen == stream.total_in + stream.avail_in)
> +				flush = Z_FINISH;
> +		}
> +		ret = git_deflate(&stream, flush);
> +		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> +		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> +			die(_("unable to write loose object file"));
> +		stream.next_out = compressed;
> +		stream.avail_out = sizeof(compressed);
> +	} while (ret == Z_OK || ret == Z_BUF_ERROR);
> +
> +	if (ret != Z_STREAM_END)
> +		die(_("unable to deflate new object streamingly (%d)"), ret);
> +	ret = git_deflate_end_gently(&stream);
> +	if (ret != Z_OK)
> +		die(_("deflateEnd on object streamingly failed (%d)"), ret);
> +	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +
> +	close_loose_object(fd);
> +
> +	oidcpy(oid, &parano_oid);
> +
> +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
> +		unlink_or_warn(tmp_file.buf);
> +		goto cleanup;
> +	}
> +
> +	loose_object_path(the_repository, &filename, oid);
> +
> +	/* We finally know the object path, and create the missing dir. */
> +	dirlen = directory_size(filename.buf);
> +	if (dirlen) {
> +		struct strbuf dir = STRBUF_INIT;
> +		strbuf_add(&dir, filename.buf, dirlen - 1);
> +
> +		if (mkdir_in_gitdir(dir.buf) < 0) {
> +			err = -1;
> +			goto cleanup;
> +		}
> +	}
> +
> +	err = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
> +cleanup:
> +	strbuf_release(&tmp_file);
> +	strbuf_release(&filename);

The static strbufs are released here.  That combination is strange --
why keep the variable values between calls by making them static, but
throw away the allocated buffers instead of reusing them?

Given that this function is only used for huge objects I think making
the strbufs non-static and releasing them is the best choice here.

> +	return err;
> +}
> +
>  int write_object_file_flags(const void *buf, unsigned long len,
>  			    const char *type, struct object_id *oid,
>  			    unsigned flags)
> @@ -2001,6 +2083,16 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen = sizeof(hdr);
>
> +	/* When streaming a large blob object (marked as HASH_STREAM),
> +	 * we have no chance to run "write_object_file_prepare()" to
> +	 * calculate the "oid" in advance.  Call "stream_loose_object()"
> +	 * to write loose object in stream.
> +	 */
> +	if (flags & HASH_STREAM) {
> +		hdrlen = generate_object_header(hdr, hdrlen, type, len);
> +		return stream_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> +	}

So stream_loose_object() is called by passing the flag HASH_STREAM to
write_object_file_flags() and passing a struct input_stream via its
buf pointer.  That's ... unconventional.  Certainly scary.  Why not
export stream_loose_object() and call it directly?  Demo patch below.

> +
>  	/* Normally if we have it in the pack then we do not bother writing
>  	 * it out into .git/objects/??/?{38} file.
>  	 */
> diff --git a/object-store.h b/object-store.h
> index 952efb6a4b..4040e2c40a 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -34,6 +34,11 @@ struct object_directory {
>  	char *path;
>  };
>
> +struct input_stream {
> +	const void *(*read)(const struct input_stream *, unsigned long *len);
> +	void *data;
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>  	struct object_directory *, 1, fspathhash, fspatheq)
>


diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 42e1033d85..07d186bd20 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -375,10 +375,8 @@ static void write_stream_blob(unsigned nr, unsigned long size)
 	data.zstream = &zstream;
 	git_inflate_init(&zstream);

-	if (write_object_file_flags(&in_stream, size,
-				    type_name(OBJ_BLOB),
-				    &obj_list[nr].oid,
-				    HASH_STREAM))
+	if (stream_loose_object(&in_stream, size, type_name(OBJ_BLOB), 0, 0,
+				&obj_list[nr].oid))
 		die(_("failed to write object in stream"));

 	if (zstream.total_out != size || data.status != Z_STREAM_END)
diff --git a/object-file.c b/object-file.c
index 2ef1d4fb00..0a6b65ab26 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1994,9 +1994,9 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }

-static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
-			       const struct input_stream *in_stream,
-			       unsigned long len, time_t mtime, unsigned flags)
+int stream_loose_object(struct input_stream *in_stream, unsigned long len,
+			const char *type, time_t mtime, unsigned flags,
+			struct object_id *oid)
 {
 	int fd, ret, err = 0, flush = 0;
 	unsigned char compressed[4096];
@@ -2006,6 +2006,10 @@ static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen = sizeof(hdr);
+
+	hdrlen = generate_object_header(hdr, hdrlen, type, len);

 	/* When oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
@@ -2083,16 +2087,6 @@ int write_object_file_flags(const void *buf, unsigned long len,
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);

-	/* When streaming a large blob object (marked as HASH_STREAM),
-	 * we have no chance to run "write_object_file_prepare()" to
-	 * calculate the "oid" in advance.  Call "stream_loose_object()"
-	 * to write loose object in stream.
-	 */
-	if (flags & HASH_STREAM) {
-		hdrlen = generate_object_header(hdr, hdrlen, type, len);
-		return stream_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
-	}
-
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
 	 */
diff --git a/object-store.h b/object-store.h
index 4040e2c40a..786b6435b1 100644
--- a/object-store.h
+++ b/object-store.h
@@ -237,6 +237,10 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }

+int stream_loose_object(struct input_stream *in_stream, unsigned long len,
+			const char *type, time_t mtime, unsigned flags,
+			struct object_id *oid);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object()
  2021-12-17 19:28         ` René Scharfe
@ 2021-12-18  0:09           ` Junio C Hamano
  0 siblings, 0 replies; 165+ messages in thread
From: Junio C Hamano @ 2021-12-18  0:09 UTC (permalink / raw)
  To: René Scharfe
  Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee, Han Xin

René Scharfe <l.s.r@web.de> writes:

> There was no leak before.  Both strbufs are static and both functions
> they are passed to (loose_object_path() and create_tmpfile()) reset
> them first.  So while the allocated memory was not released before,
> it was reused.
>
> Not sure if making write_loose_object() allocate and release these
> buffers on every call has much of a performance impact.  The only
> reason I can think of for wanting such a change is to get rid of the
> static buffers, to allow the function to be used by concurrent
> threads.
>
> So I think either keeping the code as-is or also making the strbufs
> non-static would be better (but then discussing a possible
> performance impact in the commit message would be nice).

Makes sense.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-17 11:26       ` [PATCH v6 2/6] object-file.c: refactor object header generation into a function Han Xin
@ 2021-12-20 12:10         ` Ævar Arnfjörð Bjarmason
  2021-12-20 12:48           ` Philip Oakley
                             ` (2 more replies)
  0 siblings, 3 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-20 12:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jeff King, Philip Oakley,
	Derrick Stolee, Ævar Arnfjörð Bjarmason

Add a convenience function to wrap the xsnprintf() command that
generates loose object headers. This code was copy/pasted in various
parts of the codebase, let's define it in one place and re-use it from
there.

All except one caller of it had a valid "enum object_type" for us,
it's only write_object_file_prepare() which might need to deal with
"git hash-object --literally" and a potential garbage type. Let's have
the primary API use an "enum object_type", and define an *_extended()
function that can take an arbitrary "const char *" for the type.

See [1] for the discussion that prompted this patch, i.e. new code in
object-file.c that wanted to copy/paste the xsnprintf() invocation.

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---

On Fri, Dec 17 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> There are 3 places where "xsnprintf" is used to generate the object
> header, and I originally planned to add a fourth in the latter patch.
>
> According to Ævar Arnfjörð Bjarmason’s suggestion, although it's just
> one line, it's also code that's very central to git, so refactor them
> into a function which will help later readability.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>

I came up with this after my comment on the earlier round suggesting
to factor out that header formatting. I don't know if this more
thorough approach is worth it or if you'd like to replace your change
with this one, but just posting it here as an RFC.

 builtin/index-pack.c |  3 +--
 bulk-checkin.c       |  4 ++--
 cache.h              | 21 +++++++++++++++++++++
 http-push.c          |  2 +-
 object-file.c        | 14 +++++++++++---
 5 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c23d01de7dc..900c6539f68 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
 	int hdrlen;
 
 	if (!is_delta_type(type)) {
-		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
-				   type_name(type),(uintmax_t)size) + 1;
+		hdrlen = format_loose_header(hdr, sizeof(hdr), type, (uintmax_t)size);
 		the_hash_algo->init_fn(&c);
 		the_hash_algo->update_fn(&c, hdr, hdrlen);
 	} else
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 8785b2ac806..446dea7c516 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
-			       type_name(type), (uintmax_t)size) + 1;
+	header_len = format_loose_header((char *)obuf, sizeof(obuf),
+					 type, (uintmax_t)size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 
diff --git a/cache.h b/cache.h
index d5cafba17d4..ccece21a4a2 100644
--- a/cache.h
+++ b/cache.h
@@ -1309,6 +1309,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 						    unsigned long bufsiz,
 						    struct strbuf *hdrbuf);
 
+/**
> + * format_loose_header() is a thin wrapper around xsnprintf() that
+ * writes the initial "<type> <obj-len>" part of the loose object
+ * header. It returns the size that snprintf() returns + 1.
+ *
+ * The format_loose_header_extended() function allows for writing a
+ * type_name that's not one of the "enum object_type" types. This is
+ * used for "git hash-object --literally". Pass in a OBJ_NONE as the
+ * type, and a non-NULL "type_str" to do that.
+ *
+ * format_loose_header() is a convenience wrapper for
+ * format_loose_header_extended().
+ */
+int format_loose_header_extended(char *str, size_t size, enum object_type type,
+				 const char *type_str, size_t objsize);
+static inline int format_loose_header(char *str, size_t size,
+				      enum object_type type, size_t objsize)
+{
+	return format_loose_header_extended(str, size, type, NULL, objsize);
+}
+
 /**
  * parse_loose_header() parses the starting "<type> <len>\0" of an
  * object. If it doesn't follow that format -1 is returned. To check
diff --git a/http-push.c b/http-push.c
index 3309aaf004a..d1a8619e0af 100644
--- a/http-push.c
+++ b/http-push.c
@@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
 	git_zstream stream;
 
 	unpacked = read_object_file(&request->obj->oid, &type, &len);
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_loose_header(hdr, sizeof(hdr), type, (uintmax_t)len);
 
 	/* Set it up */
 	git_deflate_init(&stream, zlib_compression_level);
diff --git a/object-file.c b/object-file.c
index eac67f6f5f9..d94609ee48d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1009,6 +1009,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+int format_loose_header_extended(char *str, size_t size, enum object_type type,
+				 const char *typestr, size_t objsize)
+{
+	const char *s = type == OBJ_NONE ? typestr : type_name(type);
+
+	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1037,7 +1045,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = format_loose_header(hdr, sizeof(hdr), obj_type, size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1737,7 +1745,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = format_loose_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2009,7 +2017,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_loose_header(hdr, sizeof(hdr), type, len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.1119.g606023410ba


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
@ 2021-12-20 12:48           ` Philip Oakley
  2021-12-20 22:25           ` Junio C Hamano
  2021-12-21 11:43           ` Han Xin
  2 siblings, 0 replies; 165+ messages in thread
From: Philip Oakley @ 2021-12-20 12:48 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason, git
  Cc: Junio C Hamano, Han Xin, Jeff King, Derrick Stolee, Johannes Schindelin

Hi Ævar,
(catching up after a week away, and noticed your patch today..)

On 20/12/2021 12:10, Ævar Arnfjörð Bjarmason wrote:
> Add a convenience function to wrap the xsnprintf() command that
> generates loose object headers. This code was copy/pasted in various
> parts of the codebase, let's define it in one place and re-use it from
> there.
>
> All except one caller of it had a valid "enum object_type" for us,
> it's only write_object_file_prepare() which might need to deal with
> "git hash-object --literally" and a potential garbage type. Let's have
> the primary API use an "enum object_type", and define an *_extended()
> function that can take an arbitrary "const char *" for the type.

I recently completed a PR in the Git for Windows build that is focused on
"git hash-object --literally" as a starter for LLP64 large file (>4GB)
compatibility.
(https://github.com/git-for-windows/git/pull/3533), which Dscho has
merged (cc'd).

I'm not sure that the `extended` version will work as expected across
the test suite
as multiple fake object types are tried, though I only skimmed the patch.

I'd support the general thrust, but just wanted to synchronise any changes.

Philip
>
> See [1] for the discussion that prompted this patch, i.e. new code in
> object-file.c that wanted to copy/paste the xsnprintf() invocation.
>
> 1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>
> On Fri, Dec 17 2021, Han Xin wrote:
>
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>
>> There are 3 places where "xsnprintf" is used to generate the object
>> header, and I originally planned to add a fourth in the latter patch.
>>
>> According to Ævar Arnfjörð Bjarmason’s suggestion, although it's just
>> one line, it's also code that's very central to git, so refactor them
>> into a function which will help later readability.
>>
>> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> I came up with this after my comment on the earlier round suggesting
> to factor out that header formatting. I don't know if this more
> thorough approach is worth it or if you'd like to replace your change
> with this one, but just posting it here as an RFC.
>
>  builtin/index-pack.c |  3 +--
>  bulk-checkin.c       |  4 ++--
>  cache.h              | 21 +++++++++++++++++++++
>  http-push.c          |  2 +-
>  object-file.c        | 14 +++++++++++---
>  5 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index c23d01de7dc..900c6539f68 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
>  	int hdrlen;
>  
>  	if (!is_delta_type(type)) {
> -		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
> -				   type_name(type),(uintmax_t)size) + 1;
> +		hdrlen = format_loose_header(hdr, sizeof(hdr), type, (uintmax_t)size);
>  		the_hash_algo->init_fn(&c);
>  		the_hash_algo->update_fn(&c, hdr, hdrlen);
>  	} else
> diff --git a/bulk-checkin.c b/bulk-checkin.c
> index 8785b2ac806..446dea7c516 100644
> --- a/bulk-checkin.c
> +++ b/bulk-checkin.c
> @@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
>  	if (seekback == (off_t) -1)
>  		return error("cannot find the current offset");
>  
> -	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
> -			       type_name(type), (uintmax_t)size) + 1;
> +	header_len = format_loose_header((char *)obuf, sizeof(obuf),
> +					 type, (uintmax_t)size);
>  	the_hash_algo->init_fn(&ctx);
>  	the_hash_algo->update_fn(&ctx, obuf, header_len);
>  
> diff --git a/cache.h b/cache.h
> index d5cafba17d4..ccece21a4a2 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -1309,6 +1309,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
>  						    unsigned long bufsiz,
>  						    struct strbuf *hdrbuf);
>  
> +/**
> > + * format_loose_header() is a thin wrapper around xsnprintf() that
> + * writes the initial "<type> <obj-len>" part of the loose object
> + * header. It returns the size that snprintf() returns + 1.
> + *
> + * The format_loose_header_extended() function allows for writing a
> + * type_name that's not one of the "enum object_type" types. This is
> + * used for "git hash-object --literally". Pass in a OBJ_NONE as the
> + * type, and a non-NULL "type_str" to do that.
> + *
> + * format_loose_header() is a convenience wrapper for
> + * format_loose_header_extended().
> + */
> +int format_loose_header_extended(char *str, size_t size, enum object_type type,
> +				 const char *type_str, size_t objsize);
> +static inline int format_loose_header(char *str, size_t size,
> +				      enum object_type type, size_t objsize)
> +{
> +	return format_loose_header_extended(str, size, type, NULL, objsize);
> +}
> +
>  /**
>   * parse_loose_header() parses the starting "<type> <len>\0" of an
>   * object. If it doesn't follow that format -1 is returned. To check
> diff --git a/http-push.c b/http-push.c
> index 3309aaf004a..d1a8619e0af 100644
> --- a/http-push.c
> +++ b/http-push.c
> @@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
>  	git_zstream stream;
>  
>  	unpacked = read_object_file(&request->obj->oid, &type, &len);
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> +	hdrlen = format_loose_header(hdr, sizeof(hdr), type, (uintmax_t)len);
>  
>  	/* Set it up */
>  	git_deflate_init(&stream, zlib_compression_level);
> diff --git a/object-file.c b/object-file.c
> index eac67f6f5f9..d94609ee48d 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1009,6 +1009,14 @@ void *xmmap(void *start, size_t length,
>  	return ret;
>  }
>  
> +int format_loose_header_extended(char *str, size_t size, enum object_type type,
> +				 const char *typestr, size_t objsize)
> +{
> +	const char *s = type == OBJ_NONE ? typestr : type_name(type);
> +
> +	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
> +}
> +
>  /*
>   * With an in-core object data in "map", rehash it to make sure the
>   * object name actually matches "oid" to detect object corruption.
> @@ -1037,7 +1045,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
>  		return -1;
>  
>  	/* Generate the header */
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
> +	hdrlen = format_loose_header(hdr, sizeof(hdr), obj_type, size);
>  
>  	/* Sha1.. */
>  	r->hash_algo->init_fn(&c);
> @@ -1737,7 +1745,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
>  	git_hash_ctx c;
>  
>  	/* Generate the header */
> -	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
> +	*hdrlen = format_loose_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
>  
>  	/* Sha1.. */
>  	algo->init_fn(&c);
> @@ -2009,7 +2017,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	buf = read_object(the_repository, oid, &type, &len);
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> +	hdrlen = format_loose_header(hdr, sizeof(hdr), type, len);
>  	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
>  	free(buf);
>  


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
  2021-12-20 12:48           ` Philip Oakley
@ 2021-12-20 22:25           ` Junio C Hamano
  2021-12-21  1:42             ` Ævar Arnfjörð Bjarmason
  2021-12-21 11:43           ` Han Xin
  2 siblings, 1 reply; 165+ messages in thread
From: Junio C Hamano @ 2021-12-20 22:25 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Han Xin, Jeff King, Philip Oakley, Derrick Stolee

Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:

> Add a convenience function to wrap the xsnprintf() command that
> generates loose object headers. This code was copy/pasted in various
> parts of the codebase, let's define it in one place and re-use it from
> there.
> ...
> +/**
> + * format_loose_header() is a thin wrapper around xsnprintf() that

The name should have "object" somewhere in it.  Not all readers can
be expected to know that you meant "loose" to be an acceptable short
hand for "loose object".

That nit aside, I think it is a good idea to give people a common
helper function to call.  I am undecided if it is a good idea to
make it take enum or "const char *"; most everybody should be able
to say

	format_object_header(type_name(OBJ_COMMIT), ...)

just fine, so two variants might be overkill, just to allow 

	format_object_header(OBJ_COMMIT, ...)

and to forbid

	format_object_header("connit", ...)

I dunno.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-20 22:25           ` Junio C Hamano
@ 2021-12-21  1:42             ` Ævar Arnfjörð Bjarmason
  2021-12-21  2:11               ` Junio C Hamano
  0 siblings, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21  1:42 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Han Xin, Jeff King, Philip Oakley, Derrick Stolee


On Mon, Dec 20 2021, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:
>
>> Add a convenience function to wrap the xsnprintf() command that
>> generates loose object headers. This code was copy/pasted in various
>> parts of the codebase, let's define it in one place and re-use it from
>> there.
>> ...
>> +/**
>> + * format_loose_header() is a thin wrapper around s xsnprintf() that
>
> The name should have "object" somewhere in it.  Not all readers can
> be expected to know that you meant "loose" to be an acceptable short
> hand for "loose object".

*nod*

> That nit aside, I think it is a good idea to give people a common
> helper function to call.  I am undecided if it is a good idea to
> make it take enum or "const char *"; most everybody should be able
> to say
>
> 	format_object_header(type_name(OBJ_COMMIT), ...)
>
> just fine, so two variants might be overkill, just to allow 
>
> 	format_object_header(OBJ_COMMIT, ...)
>
> and to forbid
>
> 	format_object_header("connit", ...)
>
> I dunno.

Ultimately only a single API caller in hash-object.c really cares about
something else than the enum.

I've got some patches locally to convert e.g. write_object_file() to use
the enum, and it removes the need for some callers to convert enum to
char *, only to have other things convert it back.

So I think for any new APIs it makes sense to work towards sidelining
the hash-object.c --literally caller.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-21  1:42             ` Ævar Arnfjörð Bjarmason
@ 2021-12-21  2:11               ` Junio C Hamano
  2021-12-21  2:27                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 165+ messages in thread
From: Junio C Hamano @ 2021-12-21  2:11 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Han Xin, Jeff King, Philip Oakley, Derrick Stolee

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> I've got some patches locally to convert e.g. write_object_file() to use
> the enum, and it removes the need for some callers to convert enum to
> char *, only to have other things convert it back.
>
> So I think for any new APIs it makes sense to work towards sidelining
> the hash-object.c --literally caller.

Your logic is backwards to argue "because I did something this way,
it makes sense to do it this way"?

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-21  2:11               ` Junio C Hamano
@ 2021-12-21  2:27                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21  2:27 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Han Xin, Jeff King, Philip Oakley, Derrick Stolee


On Mon, Dec 20 2021, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>> I've got some patches locally to convert e.g. write_object_file() to use
>> the enum, and it removes the need for some callers to convert enum to
>> char *, only to have other things convert it back.
>>
>> So I think for any new APIs it makes sense to work towards sidelining
>> the hash-object.c --literally caller.
>
> Your logic is backwards to argue "because I did something this way,
> it makes sense to do it this way"?

No, it's that if you look at the write_object_file() and
hash_object_file() callers in-tree now many, including in object-file.c
itself are taking an "enum object_type" only to convert it to a string,
and then we'll in turn sometimes convert that to the "enum object_type"
again at some lower level.

That API inconsistency dates back to at least Linus's a733cb606fe
(Change pack file format. Hopefully for the last time., 2005-06-28).

I'm just pointing out that I have local patches that prove that a lot of
back & forth is done for no good reason, and that this is one of the
codepaths that's tangentally involved. So it makes sense in this case to
make any new API take "enum object_type" as the primary interface.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
  2021-12-20 12:48           ` Philip Oakley
  2021-12-20 22:25           ` Junio C Hamano
@ 2021-12-21 11:43           ` Han Xin
  2 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-21 11:43 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Git List, Junio C Hamano, Jeff King, Philip Oakley, Derrick Stolee

On Mon, Dec 20, 2021 at 8:10 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
> I came up with this after my comment on the earlier round suggesting
> to factor out that header formatting. I don't know if this more
> thorough approach is worth it or if you'd like to replace your change
> with this one, but just posting it here as an RFC.
>

I will take this patch and rename the function name from
"format_loose_header()" to "format_object_header()".

Thanks
-Han Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v7 0/5] unpack large blobs in stream
  2021-12-17 11:26       ` Han Xin
@ 2021-12-21 11:51         ` Han Xin
  2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
                           ` (11 subsequent siblings)
  12 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2021-12-21 11:51 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v6:
* Remove "object-file.c: release strbuf in write_loose_object()" which is not
  needed anymore. Thanks to René Scharfe[1] for reporting this.

* Reorder the patch series and put "unpack-objects.c: add dry_run mode for get_data()"
  and its testcases to the front.

* Replace "refactor object header generation into a function" with
  "object-file API: add a format_object_header() function" suggested by
  Ævar Arnfjörð Bjarmason[2].

* Export "write_stream_object_file()" instead of "reusing write_object_file_flags()"
  suggested by René Scharfe[3]. The new flag "HASH_STREAM" has been removed.

* Fix the directory creation error and the "strbuf dir" leak in
  "write_stream_object_file()".

* Change "unsigned long size" to "size_t size" in "write_stream_blob()" and
  "get_data()" in "unpack-objects.c".

1. https://lore.kernel.org/git/c860c56f-ce25-4391-7f65-50c9d5d80c2c@web.de/
2. https://lore.kernel.org/git/RFC-patch-1.1-bda62567f6b-20211220T120740Z-avarab@gmail.com/
3. https://lore.kernel.org/git/e959e4f1-7500-5f6b-5bd2-2f060287eeff@web.de/

Han Xin (4):
  unpack-objects.c: add dry_run mode for get_data()
  object-file.c: refactor write_loose_object() to reuse in stream
    version
  object-file.c: add "write_stream_object_file()" to support read in
    stream
  unpack-objects: unpack_non_delta_entry() read data in a stream

Ævar Arnfjörð Bjarmason (1):
  object-file API: add a format_object_header() function

 Documentation/config/core.txt       |  11 ++
 builtin/index-pack.c                |   3 +-
 builtin/unpack-objects.c            |  94 ++++++++++++-
 bulk-checkin.c                      |   4 +-
 cache.h                             |  22 +++
 config.c                            |   5 +
 environment.c                       |   1 +
 http-push.c                         |   2 +-
 object-file.c                       | 199 ++++++++++++++++++++++------
 object-store.h                      |   9 ++
 t/t5590-unpack-non-delta-objects.sh |  91 +++++++++++++
 11 files changed, 392 insertions(+), 49 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v6:
1:  59d35dac5f < -:  ---------- object-file.c: release strbuf in write_loose_object()
2:  2174a6cbad < -:  ---------- object-file.c: refactor object header generation into a function
5:  1acbb6e849 ! 1:  a8f232f553 unpack-objects.c: add dry_run mode for get_data()
    @@ builtin/unpack-objects.c: static void use(int bytes)
      }
      
     -static void *get_data(unsigned long size)
    -+static void *get_data(unsigned long size, int dry_run)
    ++static void *get_data(size_t size, int dry_run)
      {
      	git_zstream stream;
     -	void *buf = xmallocz(size);
    -+	unsigned long bufsize;
    ++	size_t bufsize;
     +	void *buf;
      
      	memset(&stream, 0, sizeof(stream));
    @@ builtin/unpack-objects.c: static void unpack_delta_entry(enum object_type type,
      		if (dry_run || !delta_data) {
      			free(delta_data);
      			return;
    +
    + ## t/t5590-unpack-non-delta-objects.sh (new) ##
    +@@
    ++#!/bin/sh
    ++#
    ++# Copyright (c) 2021 Han Xin
    ++#
    ++
    ++test_description='Test unpack-objects with non-delta objects'
    ++
    ++GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
    ++export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
    ++
    ++. ./test-lib.sh
    ++
    ++prepare_dest () {
    ++	test_when_finished "rm -rf dest.git" &&
    ++	git init --bare dest.git
    ++}
    ++
    ++test_expect_success "setup repo with big blobs (1.5 MB)" '
    ++	test-tool genrandom foo 1500000 >big-blob &&
    ++	test_commit --append foo big-blob &&
    ++	test-tool genrandom bar 1500000 >big-blob &&
    ++	test_commit --append bar big-blob &&
    ++	(
    ++		cd .git &&
    ++		find objects/?? -type f | sort
    ++	) >expect &&
    ++	PACK=$(echo main | git pack-objects --revs test)
    ++'
    ++
    ++test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
    ++	GIT_ALLOC_LIMIT=1m &&
    ++	export GIT_ALLOC_LIMIT
    ++'
    ++
    ++test_expect_success 'fail to unpack-objects: cannot allocate' '
    ++	prepare_dest &&
    ++	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    ++	grep "fatal: attempting to allocate" err &&
    ++	(
    ++		cd dest.git &&
    ++		find objects/?? -type f | sort
    ++	) >actual &&
    ++	test_file_not_empty actual &&
    ++	! test_cmp expect actual
    ++'
    ++
    ++test_expect_success 'unpack-objects dry-run' '
    ++	prepare_dest &&
    ++	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    ++	(
    ++		cd dest.git &&
    ++		find objects/ -type f
    ++	) >actual &&
    ++	test_must_be_empty actual
    ++'
    ++
    ++test_done
-:  ---------- > 2:  0d2e0f3a00 object-file API: add a format_object_header() function
3:  8a704ecc59 ! 3:  a571b8f16c object-file.c: refactor write_loose_object() to reuse in stream version
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	loose_object_path(the_repository, &filename, oid);
      
     -	fd = create_tmpfile(&tmp_file, filename.buf);
    -+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
    - 	if (fd < 0) {
    +-	if (fd < 0) {
     -		if (flags & HASH_SILENT)
    --			ret = -1;
    +-			return -1;
     -		else if (errno == EACCES)
    --			ret = error(_("insufficient permission for adding an "
    --				      "object to repository database %s"),
    --				    get_object_directory());
    +-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
     -		else
    --			ret = error_errno(_("unable to create temporary file"));
    -+		ret = -1;
    - 		goto cleanup;
    - 	}
    - 
    +-			return error_errno(_("unable to create temporary file"));
    +-	}
    +-
     -	/* Set it up */
     -	git_deflate_init(&stream, zlib_compression_level);
     -	stream.next_out = compressed;
     -	stream.avail_out = sizeof(compressed);
     -	the_hash_algo->init_fn(&c);
    --
    ++	fd = create_tmpfile(&tmp_file, filename.buf, flags);
    ++	if (fd < 0)
    ++		return -1;
    + 
     -	/* First header.. */
     -	stream.next_in = (unsigned char *)hdr;
     -	stream.avail_in = hdrlen;
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -			warning_errno(_("failed utime() on %s"), tmp_file.buf);
     -	}
     -
    --	ret = finalize_object_file(tmp_file.buf, filename.buf);
    -+	ret = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
    - cleanup:
    - 	strbuf_release(&filename);
    - 	strbuf_release(&tmp_file);
    +-	return finalize_object_file(tmp_file.buf, filename.buf);
    ++	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
    ++					       mtime, flags);
    + }
    + 
    + static int freshen_loose_object(const struct object_id *oid)
4:  96f05632a2 ! 4:  1de06a8f5c object-file.c: make "write_object_file_flags()" to support read in stream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: make "write_object_file_flags()" to support read in stream
    +    object-file.c: add "write_stream_object_file()" to support read in stream
     
         We used to call "get_data()" in "unpack_non_delta_entry()" to read the
         entire contents of a blob object, no matter how big it is. This
         implementation may consume all the memory and cause OOM.
     
    -    This can be improved by feeding data to "stream_loose_object()" in a
    -    stream. The input stream is implemented as an interface.
    -
    -    When streaming a large blob object to "write_loose_object()", we have no
    -    chance to run "write_object_file_prepare()" to calculate the oid in
    -    advance. So we need to handle undetermined oid in a new function called
    -    "stream_loose_object()".
    +    This can be improved by feeding data to "write_stream_object_file()"
    +    in a stream. The input stream is implemented as an interface.
     
    +    The difference with "write_loose_object()" is that we have no chance
    +    to run "write_object_file_prepare()" to calculate the oid in advance.
         In "write_loose_object()", we know the oid and we can write the
         temporary file in the same directory as the final object, but for an
         object with an undetermined oid, we don't know the exact directory for
         the object, so we have to save the temporary file in ".git/objects/"
         directory instead.
     
    -    We will reuse "write_object_file_flags()" in "unpack_non_delta_entry()" to
    -    read the entire data contents in stream, so a new flag "HASH_STREAM" is
    -    added. When read in stream, we needn't prepare the "oid" before
    -    "write_loose_object()", only generate the header.
         "freshen_packed_object()" or "freshen_loose_object()" will be called
    -    inside "stream_loose_object()" after obtaining the "oid".
    +    inside "write_stream_object_file()" after obtaining the "oid".
     
    +    Helped-by: René Scharfe <l.s.r@web.de>
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    - ## cache.h ##
    -@@ cache.h: int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
    - #define HASH_FORMAT_CHECK 2
    - #define HASH_RENORMALIZE  4
    - #define HASH_SILENT 8
    -+#define HASH_STREAM 16
    - int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
    - int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
    - 
    -
      ## object-file.c ##
     @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
      	return 1;
      }
      
    -+static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
    -+			       const struct input_stream *in_stream,
    -+			       unsigned long len, time_t mtime, unsigned flags)
    ++int write_stream_object_file(struct input_stream *in_stream, size_t len,
    ++			     enum object_type type, time_t mtime,
    ++			     unsigned flags, struct object_id *oid)
     +{
    -+	int fd, ret, err = 0, flush = 0;
    ++	int fd, ret, flush = 0;
     +	unsigned char compressed[4096];
     +	git_zstream stream;
     +	git_hash_ctx c;
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	static struct strbuf tmp_file = STRBUF_INIT;
     +	static struct strbuf filename = STRBUF_INIT;
     +	int dirlen;
    ++	char hdr[MAX_HEADER_LEN];
    ++	int hdrlen = sizeof(hdr);
     +
    ++	/* Since "filename" is defined as static, it will be reused. So reset it
    ++	 * first before using it. */
    ++	strbuf_reset(&filename);
     +	/* When oid is not determined, save tmp file to odb path. */
     +	strbuf_addf(&filename, "%s/", get_object_directory());
     +
     +	fd = create_tmpfile(&tmp_file, filename.buf, flags);
    -+	if (fd < 0) {
    -+		err = -1;
    -+		goto cleanup;
    -+	}
    ++	if (fd < 0)
    ++		return -1;
    ++
    ++	hdrlen = format_object_header(hdr, hdrlen, type, len);
     +
     +	/* Set it up and write header */
     +	setup_stream_and_header(&stream, compressed, sizeof(compressed),
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +
     +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
     +		unlink_or_warn(tmp_file.buf);
    -+		goto cleanup;
    ++		return 0;
     +	}
     +
     +	loose_object_path(the_repository, &filename, oid);
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +		struct strbuf dir = STRBUF_INIT;
     +		strbuf_add(&dir, filename.buf, dirlen - 1);
     +
    -+		if (mkdir_in_gitdir(dir.buf) < 0) {
    -+			err = -1;
    -+			goto cleanup;
    ++		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
    ++			ret = error_errno(_("unable to create directory %s"), dir.buf);
    ++			strbuf_release(&dir);
    ++			return ret;
     +		}
    ++		strbuf_release(&dir);
     +	}
     +
    -+	err = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
    -+cleanup:
    -+	strbuf_release(&tmp_file);
    -+	strbuf_release(&filename);
    -+	return err;
    ++	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
     +}
     +
      int write_object_file_flags(const void *buf, unsigned long len,
      			    const char *type, struct object_id *oid,
      			    unsigned flags)
    -@@ object-file.c: int write_object_file_flags(const void *buf, unsigned long len,
    - 	char hdr[MAX_HEADER_LEN];
    - 	int hdrlen = sizeof(hdr);
    - 
    -+	/* When streaming a large blob object (marked as HASH_STREAM),
    -+	 * we have no chance to run "write_object_file_prepare()" to
    -+	 * calculate the "oid" in advance.  Call "stream_loose_object()"
    -+	 * to write loose object in stream.
    -+	 */
    -+	if (flags & HASH_STREAM) {
    -+		hdrlen = generate_object_header(hdr, hdrlen, type, len);
    -+		return stream_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
    -+	}
    -+
    - 	/* Normally if we have it in the pack then we do not bother writing
    - 	 * it out into .git/objects/??/?{38} file.
    - 	 */
     
      ## object-store.h ##
     @@ object-store.h: struct object_directory {
    @@ object-store.h: struct object_directory {
      };
      
     +struct input_stream {
    -+	const void *(*read)(const struct input_stream *, unsigned long *len);
    ++	const void *(*read)(struct input_stream *, unsigned long *len);
     +	void *data;
     +};
     +
      KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
      	struct object_directory *, 1, fspathhash, fspatheq)
      
    +@@ object-store.h: static inline int write_object_file(const void *buf, unsigned long len,
    + 	return write_object_file_flags(buf, len, type, oid, 0);
    + }
    + 
    ++int write_stream_object_file(struct input_stream *in_stream, size_t len,
    ++			     enum object_type type, time_t mtime,
    ++			     unsigned flags, struct object_id *oid);
    ++
    + int hash_object_file_literally(const void *buf, unsigned long len,
    + 			       const char *type, struct object_id *oid,
    + 			       unsigned flags);
6:  476aaba527 ! 5:  e7b4e426ef unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	int status;
     +};
     +
    -+static const void *feed_input_zstream(const struct input_stream *in_stream,
    ++static const void *feed_input_zstream(struct input_stream *in_stream,
     +				      unsigned long *readlen)
     +{
     +	struct input_zstream_data *data = in_stream->data;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	return data->buf;
     +}
     +
    -+static void write_stream_blob(unsigned nr, unsigned long size)
    ++static void write_stream_blob(unsigned nr, size_t size)
     +{
     +	git_zstream zstream;
     +	struct input_zstream_data data;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	data.zstream = &zstream;
     +	git_inflate_init(&zstream);
     +
    -+	if (write_object_file_flags(&in_stream, size,
    -+				    type_name(OBJ_BLOB),
    -+				    &obj_list[nr].oid,
    -+				    HASH_STREAM))
    ++	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
    ++				     &obj_list[nr].oid))
     +		die(_("failed to write object in stream"));
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	git_inflate_end(&zstream);
     +
     +	if (strict) {
    -+		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
    ++		struct blob *blob =
    ++			lookup_blob(the_repository, &obj_list[nr].oid);
     +		if (blob)
     +			blob->object.flags |= FLAG_WRITTEN;
     +		else
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	void *buf;
     +
     +	/* Write large blob in stream without allocating full buffer. */
    -+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
    ++	if (!dry_run && type == OBJ_BLOB &&
    ++	    size > big_file_streaming_threshold) {
     +		write_stream_blob(nr, size);
     +		return;
     +	}
    @@ environment.c: size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
      const char *editor_program;
      const char *askpass_program;
     
    - ## t/t5590-unpack-non-delta-objects.sh (new) ##
    -@@
    -+#!/bin/sh
    -+#
    -+# Copyright (c) 2021 Han Xin
    -+#
    -+
    -+test_description='Test unpack-objects when receive pack'
    -+
    -+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
    -+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
    -+
    -+. ./test-lib.sh
    -+
    -+prepare_dest () {
    -+	test_when_finished "rm -rf dest.git" &&
    -+	git init --bare dest.git &&
    -+	git -C dest.git config core.bigFileStreamingThreshold $1 &&
    -+	git -C dest.git config core.bigFileThreshold $1
    -+}
    -+
    -+test_expect_success "setup repo with big blobs (1.5 MB)" '
    -+	test-tool genrandom foo 1500000 >big-blob &&
    -+	test_commit --append foo big-blob &&
    -+	test-tool genrandom bar 1500000 >big-blob &&
    -+	test_commit --append bar big-blob &&
    -+	(
    -+		cd .git &&
    -+		find objects/?? -type f | sort
    -+	) >expect &&
    -+	PACK=$(echo main | git pack-objects --revs test)
    -+'
    -+
    -+test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
    -+	GIT_ALLOC_LIMIT=1m &&
    -+	export GIT_ALLOC_LIMIT
    -+'
    -+
    -+test_expect_success 'fail to unpack-objects: cannot allocate' '
    + ## t/t5590-unpack-non-delta-objects.sh ##
    +@@ t/t5590-unpack-non-delta-objects.sh: export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
    + prepare_dest () {
    + 	test_when_finished "rm -rf dest.git" &&
    + 	git init --bare dest.git
    ++	if test -n "$1"
    ++	then
    ++		git -C dest.git config core.bigFileStreamingThreshold $1
    ++		git -C dest.git config core.bigFileThreshold $1
    ++	fi
    + }
    + 
    + test_expect_success "setup repo with big blobs (1.5 MB)" '
    +@@ t/t5590-unpack-non-delta-objects.sh: test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
    + '
    + 
    + test_expect_success 'fail to unpack-objects: cannot allocate' '
    +-	prepare_dest &&
     +	prepare_dest 2m &&
    -+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    -+	grep "fatal: attempting to allocate" err &&
    -+	(
    -+		cd dest.git &&
    -+		find objects/?? -type f | sort
    -+	) >actual &&
    -+	test_file_not_empty actual &&
    -+	! test_cmp expect actual
    -+'
    -+
    + 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    + 	grep "fatal: attempting to allocate" err &&
    + 	(
    +@@ t/t5590-unpack-non-delta-objects.sh: test_expect_success 'fail to unpack-objects: cannot allocate' '
    + 	! test_cmp expect actual
    + '
    + 
     +test_expect_success 'unpack big object in stream' '
     +	prepare_dest 1m &&
    ++	mkdir -p dest.git/objects/05 &&
     +	git -C dest.git unpack-objects <test-$PACK.pack &&
     +	git -C dest.git fsck &&
     +	(
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +	test_must_be_empty actual
     +'
     +
    -+test_expect_success 'unpack-objects dry-run' '
    -+	prepare_dest 1m &&
    -+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    -+	(
    -+		cd dest.git &&
    -+		find objects/ -type f
    -+	) >actual &&
    -+	test_must_be_empty actual
    -+'
    -+
    -+test_done
    + test_expect_success 'unpack-objects dry-run' '
    + 	prepare_dest &&
    + 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-17 11:26       ` Han Xin
  2021-12-21 11:51         ` [PATCH v7 0/5] " Han Xin
@ 2021-12-21 11:51         ` Han Xin
  2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
  2021-12-31  3:06           ` Jiang Xin
  2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
                           ` (10 subsequent siblings)
  12 siblings, 2 replies; 165+ messages in thread
From: Han Xin @ 2021-12-21 11:51 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used to verify the inflation of data,
and the returned buffer will not be used at all and will be freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, only allocate a
low memory footprint when calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c            | 23 +++++++++---
 t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 6 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..9104eb48da 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,21 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(size_t size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	size_t bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +130,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +334,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +368,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +407,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..48c4fb1ba3
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects with non-delta objects'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "setup repo with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --revs test)
+'
+
+test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_file_not_empty actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v7 2/5] object-file API: add a format_object_header() function
  2021-12-17 11:26       ` Han Xin
  2021-12-21 11:51         ` [PATCH v7 0/5] " Han Xin
  2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-21 11:51         ` Han Xin
  2021-12-21 14:30           ` René Scharfe
  2021-12-31  3:12           ` [PATCH v7 2/5] object-file API: add a format_object_header() function Jiang Xin
  2021-12-21 11:51         ` [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
                           ` (9 subsequent siblings)
  12 siblings, 2 replies; 165+ messages in thread
From: Han Xin @ 2021-12-21 11:51 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Add a convenience function to wrap the xsnprintf() command that
generates loose object headers. This code was copy/pasted in various
parts of the codebase, let's define it in one place and re-use it from
there.

All except one caller of it had a valid "enum object_type" for us,
it's only write_object_file_prepare() which might need to deal with
"git hash-object --literally" and a potential garbage type. Let's have
the primary API use an "enum object_type", and define an *_extended()
function that can take an arbitrary "const char *" for the type.

See [1] for the discussion that prompted this patch, i.e. new code in
object-file.c that wanted to copy/paste the xsnprintf() invocation.

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/index-pack.c |  3 +--
 bulk-checkin.c       |  4 ++--
 cache.h              | 21 +++++++++++++++++++++
 http-push.c          |  2 +-
 object-file.c        | 14 +++++++++++---
 5 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c23d01de7d..4a765ddae6 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
 	int hdrlen;
 
 	if (!is_delta_type(type)) {
-		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
-				   type_name(type),(uintmax_t)size) + 1;
+		hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)size);
 		the_hash_algo->init_fn(&c);
 		the_hash_algo->update_fn(&c, hdr, hdrlen);
 	} else
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 8785b2ac80..1733a1de4f 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
-			       type_name(type), (uintmax_t)size) + 1;
+	header_len = format_object_header((char *)obuf, sizeof(obuf),
+					 type, (uintmax_t)size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 
diff --git a/cache.h b/cache.h
index cfba463aa9..64071a8d80 100644
--- a/cache.h
+++ b/cache.h
@@ -1310,6 +1310,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 						    unsigned long bufsiz,
 						    struct strbuf *hdrbuf);
 
+/**
+ * format_object_header() is a thin wrapper around xsnprintf() that
+ * writes the initial "<type> <obj-len>" part of the loose object
+ * header. It returns the size that snprintf() returns + 1.
+ *
+ * The format_object_header_extended() function allows for writing a
+ * type_name that's not one of the "enum object_type" types. This is
+ * used for "git hash-object --literally". Pass in OBJ_NONE as the
+ * type, and a non-NULL "type_str" to do that.
+ *
+ * format_object_header() is a convenience wrapper for
+ * format_object_header_extended().
+ */
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *type_str, size_t objsize);
+static inline int format_object_header(char *str, size_t size,
+				      enum object_type type, size_t objsize)
+{
+	return format_object_header_extended(str, size, type, NULL, objsize);
+}
+
 /**
  * parse_loose_header() parses the starting "<type> <len>\0" of an
  * object. If it doesn't follow that format -1 is returned. To check
diff --git a/http-push.c b/http-push.c
index 3309aaf004..f55e316ff4 100644
--- a/http-push.c
+++ b/http-push.c
@@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
 	git_zstream stream;
 
 	unpacked = read_object_file(&request->obj->oid, &type, &len);
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)len);
 
 	/* Set it up */
 	git_deflate_init(&stream, zlib_compression_level);
diff --git a/object-file.c b/object-file.c
index eb1426f98c..6bba4766f9 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *typestr, size_t objsize)
+{
+	const char *s = type == OBJ_NONE ? typestr : type_name(type);
+
+	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = format_object_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2006,7 +2014,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version
  2021-12-17 11:26       ` Han Xin
                           ` (2 preceding siblings ...)
  2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
@ 2021-12-21 11:51         ` Han Xin
  2021-12-21 14:16           ` Ævar Arnfjörð Bjarmason
  2021-12-21 11:52         ` [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream Han Xin
                           ` (8 subsequent siblings)
  12 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-21 11:51 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "stream_loose_object()" in
stream instead of read into the whole buf.

As this new method "stream_loose_object()" has many similarities with
"write_loose_object()", we split up "write_loose_object()" into some
steps:
 1. Figuring out a path for the (temp) object file.
 2. Creating the tempfile.
 3. Setting up zlib and write header.
 4. Write object data and handle errors.
 5. Optionally, do something after write, maybe force a loose object if
"mtime".

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 100 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 35 deletions(-)

diff --git a/object-file.c b/object-file.c
index 6bba4766f9..e048f3d39e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1751,6 +1751,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	algo->final_oid_fn(oid, &c);
 }
 
+/*
+ * Move the just written object with proper mtime into its final resting place.
+ */
+static int finalize_object_file_with_mtime(const char *tmpfile,
+					   const char *filename,
+					   time_t mtime,
+					   unsigned flags)
+{
+	struct utimbuf utb;
+
+	if (mtime) {
+		utb.actime = mtime;
+		utb.modtime = mtime;
+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
+			warning_errno(_("failed utime() on %s"), tmpfile);
+	}
+	return finalize_object_file(tmpfile, filename);
+}
+
 /*
  * Move the just written object into its final resting place.
  */
@@ -1836,7 +1855,8 @@ static inline int directory_size(const char *filename)
  * We want to avoid cross-directory filename renames, because those
  * can have problems on various filesystems (FAT, NFS, Coda).
  */
-static int create_tmpfile(struct strbuf *tmp, const char *filename)
+static int create_tmpfile(struct strbuf *tmp, const char *filename,
+			  unsigned flags)
 {
 	int fd, dirlen = directory_size(filename);
 
@@ -1844,7 +1864,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	strbuf_add(tmp, filename, dirlen);
 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
-	if (fd < 0 && dirlen && errno == ENOENT) {
+	do {
+		if (fd >= 0 || !dirlen || errno != ENOENT)
+			break;
 		/*
 		 * Make sure the directory exists; note that the contents
 		 * of the buffer are undefined after mkstemp returns an
@@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 		strbuf_reset(tmp);
 		strbuf_add(tmp, filename, dirlen - 1);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
-			return -1;
+			break;
 		if (adjust_shared_perm(tmp->buf))
-			return -1;
+			break;
 
 		/* Try again */
 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 		fd = git_mkstemp_mode(tmp->buf, 0444);
+	} while (0);
+
+	if (fd < 0 && !(flags & HASH_SILENT)) {
+		if (errno == EACCES)
+			return error(_("insufficient permission for adding an "
+				       "object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(_("unable to create temporary file"));
 	}
+
 	return fd;
 }
 
+static void setup_stream_and_header(git_zstream *stream,
+				    unsigned char *compressed,
+				    unsigned long compressed_size,
+				    git_hash_ctx *c,
+				    char *hdr,
+				    int hdrlen)
+{
+	/* Set it up */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = compressed;
+	stream->avail_out = compressed_size;
+	the_hash_algo->init_fn(c);
+
+	/* First header.. */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1879,28 +1932,13 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	if (fd < 0)
+		return -1;
 
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1929,16 +1967,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	close_loose_object(fd);
 
-	if (mtime) {
-		struct utimbuf utb;
-		utb.actime = mtime;
-		utb.modtime = mtime;
-		if (utime(tmp_file.buf, &utb) < 0 &&
-		    !(flags & HASH_SILENT))
-			warning_errno(_("failed utime() on %s"), tmp_file.buf);
-	}
-
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
+					       mtime, flags);
 }
 
 static int freshen_loose_object(const struct object_id *oid)
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream
  2021-12-17 11:26       ` Han Xin
                           ` (3 preceding siblings ...)
  2021-12-21 11:51         ` [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
@ 2021-12-21 11:52         ` Han Xin
  2021-12-21 14:20           ` Ævar Arnfjörð Bjarmason
  2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
                           ` (7 subsequent siblings)
  12 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2021-12-21 11:52 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_stream_object_file()"
in a stream. The input stream is implemented as an interface.

The difference with "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in ".git/objects/"
directory instead.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "write_stream_object_file()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |  9 ++++++
 2 files changed, 94 insertions(+)

diff --git a/object-file.c b/object-file.c
index e048f3d39e..d0573e2a61 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1989,6 +1989,91 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int write_stream_object_file(struct input_stream *in_stream, size_t len,
+			     enum object_type type, time_t mtime,
+			     unsigned flags, struct object_id *oid)
+{
+	int fd, ret, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct object_id parano_oid;
+	static struct strbuf tmp_file = STRBUF_INIT;
+	static struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen = sizeof(hdr);
+
+	/* Since "filename" is defined as static, it will be reused. So reset it
+	 * first before using it. */
+	strbuf_reset(&filename);
+	/* When oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	if (fd < 0)
+		return -1;
+
+	hdrlen = format_object_header(hdr, hdrlen, type, len);
+
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (len + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (ret != Z_STREAM_END)
+		die(_("unable to deflate new object streamingly (%d)"), ret);
+	ret = git_deflate_end_gently(&stream);
+	if (ret != Z_OK)
+		die(_("deflateEnd on object streamingly failed (%d)"), ret);
+	the_hash_algo->final_oid_fn(&parano_oid, &c);
+
+	close_loose_object(fd);
+
+	oidcpy(oid, &parano_oid);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		return 0;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen - 1);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			ret = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			return ret;
+		}
+		strbuf_release(&dir);
+	}
+
+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..061b0cb2ba 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -232,6 +237,10 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }
 
+int write_stream_object_file(struct input_stream *in_stream, size_t len,
+			     enum object_type type, time_t mtime,
+			     unsigned flags, struct object_id *oid);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-17 11:26       ` Han Xin
                           ` (4 preceding siblings ...)
  2021-12-21 11:52         ` [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream Han Xin
@ 2021-12-21 11:52         ` Han Xin
  2021-12-21 15:06           ` Ævar Arnfjörð Bjarmason
  2021-12-31  3:19           ` Jiang Xin
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
                           ` (6 subsequent siblings)
  12 siblings, 2 replies; 165+ messages in thread
From: Han Xin @ 2021-12-21 11:52 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an
entire buffer will have a 10% performance penalty. Therefore, only unpack
objects larger than the "core.BigFileStreamingThreshold" in zstream. See
the following benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git'

    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 Documentation/config/core.txt       | 11 +++++
 builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
 cache.h                             |  1 +
 config.c                            |  5 ++
 environment.c                       |  1 +
 t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
 6 files changed, 125 insertions(+), 2 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a..601b7a2418 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will then be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold` setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 9104eb48da..72d8616e00 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -331,11 +331,82 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
+				     &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB &&
+	    size > big_file_streaming_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/cache.h b/cache.h
index 64071a8d80..8c9123cb5d 100644
--- a/cache.h
+++ b/cache.h
@@ -974,6 +974,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a70..7b122a142a 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 0d06a31024..04bba593de 100644
--- a/environment.c
+++ b/environment.c
@@ -47,6 +47,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
index 48c4fb1ba3..8436cbf8db 100755
--- a/t/t5590-unpack-non-delta-objects.sh
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -13,6 +13,11 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
 	git init --bare dest.git
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileStreamingThreshold $1
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 test_expect_success "setup repo with big blobs (1.5 MB)" '
@@ -33,7 +38,7 @@ test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
 '
 
 test_expect_success 'fail to unpack-objects: cannot allocate' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err &&
 	(
@@ -44,6 +49,35 @@ test_expect_success 'fail to unpack-objects: cannot allocate' '
 	! test_cmp expect actual
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	mkdir -p dest.git/objects/05 &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream with existing oids' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual
+'
+
 test_expect_success 'unpack-objects dry-run' '
 	prepare_dest &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
  2021-12-21 14:43             ` René Scharfe
  2021-12-22 11:29             ` Jiang Xin
  2021-12-31  3:06           ` Jiang Xin
  1 sibling, 2 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 14:09 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> In dry_run mode, "get_data()" is used to verify the inflation of data,
> and the returned buffer will not be used at all and will be freed
> immediately. Even in dry_run mode, it is dangerous to allocate a
> full-size buffer for a large blob object. Therefore, only allocate a
> low memory footprint when calling "get_data()" in dry_run mode.
>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c            | 23 +++++++++---
>  t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+), 6 deletions(-)
>  create mode 100755 t/t5590-unpack-non-delta-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..9104eb48da 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,21 @@ static void use(int bytes)
>  	display_throughput(progress, consumed_bytes);
>  }
>  
> -static void *get_data(unsigned long size)
> +static void *get_data(size_t size, int dry_run)
>  {
>  	git_zstream stream;
> -	void *buf = xmallocz(size);
> +	size_t bufsize;
> +	void *buf;
>  
>  	memset(&stream, 0, sizeof(stream));
> +	if (dry_run && size > 8192)
> +		bufsize = 8192;
> +	else
> +		bufsize = size;
> +	buf = xmallocz(bufsize);

Maybe I'm misunderstanding this, but the commit message says it would be
dangerous to allocate a very larger buffer, but here we only limit the
size under "dry_run".

Removing that "&& size > 8192" makes all the tests pass still, so there
seems to be some missing coverage here in any case.

> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> new file mode 100755
> index 0000000000..48c4fb1ba3
> --- /dev/null
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -0,0 +1,57 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects with non-delta objects'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +prepare_dest () {
> +	test_when_finished "rm -rf dest.git" &&
> +	git init --bare dest.git
> +}
> +
> +test_expect_success "setup repo with big blobs (1.5 MB)" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	(
> +		cd .git &&
> +		find objects/?? -type f | sort
> +	) >expect &&
> +	PACK=$(echo main | git pack-objects --revs test)
> +'
> +
> +test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'fail to unpack-objects: cannot allocate' '
> +	prepare_dest &&
> +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> +	grep "fatal: attempting to allocate" err &&
> +	(
> +		cd dest.git &&
> +		find objects/?? -type f | sort
> +	) >actual &&
> +	test_file_not_empty actual &&
> +	! test_cmp expect actual
> +'
> +
> +test_expect_success 'unpack-objects dry-run' '
> +	prepare_dest &&
> +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
> +	(
> +		cd dest.git &&
> +		find objects/ -type f
> +	) >actual &&
> +	test_must_be_empty actual
> +'
> +
> +test_done

I commented on this "find" usage in an earlier round, I think there's a
much easier way to do this. You're really just going back and forth
between checking whether or not all the objects are loose.

I think that the below fix-up on top of this series is a better way to
do that, and more accurate. I.e. in your test here you check "!
test_cmp", which means that we could have some packed and some loose,
but really what you're meaning to check is a flip-flop between "all
loose?" and "no loose?".

In addition to that there was no reason to hardcode "main", we can just
use HEAD. All in all I think the below fix-up makes sense:

diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
index 8436cbf8db6..d78bb89225d 100755
--- a/t/t5590-unpack-non-delta-objects.sh
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -5,9 +5,6 @@
 
 test_description='Test unpack-objects with non-delta objects'
 
-GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
-export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
-
 . ./test-lib.sh
 
 prepare_dest () {
@@ -20,16 +17,22 @@ prepare_dest () {
 	fi
 }
 
+assert_no_loose () {
+	glob=dest.git/objects/?? &&
+	echo "$glob" >expect &&
+	echo $glob >actual &&
+	test_cmp expect actual
+}
+
 test_expect_success "setup repo with big blobs (1.5 MB)" '
 	test-tool genrandom foo 1500000 >big-blob &&
 	test_commit --append foo big-blob &&
 	test-tool genrandom bar 1500000 >big-blob &&
 	test_commit --append bar big-blob &&
-	(
-		cd .git &&
-		find objects/?? -type f | sort
-	) >expect &&
-	PACK=$(echo main | git pack-objects --revs test)
+
+	# Everything is loose
+	rmdir .git/objects/pack &&
+	PACK=$(echo HEAD | git pack-objects --revs test)
 '
 
 test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
@@ -41,51 +44,27 @@ test_expect_success 'fail to unpack-objects: cannot allocate' '
 	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err &&
-	(
-		cd dest.git &&
-		find objects/?? -type f | sort
-	) >actual &&
-	test_file_not_empty actual &&
-	! test_cmp expect actual
+	rmdir dest.git/objects/pack
 '
 
 test_expect_success 'unpack big object in stream' '
 	prepare_dest 1m &&
 	mkdir -p dest.git/objects/05 &&
 	git -C dest.git unpack-objects <test-$PACK.pack &&
-	git -C dest.git fsck &&
-	(
-		cd dest.git &&
-		find objects/?? -type f | sort
-	) >actual &&
-	test_cmp expect actual
+	rmdir dest.git/objects/pack
 '
 
 test_expect_success 'unpack big object in stream with existing oids' '
 	prepare_dest 1m &&
 	git -C dest.git index-pack --stdin <test-$PACK.pack &&
-	(
-		cd dest.git &&
-		find objects/?? -type f | sort
-	) >actual &&
-	test_must_be_empty actual &&
 	git -C dest.git unpack-objects <test-$PACK.pack &&
-	git -C dest.git fsck &&
-	(
-		cd dest.git &&
-		find objects/?? -type f | sort
-	) >actual &&
-	test_must_be_empty actual
+	assert_no_loose
 '
 
 test_expect_success 'unpack-objects dry-run' '
 	prepare_dest &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
-	(
-		cd dest.git &&
-		find objects/ -type f
-	) >actual &&
-	test_must_be_empty actual
+	assert_no_loose
 '
 
 test_done

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version
  2021-12-21 11:51         ` [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
@ 2021-12-21 14:16           ` Ævar Arnfjörð Bjarmason
  2021-12-22 12:02             ` Jiang Xin
  0 siblings, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 14:16 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [...]
> @@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  		strbuf_reset(tmp);
>  		strbuf_add(tmp, filename, dirlen - 1);
>  		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> -			return -1;
> +			break;
>  		if (adjust_shared_perm(tmp->buf))
> -			return -1;
> +			break;
>  
>  		/* Try again */
>  		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
>  		fd = git_mkstemp_mode(tmp->buf, 0444);
> +	} while (0);
> +
> +	if (fd < 0 && !(flags & HASH_SILENT)) {
> +		if (errno == EACCES)
> +			return error(_("insufficient permission for adding an "
> +				       "object to repository database %s"),
> +				     get_object_directory());

This should be an error_errno() instead, ...

> +		else
> +			return error_errno(_("unable to create temporary file"));

...and we can just fold this whole if/else into one condition with a
briefer message, e.g.:

    error_errno(_("unable to add object to '%s'"), get_object_directory());

Or whatever, unless there's another bug here where you inverted these
conditions, and the "else" really should not use "error_errno" but
"error".... (I don't know...)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream
  2021-12-21 11:52         ` [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream Han Xin
@ 2021-12-21 14:20           ` Ævar Arnfjörð Bjarmason
  2021-12-21 15:05             ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 14:20 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [...]
> +int write_stream_object_file(struct input_stream *in_stream, size_t len,
> +			     enum object_type type, time_t mtime,
> +			     unsigned flags, struct object_id *oid)
> +{
> +	int fd, ret, flush = 0;
> +	unsigned char compressed[4096];
> +	git_zstream stream;
> +	git_hash_ctx c;
> +	struct object_id parano_oid;
> +	static struct strbuf tmp_file = STRBUF_INIT;
> +	static struct strbuf filename = STRBUF_INIT;
> +	int dirlen;
> +	char hdr[MAX_HEADER_LEN];
> +	int hdrlen = sizeof(hdr);
> +
> +	/* Since "filename" is defined as static, it will be reused. So reset it
> +	 * first before using it. */
> +	strbuf_reset(&filename);
> +	/* When oid is not determined, save tmp file to odb path. */
> +	strbuf_addf(&filename, "%s/", get_object_directory());

I realize this is somewhat following the pattern of code you moved
around earlier, but FWIW I think these sorts of comments are really
over-doing it. I.e. we try not to comment on things that are obvious
from the code itself.

Also René's comment on v6 still applies here:

    Given that this function is only used for huge objects I think making
    the strbufs non-static and releasing them is the best choice here.

I think just making them non-static and doing a strbuf_release() as he
suggested is best here.

> +
> +	fd = create_tmpfile(&tmp_file, filename.buf, flags);
> +	if (fd < 0)
> +		return -1;
> +
> +	hdrlen = format_object_header(hdr, hdrlen, type, len);
> +
> +	/* Set it up and write header */
> +	setup_stream_and_header(&stream, compressed, sizeof(compressed),
> +				&c, hdr, hdrlen);
> +
> +	/* Then the data itself.. */
> +	do {
> +		unsigned char *in0 = stream.next_in;
> +		if (!stream.avail_in) {
> +			const void *in = in_stream->read(in_stream, &stream.avail_in);
> +			stream.next_in = (void *)in;
> +			in0 = (unsigned char *)in;
> +			/* All data has been read. */
> +			if (len + hdrlen == stream.total_in + stream.avail_in)
> +				flush = Z_FINISH;
> +		}
> +		ret = git_deflate(&stream, flush);
> +		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> +		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> +			die(_("unable to write loose object file"));
> +		stream.next_out = compressed;
> +		stream.avail_out = sizeof(compressed);
> +	} while (ret == Z_OK || ret == Z_BUF_ERROR);
> +
> +	if (ret != Z_STREAM_END)
> +		die(_("unable to deflate new object streamingly (%d)"), ret);
> +	ret = git_deflate_end_gently(&stream);
> +	if (ret != Z_OK)
> +		die(_("deflateEnd on object streamingly failed (%d)"), ret);

nit: let's say "unable to stream deflate new object" or something, and
not use the confusing (invented?) word "streamingly".

> +	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +
> +	close_loose_object(fd);
> +
> +	oidcpy(oid, &parano_oid);

I see there's still quite a bit of duplication between this and
write_loose_object(), but maybe it's not easy to factor out.

> +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
> +		unlink_or_warn(tmp_file.buf);
> +		return 0;
> +	}
> +
> +	loose_object_path(the_repository, &filename, oid);
> +
> +	/* We finally know the object path, and create the missing dir. */
> +	dirlen = directory_size(filename.buf);
> +	if (dirlen) {
> +		struct strbuf dir = STRBUF_INIT;
> +		strbuf_add(&dir, filename.buf, dirlen - 1);

Just a minor nit, but I noticed we could have this on top, i.e. this
"remove the slash" is now what 1/3 users of it want:
	
	 object-file.c | 10 +++++-----
	 1 file changed, 5 insertions(+), 5 deletions(-)
	
	diff --git a/object-file.c b/object-file.c
	index 77a3217fd0e..b0dea96906e 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1878,13 +1878,13 @@ static void close_loose_object(int fd)
	 		die_errno(_("error when closing loose object file"));
	 }
	 
	-/* Size of directory component, including the ending '/' */
	+/* Size of directory component, excluding the ending '/' */
	 static inline int directory_size(const char *filename)
	 {
	 	const char *s = strrchr(filename, '/');
	 	if (!s)
	 		return 0;
	-	return s - filename + 1;
	+	return s - filename;
	 }
	 
	 /*
	@@ -1901,7 +1901,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
	 
	 	strbuf_reset(tmp);
	 	strbuf_add(tmp, filename, dirlen);
	-	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
	+	strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
	 	fd = git_mkstemp_mode(tmp->buf, 0444);
	 	do {
	 		if (fd >= 0 || !dirlen || errno != ENOENT)
	@@ -1913,7 +1913,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
	 		 * scratch.
	 		 */
	 		strbuf_reset(tmp);
	-		strbuf_add(tmp, filename, dirlen - 1);
	+		strbuf_add(tmp, filename, dirlen);
	 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
	 			break;
	 		if (adjust_shared_perm(tmp->buf))
	@@ -2100,7 +2100,7 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
	 	dirlen = directory_size(filename.buf);
	 	if (dirlen) {
	 		struct strbuf dir = STRBUF_INIT;
	-		strbuf_add(&dir, filename.buf, dirlen - 1);
	+		strbuf_add(&dir, filename.buf, dirlen);
	 
	 		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
	 			ret = error_errno(_("unable to create directory %s"), dir.buf);

On my platform (linux) it's not needed either way, a "mkdir foo" works
as well as "mkdir foo/", but maybe some OSes have trouble with it.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 2/5] object-file API: add a format_object_header() function
  2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
@ 2021-12-21 14:30           ` René Scharfe
  2022-02-01 14:28             ` C99 %z (was: [PATCH v7 2/5] object-file API: add a format_object_header() function) Ævar Arnfjörð Bjarmason
  2021-12-31  3:12           ` [PATCH v7 2/5] object-file API: add a format_object_header() function Jiang Xin
  1 sibling, 1 reply; 165+ messages in thread
From: René Scharfe @ 2021-12-21 14:30 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 21.12.21 um 12:51 schrieb Han Xin:
> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>
> Add a convenience function to wrap the xsnprintf() command that
> generates loose object headers. This code was copy/pasted in various
> parts of the codebase, let's define it in one place and re-use it from
> there.
>
> All except one caller of it had a valid "enum object_type" for us,
> it's only write_object_file_prepare() which might need to deal with
> "git hash-object --literally" and a potential garbage type. Let's have
> the primary API use an "enum object_type", and define an *_extended()
> function that can take an arbitrary "const char *" for the type.
>
> See [1] for the discussion that prompted this patch, i.e. new code in
> object-file.c that wanted to copy/paste the xsnprintf() invocation.
>
> 1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/index-pack.c |  3 +--
>  bulk-checkin.c       |  4 ++--
>  cache.h              | 21 +++++++++++++++++++++
>  http-push.c          |  2 +-
>  object-file.c        | 14 +++++++++++---
>  5 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index c23d01de7d..4a765ddae6 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
>  	int hdrlen;
>
>  	if (!is_delta_type(type)) {
> -		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
> -				   type_name(type),(uintmax_t)size) + 1;
> +		hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)size);
                                                                      ^^^^^^^^^^^
This explicit cast is unnecessary.  It was needed with xsnprintf(), but
that implementation detail is handled inside the new helper function.

(format_object_header() takes a size_t; even if unsigned long would be
wider than that on some weird architecture, casting the size to
uintmax_t will not avoid the implicit truncation happening during the
function call.)

>  		the_hash_algo->init_fn(&c);
>  		the_hash_algo->update_fn(&c, hdr, hdrlen);
>  	} else
> diff --git a/bulk-checkin.c b/bulk-checkin.c
> index 8785b2ac80..1733a1de4f 100644
> --- a/bulk-checkin.c
> +++ b/bulk-checkin.c
> @@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
>  	if (seekback == (off_t) -1)
>  		return error("cannot find the current offset");
>
> -	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
> -			       type_name(type), (uintmax_t)size) + 1;
> +	header_len = format_object_header((char *)obuf, sizeof(obuf),
> +					 type, (uintmax_t)size);
                                               ^^^^^^^^^^^
Same here, just that size is already of type size_t, so a cast makes
even less sense.

>  	the_hash_algo->init_fn(&ctx);
>  	the_hash_algo->update_fn(&ctx, obuf, header_len);
>
> diff --git a/cache.h b/cache.h
> index cfba463aa9..64071a8d80 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -1310,6 +1310,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
>  						    unsigned long bufsiz,
>  						    struct strbuf *hdrbuf);
>
> +/**
> + * format_object_header() is a thin wrapper around s xsnprintf() that
> + * writes the initial "<type> <obj-len>" part of the loose object
> + * header. It returns the size that snprintf() returns + 1.
> + *
> + * The format_object_header_extended() function allows for writing a
> + * type_name that's not one of the "enum object_type" types. This is
> + * used for "git hash-object --literally". Pass in a OBJ_NONE as the
> + * type, and a non-NULL "type_str" to do that.
> + *
> + * format_object_header() is a convenience wrapper for
> + * format_object_header_extended().
> + */
> +int format_object_header_extended(char *str, size_t size, enum object_type type,
> +				 const char *type_str, size_t objsize);
> +static inline int format_object_header(char *str, size_t size,
> +				      enum object_type type, size_t objsize)
> +{
> +	return format_object_header_extended(str, size, type, NULL, objsize);
> +}
> +
>  /**
>   * parse_loose_header() parses the starting "<type> <len>\0" of an
>   * object. If it doesn't follow that format -1 is returned. To check
> diff --git a/http-push.c b/http-push.c
> index 3309aaf004..f55e316ff4 100644
> --- a/http-push.c
> +++ b/http-push.c
> @@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
>  	git_zstream stream;
>
>  	unpacked = read_object_file(&request->obj->oid, &type, &len);
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> +	hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)len);
                                                              ^^^^^^^^^^^
Same here; len is of type unsigned long.

>
>  	/* Set it up */
>  	git_deflate_init(&stream, zlib_compression_level);
> diff --git a/object-file.c b/object-file.c
> index eb1426f98c..6bba4766f9 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
>  	return ret;
>  }
>
> +int format_object_header_extended(char *str, size_t size, enum object_type type,
> +				 const char *typestr, size_t objsize)
> +{
> +	const char *s = type == OBJ_NONE ? typestr : type_name(type);
> +
> +	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
                                                      ^^^^^^^^^^^
This cast is necessary to match PRIuMAX.  And that is used because the z
modifier (as in e.g. printf("%zu", sizeof(size_t));) was only added in
C99 and not all platforms may have it.  (Perhaps this cautious approach
is worth revisiting separately, now that some time has passed, but this
patch series should still use PRIuMAX, as it does.)

> +}
> +
>  /*
>   * With an in-core object data in "map", rehash it to make sure the
>   * object name actually matches "oid" to detect object corruption.
> @@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
>  		return -1;
>
>  	/* Generate the header */
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
> +	hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size);
>
>  	/* Sha1.. */
>  	r->hash_algo->init_fn(&c);
> @@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
>  	git_hash_ctx c;
>
>  	/* Generate the header */
> -	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
> +	*hdrlen = format_object_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
>
>  	/* Sha1.. */
>  	algo->init_fn(&c);
> @@ -2006,7 +2014,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	buf = read_object(the_repository, oid, &type, &len);
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> +	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
>  	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
>  	free(buf);
>

No explicit cast in these three cases -- good.  They all pass an
unsigned long as last parameter btw.

René

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
@ 2021-12-21 14:43             ` René Scharfe
  2021-12-21 15:04               ` Ævar Arnfjörð Bjarmason
  2021-12-22 11:15               ` Jiang Xin
  2021-12-22 11:29             ` Jiang Xin
  1 sibling, 2 replies; 165+ messages in thread
From: René Scharfe @ 2021-12-21 14:43 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason, Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

Am 21.12.21 um 15:09 schrieb Ævar Arnfjörð Bjarmason:
>
> On Tue, Dec 21 2021, Han Xin wrote:
>
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>
>> In dry_run mode, "get_data()" is used to verify the inflation of data,
>> and the returned buffer will not be used at all and will be freed
>> immediately. Even in dry_run mode, it is dangerous to allocate a
>> full-size buffer for a large blob object. Therefore, only allocate a
>> low memory footprint when calling "get_data()" in dry_run mode.
>>
>> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>> ---
>>  builtin/unpack-objects.c            | 23 +++++++++---
>>  t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
>>  2 files changed, 74 insertions(+), 6 deletions(-)
>>  create mode 100755 t/t5590-unpack-non-delta-objects.sh
>>
>> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>> index 4a9466295b..9104eb48da 100644
>> --- a/builtin/unpack-objects.c
>> +++ b/builtin/unpack-objects.c
>> @@ -96,15 +96,21 @@ static void use(int bytes)
>>  	display_throughput(progress, consumed_bytes);
>>  }
>>
>> -static void *get_data(unsigned long size)
>> +static void *get_data(size_t size, int dry_run)
>>  {
>>  	git_zstream stream;
>> -	void *buf = xmallocz(size);
>> +	size_t bufsize;
>> +	void *buf;
>>
>>  	memset(&stream, 0, sizeof(stream));
>> +	if (dry_run && size > 8192)
>> +		bufsize = 8192;
>> +	else
>> +		bufsize = size;
>> +	buf = xmallocz(bufsize);
>
> Maybe I'm misunderstanding this, but the commit message says it would be
> dangerous to allocate a very larger buffer, but here we only limit the
> size under "dry_run".

This patch reduces the memory usage of dry runs, as its commit message
says.  The memory usage of one type of actual (non-dry) unpack is reduced
by patch 5.

> Removing that "&& size > 8192" makes all the tests pass still, so there
> seems to be some missing coverage here in any case.

How would you test that an 8KB buffer is allocated even though a smaller
one would suffice?  And why?  Wasting a few KB shouldn't be noticeable.

René

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 14:43             ` René Scharfe
@ 2021-12-21 15:04               ` Ævar Arnfjörð Bjarmason
  2021-12-22 11:15               ` Jiang Xin
  1 sibling, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 15:04 UTC (permalink / raw)
  To: René Scharfe
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Derrick Stolee, Han Xin


On Tue, Dec 21 2021, René Scharfe wrote:

> Am 21.12.21 um 15:09 schrieb Ævar Arnfjörð Bjarmason:
>>
>> On Tue, Dec 21 2021, Han Xin wrote:
>>
>>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>>
>>> In dry_run mode, "get_data()" is used to verify the inflation of data,
>>> and the returned buffer will not be used at all and will be freed
>>> immediately. Even in dry_run mode, it is dangerous to allocate a
>>> full-size buffer for a large blob object. Therefore, only allocate a
>>> low memory footprint when calling "get_data()" in dry_run mode.
>>>
>>> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>>> ---
>>>  builtin/unpack-objects.c            | 23 +++++++++---
>>>  t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
>>>  2 files changed, 74 insertions(+), 6 deletions(-)
>>>  create mode 100755 t/t5590-unpack-non-delta-objects.sh
>>>
>>> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>>> index 4a9466295b..9104eb48da 100644
>>> --- a/builtin/unpack-objects.c
>>> +++ b/builtin/unpack-objects.c
>>> @@ -96,15 +96,21 @@ static void use(int bytes)
>>>  	display_throughput(progress, consumed_bytes);
>>>  }
>>>
>>> -static void *get_data(unsigned long size)
>>> +static void *get_data(size_t size, int dry_run)
>>>  {
>>>  	git_zstream stream;
>>> -	void *buf = xmallocz(size);
>>> +	size_t bufsize;
>>> +	void *buf;
>>>
>>>  	memset(&stream, 0, sizeof(stream));
>>> +	if (dry_run && size > 8192)
>>> +		bufsize = 8192;
>>> +	else
>>> +		bufsize = size;
>>> +	buf = xmallocz(bufsize);
>>
>> Maybe I'm misunderstanding this, but the commit message says it would be
>> dangerous to allocate a very larger buffer, but here we only limit the
>> size under "dry_run".
>
> This patch reduces the memory usage of dry runs, as its commit message
> says.  The memory usage of one type of actual (non-dry) unpack is reduced
> by patch 5.
>
>> Removing that "&& size > 8192" makes all the tests pass still, so there
>> seems to be some missing coverage here in any case.
>
> How would you test that an 8KB buffer is allocated even though a smaller
> one would suffice?  And why?  Wasting a few KB shouldn't be noticeable.

That doesn't sound like it needs to be tested. I was just trying to grok
what this was all doing. Thanks!

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream
  2021-12-21 14:20           ` Ævar Arnfjörð Bjarmason
@ 2021-12-21 15:05             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 15:05 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Ævar Arnfjörð Bjarmason wrote:

> On Tue, Dec 21 2021, Han Xin wrote:

>> +	/* Then the data itself.. */
>> +	do {
>> +		unsigned char *in0 = stream.next_in;
>> +		if (!stream.avail_in) {
>> +			const void *in = in_stream->read(in_stream, &stream.avail_in);
>> +			stream.next_in = (void *)in;
>> +			in0 = (unsigned char *)in;
>> +			/* All data has been read. */
>> +			if (len + hdrlen == stream.total_in + stream.avail_in)
>> +				flush = Z_FINISH;
>> +		}
>> +		ret = git_deflate(&stream, flush);
>> +		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
>> +		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>> +			die(_("unable to write loose object file"));
>> +		stream.next_out = compressed;
>> +		stream.avail_out = sizeof(compressed);
>> +	} while (ret == Z_OK || ret == Z_BUF_ERROR);
>> +
>> +	if (ret != Z_STREAM_END)
>> +		die(_("unable to deflate new object streamingly (%d)"), ret);
>> +	ret = git_deflate_end_gently(&stream);
>> +	if (ret != Z_OK)
>> +		die(_("deflateEnd on object streamingly failed (%d)"), ret);
>
> nit: let's say "unable to stream deflate new object" or something, and
> not use the confusing (invented?) word "streamingly".
>
>> +	the_hash_algo->final_oid_fn(&parano_oid, &c);
>> +
>> +	close_loose_object(fd);
>> +
>> +	oidcpy(oid, &parano_oid);
>
> I see there's still quite a bit of duplication between this and
> write_loose_object(), but maybe it's not easy to factor out.

For what it's worth I tried to do that and the result doesn't really
seem worth it. I.e. something like the below. The inner loop of the
do/while looks like it could get a similar treatment, but likewise
doesn't seem worth the effort.

diff --git a/object-file.c b/object-file.c
index b0dea96906e..7fc2363cfa1 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1957,6 +1957,46 @@ static void setup_stream_and_header(git_zstream *stream,
 	the_hash_algo->update_fn(c, hdr, hdrlen);
 }
 
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     enum object_type type, size_t len,
+				     char *hdr, int *hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename, flags);
+	if (fd < 0)
+		return -1;
+
+	if (type != OBJ_NONE)
+		*hdrlen = format_object_header(hdr, *hdrlen, type, len);
+
+	/* Set it up and write header */
+	setup_stream_and_header(stream, buf, buflen, c, hdr, *hdrlen);
+
+	return fd;
+
+}
+
+static void end_loose_object_common(int ret, git_hash_ctx *c,
+				    git_zstream *stream,
+				    struct object_id *parano_oid,
+				    const struct object_id *expected_oid,
+				    const char *zstream_end_fmt,
+				    const char *z_ok_fmt)
+{
+	if (ret != Z_STREAM_END)
+		die(_(zstream_end_fmt), ret, expected_oid);
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		die(_(z_ok_fmt), ret, expected_oid);
+	the_hash_algo->final_oid_fn(parano_oid, c);
+}
+
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1970,15 +2010,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf filename = STRBUF_INIT;
 
 	loose_object_path(the_repository, &filename, oid);
-
-	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, OBJ_NONE, 0, hdr, &hdrlen);
 	if (fd < 0)
 		return -1;
 
-	/* Set it up and write header */
-	setup_stream_and_header(&stream, compressed, sizeof(compressed),
-				&c, hdr, hdrlen);
-
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
@@ -1992,14 +2029,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-		    ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
+				N_("unable to deflate new object %s (%d)"),
+				N_("deflateEnd on object %s failed (%d)"));
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
@@ -2049,16 +2081,12 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
 	/* When oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
 
-	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, type, len, hdr, &hdrlen);
 	if (fd < 0)
 		return -1;
 
-	hdrlen = format_object_header(hdr, hdrlen, type, len);
-
-	/* Set it up and write header */
-	setup_stream_and_header(&stream, compressed, sizeof(compressed),
-				&c, hdr, hdrlen);
-
 	/* Then the data itself.. */
 	do {
 		unsigned char *in0 = stream.next_in;
@@ -2078,12 +2106,9 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK || ret == Z_BUF_ERROR);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object streamingly (%d)"), ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object streamingly failed (%d)"), ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	end_loose_object_common(ret, &c, &stream, &parano_oid, NULL,
+				N_("unable to deflate new object streamingly (%d)"),
+				N_("deflateEnd on object streamingly failed (%d)"));
 
 	close_loose_object(fd);
 

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-21 15:06           ` Ævar Arnfjörð Bjarmason
  2021-12-31  3:19           ` Jiang Xin
  1 sibling, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 15:06 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> By implementing a zstream version of input_stream interface, we can use
> a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpack non-delta objects from a stream instead of from an
> entire buffer will have a 10% performance penalty. Therefore, only unpack
> objects larger than the "core.BigFileStreamingThreshold" in zstream. See
> the following benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git'
>
>     Summary
>       './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
>         1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
>         1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
>         1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Derrick Stolee <stolee@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  Documentation/config/core.txt       | 11 +++++
>  builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
>  cache.h                             |  1 +
>  config.c                            |  5 ++
>  environment.c                       |  1 +
>  t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
>  6 files changed, 125 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index c04f62a54a..601b7a2418 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
>  +
>  Common unit suffixes of 'k', 'm', or 'g' are supported.
>  
> +core.bigFileStreamingThreshold::
> +	Files larger than this will be streamed out to a temporary
> +	object file while being hashed, which will then be renamed
> +	in-place to a loose object, particularly if the
> +	`core.bigFileThreshold` setting dictates that they're always
> +	written out as loose objects.
> ++
> +Default is 128 MiB on all platforms.
> ++
> +Common unit suffixes of 'k', 'm', or 'g' are supported.
> +
>  core.excludesFile::
>  	Specifies the pathname to the file that contains patterns to
>  	describe paths that are not meant to be tracked, in addition
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 9104eb48da..72d8616e00 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -331,11 +331,82 @@ static void added_object(unsigned nr, enum object_type type,
>  	}
>  }
>  
> +struct input_zstream_data {
> +	git_zstream *zstream;
> +	unsigned char buf[8192];
> +	int status;
> +};
> +
> +static const void *feed_input_zstream(struct input_stream *in_stream,
> +				      unsigned long *readlen)
> +{
> +	struct input_zstream_data *data = in_stream->data;
> +	git_zstream *zstream = data->zstream;
> +	void *in = fill(1);
> +
> +	if (!len || data->status == Z_STREAM_END) {
> +		*readlen = 0;
> +		return NULL;
> +	}
> +
> +	zstream->next_out = data->buf;
> +	zstream->avail_out = sizeof(data->buf);
> +	zstream->next_in = in;
> +	zstream->avail_in = len;
> +
> +	data->status = git_inflate(zstream, 0);
> +	use(len - zstream->avail_in);
> +	*readlen = sizeof(data->buf) - zstream->avail_out;
> +
> +	return data->buf;
> +}
> +
> +static void write_stream_blob(unsigned nr, size_t size)
> +{
> +	git_zstream zstream;
> +	struct input_zstream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_input_zstream,
> +		.data = &data,
> +	};
> +
> +	memset(&zstream, 0, sizeof(zstream));
> +	memset(&data, 0, sizeof(data));

nit/style: both of these memset can be replaced by "{ 0 }", e.g. "git_zstream zstream = { 0 }".

> +	data.zstream = &zstream;
> +	git_inflate_init(&zstream);
> +
> +	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
> +				     &obj_list[nr].oid))

So at the end of this series we never pass in anything but blob here,
mtime is always 0 etc. So there was no reason to create a factored out
finalize_object_file_with_mtime() earlier in the series.

Well, I don't mind the finalize_object_file_with_mtime() existing, but
let's not pretend this is more generalized than it is. We're unlikely to
ever want to do this for non-blobs.

This on top of this series (and my local WIP fixups as I'm reviewing
this, so it won't cleanly apply, but the idea should be clear) makes
this simpler:
	
	diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
	index 2f8d34a2e47..a3a1d4b266f 100644
	--- a/builtin/unpack-objects.c
	+++ b/builtin/unpack-objects.c
	@@ -375,8 +375,7 @@ static void write_stream_blob(unsigned nr, size_t size)
	 	data.zstream = &zstream;
	 	git_inflate_init(&zstream);
	 
	-	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
	-				     &obj_list[nr].oid))
	+	if (write_stream_object_file(&in_stream, size, &obj_list[nr].oid))
	 		die(_("failed to write object in stream"));
	 
	 	if (zstream.total_out != size || data.status != Z_STREAM_END)
	diff --git a/object-file.c b/object-file.c
	index 7fc2363cfa1..0572b34fc5a 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -2061,8 +2061,7 @@ static int freshen_packed_object(const struct object_id *oid)
	 }
	 
	 int write_stream_object_file(struct input_stream *in_stream, size_t len,
	-			     enum object_type type, time_t mtime,
	-			     unsigned flags, struct object_id *oid)
	+			     struct object_id *oid)
	 {
	 	int fd, ret, flush = 0;
	 	unsigned char compressed[4096];
	@@ -2081,9 +2080,9 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
	 	/* When oid is not determined, save tmp file to odb path. */
	 	strbuf_addf(&filename, "%s/", get_object_directory());
	 
	-	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
	+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
	 				       &stream, compressed, sizeof(compressed),
	-				       &c, type, len, hdr, &hdrlen);
	+				       &c, OBJ_BLOB, len, hdr, &hdrlen);
	 	if (fd < 0)
	 		return -1;
	 
	@@ -2135,7 +2134,7 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
	 		strbuf_release(&dir);
	 	}
	 
	-	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
	+	return finalize_object_file(tmp_file.buf, filename.buf);
	 }
	 
	 int write_object_file_flags(const void *buf, unsigned long len,
	diff --git a/object-store.h b/object-store.h
	index 87d370d39ca..1362b58a4d3 100644
	--- a/object-store.h
	+++ b/object-store.h
	@@ -257,8 +257,7 @@ int hash_write_object_file_literally(const void *buf, unsigned long len,
	 				     unsigned flags);
	 
	 int write_stream_object_file(struct input_stream *in_stream, size_t len,
	-			     enum object_type type, time_t mtime,
	-			     unsigned flags, struct object_id *oid);
	+			     struct object_id *oid);
	 
	 /*
	  * Add an object file to the in-memory object store, without writing it
	

> +		die(_("failed to write object in stream"));
> diff --git a/environment.c b/environment.c
> index 0d06a31024..04bba593de 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -47,6 +47,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
>  size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
>  size_t delta_base_cache_limit = 96 * 1024 * 1024;
>  unsigned long big_file_threshold = 512 * 1024 * 1024;
> +unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
>  int pager_use_color = 1;
>  const char *editor_program;
>  const char *askpass_program;
> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> index 48c4fb1ba3..8436cbf8db 100755
> --- a/t/t5590-unpack-non-delta-objects.sh
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -13,6 +13,11 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
>  prepare_dest () {
>  	test_when_finished "rm -rf dest.git" &&
>  	git init --bare dest.git
> +	if test -n "$1"
> +	then
> +		git -C dest.git config core.bigFileStreamingThreshold $1
> +		git -C dest.git config core.bigFileThreshold $1
> +	fi

All of this new code is missing "&&" to chain and test for failures.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 14:43             ` René Scharfe
  2021-12-21 15:04               ` Ævar Arnfjörð Bjarmason
@ 2021-12-22 11:15               ` Jiang Xin
  1 sibling, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-12-22 11:15 UTC (permalink / raw)
  To: René Scharfe
  Cc: Ævar Arnfjörð Bjarmason, Han Xin, Junio C Hamano,
	Git List, Jeff King, Jiang Xin, Philip Oakley, Derrick Stolee,
	Han Xin

On Wed, Dec 22, 2021 at 9:53 AM René Scharfe <l.s.r@web.de> wrote:
>
> Am 21.12.21 um 15:09 schrieb Ævar Arnfjörð Bjarmason:
> > Maybe I'm misunderstanding this, but the commit message says it would be
> > dangerous to allocate a very larger buffer, but here we only limit the
> > size under "dry_run".
>
> This patch reduces the memory usage of dry runs, as its commit message
> says.  The memory usage of one type of actual (non-dry) unpack is reduced
> by patch 5.
>

For Han Xin and me, it is very challenging to write better commit log
in English.  Since the commit is moved to the beginning, the commit
log should be rewritten as follows:

unpack-objects.c: low memory footprint for get_data() in dry_run mode

As the name implies, "get_data(size)" will allocate and return a given
size of memory. Allocating memory for a large blob object may cause the
system to run out of memory. Before replacing the call to "get_data()"
to resolve the unpack issue for large blob objects, refactor
"get_data()" to reduce its memory footprint in dry_run mode. Because
in dry_run mode, "get_data()" is only used to check the integrity of
data, and the returned buffer is not used at all.

Therefore, add the flag "dry_run" as an additional parameter of
"get_data()" and reuse a small buffer in dry_run mode. Because in
dry_run mode, the return buffer is not the entire data that the user
wants, for this reason, we will release the buffer and return NULL.

Han Xin, I think you can try to free the allocated buffer for dry_run
mode inside "get_data()".

--
Jiang Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
  2021-12-21 14:43             ` René Scharfe
@ 2021-12-22 11:29             ` Jiang Xin
  1 sibling, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-12-22 11:29 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Derrick Stolee, René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 8:37 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Tue, Dec 21 2021, Han Xin wrote:
>
> I commented on this "find" usage in an earlier round, I think there's a
> much easier way to do this. You're really just going back and forth
> between checking whether or not all the objects are loose.
>
> I think that the below fix-up on top of this series is a better way to
> do that, and more accurate. I.e. in your test here you check "!
> test_cmp", which means that we could have some packed and some loose,
> but really what you're meaning to check is a flip-flop between "all
> loose?" and "no loose?.
>
> In addition to that there was no reason to hardcode "main", we can just
> use HEAD. All in all I think the below fix-up makes sense:
>
> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> index 8436cbf8db6..d78bb89225d 100755
> --- a/t/t5590-unpack-non-delta-objects.sh
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -5,9 +5,6 @@
>
>  test_description='Test unpack-objects with non-delta objects'
>
> -GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> -export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> -
>  . ./test-lib.sh
>
>  prepare_dest () {
> @@ -20,16 +17,22 @@ prepare_dest () {
>         fi
>  }
>
> +assert_no_loose () {
> +       glob=dest.git/objects/?? &&
> +       echo "$glob" >expect &&
> +       echo $glob >actual &&

Incompatible for zsh. This may work:

    eval "echo $glob" >actual &&

--
Jiang Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version
  2021-12-21 14:16           ` Ævar Arnfjörð Bjarmason
@ 2021-12-22 12:02             ` Jiang Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-12-22 12:02 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Derrick Stolee, René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 8:40 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Tue, Dec 21 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> > [...]
> > @@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >               strbuf_reset(tmp);
> >               strbuf_add(tmp, filename, dirlen - 1);
> >               if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> > -                     return -1;
> > +                     break;
> >               if (adjust_shared_perm(tmp->buf))
> > -                     return -1;
> > +                     break;
> >
> >               /* Try again */
> >               strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
> >               fd = git_mkstemp_mode(tmp->buf, 0444);
> > +     } while (0);
> > +
> > +     if (fd < 0 && !(flags & HASH_SILENT)) {
> > +             if (errno == EACCES)
> > +                     return error(_("insufficient permission for adding an "
> > +                                    "object to repository database %s"),
> > +                                  get_object_directory());
>
> This should be an error_errno() instead, ...

We already know the errno (EACCESS) and output a decent error message,
so using error() is OK.  BTW, it's just a refactor by copy & paste.

>
> > +             else
> > +                     return error_errno(_("unable to create temporary file"));
>
> ...and we can just fold this whole if/else into one condition with a
> briefer message, e.g.:
>
>     error_errno(_("unable to add object to '%s'"), get_object_directory());
>
> Or whatever, unless there's another bug here where you inverted these
> conditions, and the "else" really should not use "error_errno" but
> "error".... (I don't know...)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
@ 2021-12-31  3:06           ` Jiang Xin
  1 sibling, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-12-31  3:06 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 2:33 AM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> In dry_run mode, "get_data()" is used to verify the inflation of data,
> and the returned buffer will not be used at all and will be freed
> immediately. Even in dry_run mode, it is dangerous to allocate a
> full-size buffer for a large blob object. Therefore, only allocate a
> low memory footprint when calling "get_data()" in dry_run mode.
>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c            | 23 +++++++++---
>  t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+), 6 deletions(-)
>  create mode 100755 t/t5590-unpack-non-delta-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..9104eb48da 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,21 @@ static void use(int bytes)
>         display_throughput(progress, consumed_bytes);
>  }
>
> -static void *get_data(unsigned long size)
> +static void *get_data(size_t size, int dry_run)

After an offline talk with Han Xin, we feel it is not necessary to pass
"dry_run" as an argument; use the file-scope static variable directly
in "get_data()".

>  {
>         git_zstream stream;
> -       void *buf = xmallocz(size);
> +       size_t bufsize;
> +       void *buf;
>
>         memset(&stream, 0, sizeof(stream));
> +       if (dry_run && size > 8192)

Use the file-scope static variable "dry_run".

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 2/5] object-file API: add a format_object_header() function
  2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
  2021-12-21 14:30           ` René Scharfe
@ 2021-12-31  3:12           ` Jiang Xin
  1 sibling, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-12-31  3:12 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 2:56 AM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>
> Add a convenience function to wrap the xsnprintf() command that
> generates loose object headers. This code was copy/pasted in various
> parts of the codebase, let's define it in one place and re-use it from
> there.
>
> All except one caller of it had a valid "enum object_type" for us,
> it's only write_object_file_prepare() which might need to deal with
> "git hash-object --literally" and a potential garbage type. Let's have
> the primary API use an "enum object_type", and define an *_extended()
> function that can take an arbitrary "const char *" for the type.
>
> See [1] for the discussion that prompted this patch, i.e. new code in
> object-file.c that wanted to copy/paste the xsnprintf() invocation.
>
> 1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/index-pack.c |  3 +--
>  bulk-checkin.c       |  4 ++--
>  cache.h              | 21 +++++++++++++++++++++
>  http-push.c          |  2 +-
>  object-file.c        | 14 +++++++++++---
>  5 files changed, 36 insertions(+), 8 deletions(-)

After an offline review with Han Xin, we feel it's better to move this
fixup commit to the end of this series, and this commit will also fix
an additional "xsnprintf()" we introduced in this series.

--
Jiang Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2021-12-21 15:06           ` Ævar Arnfjörð Bjarmason
@ 2021-12-31  3:19           ` Jiang Xin
  1 sibling, 0 replies; 165+ messages in thread
From: Jiang Xin @ 2021-12-31  3:19 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 2:56 AM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> By implementing a zstream version of input_stream interface, we can use
> a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpack non-delta objects from a stream instead of from an
> entire buffer will have a 10% performance penalty. Therefore, only unpack
> objects larger than the "core.BigFileStreamingThreshold" in zstream. See
> the following benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git'
>
>     Summary
>       './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
>         1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
>         1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
>         1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Derrick Stolee <stolee@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  Documentation/config/core.txt       | 11 +++++
>  builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
>  cache.h                             |  1 +
>  config.c                            |  5 ++
>  environment.c                       |  1 +
>  t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
>  6 files changed, 125 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index c04f62a54a..601b7a2418 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
>  +
>  Common unit suffixes of 'k', 'm', or 'g' are supported.
>
> +core.bigFileStreamingThreshold::
> +       Files larger than this will be streamed out to a temporary
> +       object file while being hashed, which will then be renamed
> +       in-place to a loose object, particularly if the
> +       `core.bigFileThreshold` setting dictates that they're always
> +       written out as loose objects.

Han Xin told me the reason to introduce another git config variable,
but I feel it not good to introduce an application specific config
variable as "core.XXX" and parsing it in "config.c".

So in patch v8, will still reuse the config variable
"core.bigFileThreshold", and will introduce an application specific
config variable, such as unpack.bigFileThreshold and parse the new
config in "builtin/unpack-objects.c".

--
Jiang Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v8 0/6] unpack large blobs in stream
  2021-12-17 11:26       ` Han Xin
                           ` (5 preceding siblings ...)
  2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
                             ` (5 more replies)
  2022-01-08  8:54         ` [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
                           ` (5 subsequent siblings)
  12 siblings, 6 replies; 165+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v7:
* Use functions "assert_no_loose()" and "assert_no_pack()" to do tests instead
  of "find", as suggested by Ævar Arnfjörð Bjarmason[1].

* "get_data()" now use the global "dry_run" and it will release the buf before
  returning.

* Add a new commit "object-file.c: remove the slash for directory_size()"
  as suggested by Ævar Arnfjörð Bjarmason[2].

* Add "int is_finished" to "struct input_stream" who will tell us if there is 
  next buffer in the stream.

* Remove the config "core.bigFileStreamingThreshold" introduced in v5, and keep
  using "core.bigFileThreshold". Until now, the config variable has been used in
  the cases listed in "unpack-objects: unpack_non_delta_entry() read data in a
  stream", this new case belongs to the packfile category.

* Remove unnecessary explicit cast in "object-file API: add a 
  format_object_header() function", as suggested by René Scharfe[3].

1. https://lore.kernel.org/git/211221.86bl1arqls.gmgdl@evledraar.gmail.com/
2. https://lore.kernel.org/git/211221.8635mmrpps.gmgdl@evledraar.gmail.com/
3. https://lore.kernel.org/git/b2dee243-1a38-531e-02b1-ffd66c465fa5@web.de/

Han Xin (5):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: remove the slash for directory_size()
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: unpack_non_delta_entry() read data in a stream

Ævar Arnfjörð Bjarmason (1):
  object-file API: add a format_object_header() function

 builtin/index-pack.c            |   3 +-
 builtin/unpack-objects.c        | 110 +++++++++++--
 bulk-checkin.c                  |   4 +-
 cache.h                         |  21 +++
 http-push.c                     |   2 +-
 object-file.c                   | 272 ++++++++++++++++++++++++++------
 object-store.h                  |   9 ++
 t/t5329-unpack-large-objects.sh |  69 ++++++++
 8 files changed, 422 insertions(+), 68 deletions(-)
 create mode 100755 t/t5329-unpack-large-objects.sh

Range-diff against v7:
1:  a8f232f553 < -:  ---------- unpack-objects.c: add dry_run mode for get_data()
-:  ---------- > 1:  bd34da5816 unpack-objects: low memory footprint for get_data() in dry_run mode
3:  a571b8f16c ! 2:  f9a4365a7d object-file.c: refactor write_loose_object() to reuse in stream version
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: refactor write_loose_object() to reuse in stream version
    +    object-file.c: refactor write_loose_object() to several steps
     
    -    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    -    entire contents of a blob object, no matter how big it is. This
    -    implementation may consume all the memory and cause OOM.
    +    When writing a large blob using "write_loose_object()", we have to pass
    +    a buffer with the whole content of the blob, and this behavior will
    +    consume lots of memory and may cause OOM. We will introduce a stream
    +    version function ("stream_loose_object()") in a later commit to resolve
    +    this issue.
     
    -    This can be improved by feeding data to "stream_loose_object()" in
    -    stream instead of read into the whole buf.
    +    Before introducing a stream version function for writing loose object,
    +    do some refactoring on "write_loose_object()" to reuse code for both
    +    versions.
     
    -    As this new method "stream_loose_object()" has many similarities with
    -    "write_loose_object()", we split up "write_loose_object()" into some
    -    steps:
    -     1. Figuring out a path for the (temp) object file.
    -     2. Creating the tempfile.
    -     3. Setting up zlib and write header.
    -     4. Write object data and handle errors.
    -     5. Optionally, do someting after write, maybe force a loose object if
    -    "mtime".
    +    Rewrite "write_loose_object()" as follows:
    +
    +     1. Figure out a path for the (temp) object file. This step is only
    +        used in "write_loose_object()".
    +
    +     2. Move common steps for starting to write loose objects into a new
    +        function "start_loose_object_common()".
    +
    +     3. Compress data.
    +
    +     4. Move common steps for ending zlib stream into a new function
    +        "end_loose_object_common()".
    +
    +     5. Close fd and finalize the object file.
     
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
    +    Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
      	return fd;
      }
      
    -+static void setup_stream_and_header(git_zstream *stream,
    -+				    unsigned char *compressed,
    -+				    unsigned long compressed_size,
    -+				    git_hash_ctx *c,
    -+				    char *hdr,
    -+				    int hdrlen)
    ++static int start_loose_object_common(struct strbuf *tmp_file,
    ++				     const char *filename, unsigned flags,
    ++				     git_zstream *stream,
    ++				     unsigned char *buf, size_t buflen,
    ++				     git_hash_ctx *c,
    ++				     enum object_type type, size_t len,
    ++				     char *hdr, int hdrlen)
     +{
    -+	/* Set it up */
    ++	int fd;
    ++
    ++	fd = create_tmpfile(tmp_file, filename, flags);
    ++	if (fd < 0)
    ++		return -1;
    ++
    ++	/*  Setup zlib stream for compression */
     +	git_deflate_init(stream, zlib_compression_level);
    -+	stream->next_out = compressed;
    -+	stream->avail_out = compressed_size;
    ++	stream->next_out = buf;
    ++	stream->avail_out = buflen;
     +	the_hash_algo->init_fn(c);
     +
    -+	/* First header.. */
    ++	/*  Start to feed header to zlib stream */
     +	stream->next_in = (unsigned char *)hdr;
     +	stream->avail_in = hdrlen;
     +	while (git_deflate(stream, 0) == Z_OK)
     +		; /* nothing */
     +	the_hash_algo->update_fn(c, hdr, hdrlen);
    ++
    ++	return fd;
    ++}
    ++
    ++static void end_loose_object_common(int ret, git_hash_ctx *c,
    ++				    git_zstream *stream,
    ++				    struct object_id *parano_oid,
    ++				    const struct object_id *expected_oid,
    ++				    const char *die_msg1_fmt,
    ++				    const char *die_msg2_fmt)
    ++{
    ++	if (ret != Z_STREAM_END)
    ++		die(_(die_msg1_fmt), ret, expected_oid);
    ++	ret = git_deflate_end_gently(stream);
    ++	if (ret != Z_OK)
    ++		die(_(die_msg2_fmt), ret, expected_oid);
    ++	the_hash_algo->final_oid_fn(parano_oid, c);
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -	stream.next_out = compressed;
     -	stream.avail_out = sizeof(compressed);
     -	the_hash_algo->init_fn(&c);
    -+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
    -+	if (fd < 0)
    -+		return -1;
    - 
    +-
     -	/* First header.. */
     -	stream.next_in = (unsigned char *)hdr;
     -	stream.avail_in = hdrlen;
     -	while (git_deflate(&stream, 0) == Z_OK)
     -		; /* nothing */
     -	the_hash_algo->update_fn(&c, hdr, hdrlen);
    -+	/* Set it up and write header */
    -+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
    -+				&c, hdr, hdrlen);
    ++	/* Common steps for write_loose_object and stream_loose_object to
    ++	 * start writing loose oject:
    ++	 *
    ++	 *  - Create tmpfile for the loose object.
    ++	 *  - Setup zlib stream for compression.
    ++	 *  - Start to feed header to zlib stream.
    ++	 */
    ++	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
    ++				       &stream, compressed, sizeof(compressed),
    ++				       &c, OBJ_NONE, 0, hdr, hdrlen);
    ++	if (fd < 0)
    ++		return -1;
      
      	/* Then the data itself.. */
      	stream.next_in = (void *)buf;
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    + 		stream.avail_out = sizeof(compressed);
    + 	} while (ret == Z_OK);
    + 
    +-	if (ret != Z_STREAM_END)
    +-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
    +-		    ret);
    +-	ret = git_deflate_end_gently(&stream);
    +-	if (ret != Z_OK)
    +-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
    +-		    ret);
    +-	the_hash_algo->final_oid_fn(&parano_oid, &c);
    ++	/* Common steps for write_loose_object and stream_loose_object to
    ++	 * end writing loose oject:
    ++	 *
    ++	 *  - End the compression of zlib stream.
    ++	 *  - Get the calculated oid to "parano_oid".
    ++	 */
    ++	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
    ++				N_("unable to deflate new object %s (%d)"),
    ++				N_("deflateEnd on object %s failed (%d)"));
    ++
    + 	if (!oideq(oid, &parano_oid))
    + 		die(_("confused by unstable object source data for %s"),
    + 		    oid_to_hex(oid));
      
      	close_loose_object(fd);
      
-:  ---------- > 3:  18dd21122d object-file.c: remove the slash for directory_size()
-:  ---------- > 4:  964715451b object-file.c: add "stream_loose_object()" to handle large object
-:  ---------- > 5:  3f620466fe unpack-objects: unpack_non_delta_entry() read data in a stream
2:  0d2e0f3a00 ! 6:  8073a3888d object-file API: add a format_object_header() function
    @@ builtin/index-pack.c: static void *unpack_entry_data(off_t offset, unsigned long
      	if (!is_delta_type(type)) {
     -		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
     -				   type_name(type),(uintmax_t)size) + 1;
    -+		hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)size);
    ++		hdrlen = format_object_header(hdr, sizeof(hdr), type, size);
      		the_hash_algo->init_fn(&c);
      		the_hash_algo->update_fn(&c, hdr, hdrlen);
      	} else
    @@ bulk-checkin.c: static int deflate_to_pack(struct bulk_checkin_state *state,
     -	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
     -			       type_name(type), (uintmax_t)size) + 1;
     +	header_len = format_object_header((char *)obuf, sizeof(obuf),
    -+					 type, (uintmax_t)size);
    ++					 type, size);
      	the_hash_algo->init_fn(&ctx);
      	the_hash_algo->update_fn(&ctx, obuf, header_len);
      
    @@ http-push.c: static void start_put(struct transfer_request *request)
      
      	unpacked = read_object_file(&request->obj->oid, &type, &len);
     -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
    -+	hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)len);
    ++	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
      
      	/* Set it up */
      	git_deflate_init(&stream, zlib_compression_level);
    @@ object-file.c: static void write_object_file_prepare(const struct git_hash_algo
      
      	/* Sha1.. */
      	algo->init_fn(&c);
    +@@ object-file.c: int stream_loose_object(struct input_stream *in_stream, size_t len,
    + 
    + 	/* Since oid is not determined, save tmp file to odb path. */
    + 	strbuf_addf(&filename, "%s/", get_object_directory());
    +-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
    ++	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
    + 
    + 	/* Common steps for write_loose_object and stream_loose_object to
    + 	 * start writing loose oject:
     @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime)
      	buf = read_object(the_repository, oid, &type, &len);
      	if (!buf)
4:  1de06a8f5c < -:  ---------- object-file.c: add "write_stream_object_file()" to support read in stream
5:  e7b4e426ef < -:  ---------- unpack-objects: unpack_non_delta_entry() read data in a stream
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode
  2021-12-17 11:26       ` Han Xin
                           ` (6 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08 12:28           ` René Scharfe
  2022-01-08  8:54         ` [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
                           ` (4 subsequent siblings)
  12 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
size of memory. Allocating memory for a large blob object may cause the
system to run out of memory. Before replacing the calls to
"get_data()" that unpack large blob objects in later commits, refactor
"get_data()" to reduce its memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 39 ++++++++++++++++++-------
 t/t5329-unpack-large-objects.sh | 52 +++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 11 deletions(-)
 create mode 100755 t/t5329-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..c6d6c17072 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,31 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from stdin and return specific size of data.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +140,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
new file mode 100755
index 0000000000..39c7a62d94
--- /dev/null
+++ b/t/t5329-unpack-large-objects.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+assert_no_loose () {
+	glob=dest.git/objects/?? &&
+	echo "$glob" >expect &&
+	eval "echo $glob" >actual &&
+	test_cmp expect actual
+}
+
+assert_no_pack () {
+	rmdir dest.git/objects/pack
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs test)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	assert_no_loose &&
+	assert_no_pack
+'
+
+test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps
  2021-12-17 11:26       ` Han Xin
                           ` (7 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08 12:28           ` René Scharfe
  2022-01-08  8:54         ` [PATCH v8 3/6] object-file.c: remove the slash for directory_size() Han Xin
                           ` (3 subsequent siblings)
  12 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a stream
version function ("stream_loose_object()") in a later commit to resolve
this issue.

Before introducing a stream version function for writing loose objects,
do some refactoring on "write_loose_object()" to reuse code for both
versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 149 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 105 insertions(+), 44 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb1426f98c..5d163081b1 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1743,6 +1743,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	algo->final_oid_fn(oid, &c);
 }
 
+/*
+ * Move the just written object with proper mtime into its final resting place.
+ */
+static int finalize_object_file_with_mtime(const char *tmpfile,
+					   const char *filename,
+					   time_t mtime,
+					   unsigned flags)
+{
+	struct utimbuf utb;
+
+	if (mtime) {
+		utb.actime = mtime;
+		utb.modtime = mtime;
+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
+			warning_errno(_("failed utime() on %s"), tmpfile);
+	}
+	return finalize_object_file(tmpfile, filename);
+}
+
 /*
  * Move the just written object into its final resting place.
  */
@@ -1828,7 +1847,8 @@ static inline int directory_size(const char *filename)
  * We want to avoid cross-directory filename renames, because those
  * can have problems on various filesystems (FAT, NFS, Coda).
  */
-static int create_tmpfile(struct strbuf *tmp, const char *filename)
+static int create_tmpfile(struct strbuf *tmp, const char *filename,
+			  unsigned flags)
 {
 	int fd, dirlen = directory_size(filename);
 
@@ -1836,7 +1856,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	strbuf_add(tmp, filename, dirlen);
 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
-	if (fd < 0 && dirlen && errno == ENOENT) {
+	do {
+		if (fd >= 0 || !dirlen || errno != ENOENT)
+			break;
 		/*
 		 * Make sure the directory exists; note that the contents
 		 * of the buffer are undefined after mkstemp returns an
@@ -1846,17 +1868,72 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 		strbuf_reset(tmp);
 		strbuf_add(tmp, filename, dirlen - 1);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
-			return -1;
+			break;
 		if (adjust_shared_perm(tmp->buf))
-			return -1;
+			break;
 
 		/* Try again */
 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 		fd = git_mkstemp_mode(tmp->buf, 0444);
+	} while (0);
+
+	if (fd < 0 && !(flags & HASH_SILENT)) {
+		if (errno == EACCES)
+			return error(_("insufficient permission for adding an "
+				       "object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(_("unable to create temporary file"));
 	}
+
 	return fd;
 }
 
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     enum object_type type, size_t len,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename, flags);
+	if (fd < 0)
+		return -1;
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+static void end_loose_object_common(int ret, git_hash_ctx *c,
+				    git_zstream *stream,
+				    struct object_id *parano_oid,
+				    const struct object_id *expected_oid,
+				    const char *die_msg1_fmt,
+				    const char *die_msg2_fmt)
+{
+	if (ret != Z_STREAM_END)
+		die(_(die_msg1_fmt), ret, expected_oid);
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		die(_(die_msg2_fmt), ret, expected_oid);
+	the_hash_algo->final_oid_fn(parano_oid, c);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1871,28 +1948,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose oject:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, OBJ_NONE, 0, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1907,30 +1974,24 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-		    ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose oject:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid to "parano_oid".
+	 */
+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
+				N_("unable to deflate new object %s (%d)"),
+				N_("deflateEnd on object %s failed (%d)"));
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
-	if (mtime) {
-		struct utimbuf utb;
-		utb.actime = mtime;
-		utb.modtime = mtime;
-		if (utime(tmp_file.buf, &utb) < 0 &&
-		    !(flags & HASH_SILENT))
-			warning_errno(_("failed utime() on %s"), tmp_file.buf);
-	}
-
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
+					       mtime, flags);
 }
 
 static int freshen_loose_object(const struct object_id *oid)
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v8 3/6] object-file.c: remove the slash for directory_size()
  2021-12-17 11:26       ` Han Xin
                           ` (8 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08 17:24           ` René Scharfe
  2022-01-08  8:54         ` [PATCH v8 4/6] object-file.c: add "stream_loose_object()" to handle large object Han Xin
                           ` (2 subsequent siblings)
  12 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Since "mkdir foo/" works as well as "mkdir foo", let's remove the
trailing slash, which is what most callers of directory_size() want.

Suggested-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index 5d163081b1..4f0127e823 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1831,13 +1831,13 @@ static void close_loose_object(int fd)
 		die_errno(_("error when closing loose object file"));
 }
 
-/* Size of directory component, including the ending '/' */
+/* Size of directory component, excluding the ending '/' */
 static inline int directory_size(const char *filename)
 {
 	const char *s = strrchr(filename, '/');
 	if (!s)
 		return 0;
-	return s - filename + 1;
+	return s - filename;
 }
 
 /*
@@ -1854,7 +1854,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
 
 	strbuf_reset(tmp);
 	strbuf_add(tmp, filename, dirlen);
-	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
+	strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
 	do {
 		if (fd >= 0 || !dirlen || errno != ENOENT)
@@ -1866,7 +1866,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
 		 * scratch.
 		 */
 		strbuf_reset(tmp);
-		strbuf_add(tmp, filename, dirlen - 1);
+		strbuf_add(tmp, filename, dirlen);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
 			break;
 		if (adjust_shared_perm(tmp->buf))
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v8 4/6] object-file.c: add "stream_loose_object()" to handle large object
  2021-12-17 11:26       ` Han Xin
                           ` (9 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 3/6] object-file.c: remove the slash for directory_size() Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08  8:54         ` [PATCH v8 5/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2022-01-08  8:54         ` [PATCH v8 6/6] object-file API: add a format_object_header() function Han Xin
  12 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using "write_loose_object",
we have to feed it with a buffer with the same size as the object, which
will consume lots of memory and may cause OOM. This can be improved by
feeding data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", which is a stream version of
"write_loose_object()" but with a low memory footprint. We will use this
function to unpack large blob objects in a later commit.

Another difference with "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in ".git/objects/"
directory instead.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   9 +++++
 2 files changed, 110 insertions(+)

diff --git a/object-file.c b/object-file.c
index 4f0127e823..a462a21629 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2012,6 +2012,107 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose oject:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, OBJ_BLOB, len, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose oject:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	end_loose_object_common(ret, &c, &stream, oid, NULL,
+				N_("unable to stream deflate new object (%d)"),
+				N_("deflateEnd on stream object failed (%d)"));
+
+	close_loose_object(fd);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..cc41c64d69 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -232,6 +238,9 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v8 5/6] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-17 11:26       ` Han Xin
                           ` (10 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 4/6] object-file.c: add "stream_loose_object()" to handle large object Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08  8:54         ` [PATCH v8 6/6] object-file API: add a format_object_header() function Han Xin
  12 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()". However, unpacking
non-delta objects from a stream instead of from an entire buffer will
incur about a 10% performance penalty.

    $ hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare
       https://github.com/microsoft/scalar.git;
       cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      ...

    Summary
      './git -C dest.git -c core.bigFileThreshold=512m
      unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~1'

Therefore, only unpack objects larger than the "core.bigFileThreshold"
in zstream. Until now, the config variable has been used in the
following cases, and our new case belongs to the packfile category.

 * Archive:

   + archive.c: write_entry(): write large blob entries to archive in
     stream.

 * Loose objects:

   + object-file.c: index_fd(): when hashing large files in worktree,
     read files in a stream, and create one packfile per large blob if
     want to save files to git object store.

   + object-file.c: read_loose_object(): when checking loose objects
     using "git-fsck", do not read full content of large loose objects.

 * Packfile:

   + fast-import.c: parse_and_store_blob(): streaming large blob from
     foreign source to packfile.

   + index-pack.c: check_collison(): read and check large blob in stream.

   + index-pack.c: unpack_entry_data(): do not return the entire
     contents of the big blob from packfile, but uses a fixed buf to
     perform some integrity checks on the object.

   + pack-check.c: verify_packfile(): used by "git-fsck" and will call
     check_object_signature() to check large blob in pack with the
     streaming interface.

   + pack-objects.c: get_object_details(): set "no_try_delta" for large
     blobs when counting objects.

   + pack-objects.c: write_no_reuse_object(): streaming large blob to
     pack.

   + unpack-objects.c: unpack_non_delta_entry(): unpack large blob in
     stream from packfile.

 * Others:

   + diff.c: diff_populate_filespec(): treat large blob file as binary.

   + streaming.c: istream_source(): as a helper of "open_istream()" to
     select proper streaming interface to read large blob from packfile.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 71 ++++++++++++++++++++++++++++++++-
 t/t5329-unpack-large-objects.sh | 23 +++++++++--
 2 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index c6d6c17072..e9ec2b349d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -343,11 +343,80 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (buf)
 		write_object(nr, type, buf, size);
 }
diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
index 39c7a62d94..6f3bfb3df7 100755
--- a/t/t5329-unpack-large-objects.sh
+++ b/t/t5329-unpack-large-objects.sh
@@ -9,7 +9,11 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 assert_no_loose () {
@@ -37,16 +41,29 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
 	assert_no_loose &&
 	assert_no_pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	assert_no_pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	assert_no_loose
+'
+
 test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v8 6/6] object-file API: add a format_object_header() function
  2021-12-17 11:26       ` Han Xin
                           ` (11 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 5/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2022-01-08  8:54         ` Han Xin
  12 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Add a convenience function to wrap the xsnprintf() command that
generates loose object headers. This code was copy/pasted in various
parts of the codebase, let's define it in one place and re-use it from
there.

All except one caller of it had a valid "enum object_type" for us,
it's only write_object_file_prepare() which might need to deal with
"git hash-object --literally" and a potential garbage type. Let's have
the primary API use an "enum object_type", and define an *_extended()
function that can take an arbitrary "const char *" for the type.

See [1] for the discussion that prompted this patch, i.e. new code in
object-file.c that wanted to copy/paste the xsnprintf() invocation.

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/index-pack.c |  3 +--
 bulk-checkin.c       |  4 ++--
 cache.h              | 21 +++++++++++++++++++++
 http-push.c          |  2 +-
 object-file.c        | 16 ++++++++++++----
 5 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c23d01de7d..8a6ce77940 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
 	int hdrlen;
 
 	if (!is_delta_type(type)) {
-		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
-				   type_name(type),(uintmax_t)size) + 1;
+		hdrlen = format_object_header(hdr, sizeof(hdr), type, size);
 		the_hash_algo->init_fn(&c);
 		the_hash_algo->update_fn(&c, hdr, hdrlen);
 	} else
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 8785b2ac80..9e685f0f1a 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
-			       type_name(type), (uintmax_t)size) + 1;
+	header_len = format_object_header((char *)obuf, sizeof(obuf),
+					 type, size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 
diff --git a/cache.h b/cache.h
index cfba463aa9..64071a8d80 100644
--- a/cache.h
+++ b/cache.h
@@ -1310,6 +1310,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 						    unsigned long bufsiz,
 						    struct strbuf *hdrbuf);
 
+/**
+ * format_object_header() is a thin wrapper around s xsnprintf() that
+ * writes the initial "<type> <obj-len>" part of the loose object
+ * header. It returns the size that snprintf() returns + 1.
+ *
+ * The format_object_header_extended() function allows for writing a
+ * type_name that's not one of the "enum object_type" types. This is
+ * used for "git hash-object --literally". Pass in a OBJ_NONE as the
+ * type, and a non-NULL "type_str" to do that.
+ *
+ * format_object_header() is a convenience wrapper for
+ * format_object_header_extended().
+ */
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *type_str, size_t objsize);
+static inline int format_object_header(char *str, size_t size,
+				      enum object_type type, size_t objsize)
+{
+	return format_object_header_extended(str, size, type, NULL, objsize);
+}
+
 /**
  * parse_loose_header() parses the starting "<type> <len>\0" of an
  * object. If it doesn't follow that format -1 is returned. To check
diff --git a/http-push.c b/http-push.c
index 3309aaf004..f0c044dcf7 100644
--- a/http-push.c
+++ b/http-push.c
@@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
 	git_zstream stream;
 
 	unpacked = read_object_file(&request->obj->oid, &type, &len);
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 
 	/* Set it up */
 	git_deflate_init(&stream, zlib_compression_level);
diff --git a/object-file.c b/object-file.c
index a462a21629..d384ef2952 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *typestr, size_t objsize)
+{
+	const char *s = type == OBJ_NONE ? typestr : type_name(type);
+
+	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = format_object_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2027,7 +2035,7 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
 
 	/* Since oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
 
 	/* Common steps for write_loose_object and stream_loose_object to
 	 * start writing loose oject:
@@ -2168,7 +2176,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-01-08  8:54         ` [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
@ 2022-01-08 12:28           ` René Scharfe
  2022-01-11 10:41             ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: René Scharfe @ 2022-01-08 12:28 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

 Am 08.01.22 um 09:54 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> As the name implies, "get_data(size)" will allocate and return a given
> size of memory. Allocating memory for a large blob object may cause the
> system to run out of memory. Before preparing to replace calling of
> "get_data()" to unpack large blob objects in latter commits, refactor
> "get_data()" to reduce memory footprint for dry_run mode.
>
> Because in dry_run mode, "get_data()" is only used to check the
> integrity of data, and the returned buffer is not used at all, we can
> allocate a smaller buffer and reuse it as zstream output. Therefore,
> in dry_run mode, "get_data()" will release the allocated buffer and
> return NULL instead of returning garbage data.
>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c        | 39 ++++++++++++++++++-------
>  t/t5329-unpack-large-objects.sh | 52 +++++++++++++++++++++++++++++++++
>  2 files changed, 80 insertions(+), 11 deletions(-)
>  create mode 100755 t/t5329-unpack-large-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..c6d6c17072 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,31 @@ static void use(int bytes)
>  	display_throughput(progress, consumed_bytes);
>  }
>
> +/*
> + * Decompress zstream from stdin and return specific size of data.
> + * The caller is responsible to free the returned buffer.
> + *
> + * But for dry_run mode, "get_data()" is only used to check the
> + * integrity of data, and the returned buffer is not used at all.
> + * Therefore, in dry_run mode, "get_data()" will release the small
> + * allocated buffer which is reused to hold temporary zstream output
> + * and return NULL instead of returning garbage data.
> + */
>  static void *get_data(unsigned long size)
>  {
>  	git_zstream stream;
> -	void *buf = xmallocz(size);
> +	unsigned long bufsize;
> +	void *buf;
>
>  	memset(&stream, 0, sizeof(stream));
> +	if (dry_run && size > 8192)
> +		bufsize = 8192;
> +	else
> +		bufsize = size;
> +	buf = xmallocz(bufsize);
>
>  	stream.next_out = buf;
> -	stream.avail_out = size;
> +	stream.avail_out = bufsize;
>  	stream.next_in = fill(1);
>  	stream.avail_in = len;
>  	git_inflate_init(&stream);
> @@ -124,8 +140,15 @@ static void *get_data(unsigned long size)
>  		}
>  		stream.next_in = fill(1);
>  		stream.avail_in = len;
> +		if (dry_run) {
> +			/* reuse the buffer in dry_run mode */
> +			stream.next_out = buf;
> +			stream.avail_out = bufsize;
> +		}
>  	}
>  	git_inflate_end(&stream);
> +	if (dry_run)
> +		FREE_AND_NULL(buf);
>  	return buf;
>  }
>
> @@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>  {
>  	void *buf = get_data(size);
>
> -	if (!dry_run && buf)
> +	if (buf)
>  		write_object(nr, type, buf, size);
> -	else
> -		free(buf);
>  }
>
>  static int resolve_against_held(unsigned nr, const struct object_id *base,
> @@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  		oidread(&base_oid, fill(the_hash_algo->rawsz));
>  		use(the_hash_algo->rawsz);
>  		delta_data = get_data(delta_size);
> -		if (dry_run || !delta_data) {
> -			free(delta_data);
> +		if (!delta_data)
>  			return;
> -		}
>  		if (has_object_file(&base_oid))
>  			; /* Ok we have this one */
>  		else if (resolve_against_held(nr, &base_oid,
> @@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  			die("offset value out of bound for delta base object");
>
>  		delta_data = get_data(delta_size);
> -		if (dry_run || !delta_data) {
> -			free(delta_data);
> +		if (!delta_data)
>  			return;
> -		}
>  		lo = 0;
>  		hi = nr;
>  		while (lo < hi) {

Nice!

> diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
> new file mode 100755
> index 0000000000..39c7a62d94
> --- /dev/null
> +++ b/t/t5329-unpack-large-objects.sh
> @@ -0,0 +1,52 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='git unpack-objects with large objects'
> +
> +. ./test-lib.sh
> +
> +prepare_dest () {
> +	test_when_finished "rm -rf dest.git" &&
> +	git init --bare dest.git
> +}
> +
> +assert_no_loose () {
> +	glob=dest.git/objects/?? &&
> +	echo "$glob" >expect &&
> +	eval "echo $glob" >actual &&
> +	test_cmp expect actual
> +}
> +
> +assert_no_pack () {
> +	rmdir dest.git/objects/pack

I would expect a function whose name starts with "assert" to have no
side effects.  It doesn't matter here, because it's called only at the
very end, but that might change.  You can use test_dir_is_empty instead
of rmdir.

> +}
> +
> +test_expect_success "create large objects (1.5 MB) and PACK" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	PACK=$(echo HEAD | git pack-objects --revs test)
> +'
> +
> +test_expect_success 'set memory limitation to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'unpack-objects failed under memory limitation' '
> +	prepare_dest &&
> +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> +	grep "fatal: attempting to allocate" err
> +'
> +
> +test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
> +	prepare_dest &&
> +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
> +	assert_no_loose &&
> +	assert_no_pack
> +'
> +
> +test_done

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps
  2022-01-08  8:54         ` [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
@ 2022-01-08 12:28           ` René Scharfe
  2022-01-11 10:33             ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: René Scharfe @ 2022-01-08 12:28 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 08.01.22 um 09:54 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When writing a large blob using "write_loose_object()", we have to pass
> a buffer with the whole content of the blob, and this behavior will
> consume lots of memory and may cause OOM. We will introduce a stream
> version function ("stream_loose_object()") in latter commit to resolve
> this issue.
>
> Before introducing a stream version function for writing loose object,
> do some refactoring on "write_loose_object()" to reuse code for both
> versions.
>
> Rewrite "write_loose_object()" as follows:
>
>  1. Figure out a path for the (temp) object file. This step is only
>     used in "write_loose_object()".
>
>  2. Move common steps for starting to write loose objects into a new
>     function "start_loose_object_common()".
>
>  3. Compress data.
>
>  4. Move common steps for ending zlib stream into a new function
>     "end_loose_object_common()".
>
>  5. Close fd and finalize the object file.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 149 +++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 105 insertions(+), 44 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb1426f98c..5d163081b1 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1743,6 +1743,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
>  	algo->final_oid_fn(oid, &c);
>  }
>
> +/*
> + * Move the just written object with proper mtime into its final resting place.
> + */
> +static int finalize_object_file_with_mtime(const char *tmpfile,
> +					   const char *filename,
> +					   time_t mtime,
> +					   unsigned flags)

This function is called only once after your series.  Should it be used by
stream_loose_object()?  Probably not -- the latter doesn't have a way to
force a certain modification time and its caller doesn't need one.  So
creating finalize_object_file_with_mtime() seems unnecessary for this
series.

> +{
> +	struct utimbuf utb;
> +
> +	if (mtime) {
> +		utb.actime = mtime;
> +		utb.modtime = mtime;
> +		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
> +			warning_errno(_("failed utime() on %s"), tmpfile);
> +	}
> +	return finalize_object_file(tmpfile, filename);
> +}
> +
>  /*
>   * Move the just written object into its final resting place.
>   */
> @@ -1828,7 +1847,8 @@ static inline int directory_size(const char *filename)
>   * We want to avoid cross-directory filename renames, because those
>   * can have problems on various filesystems (FAT, NFS, Coda).
>   */
> -static int create_tmpfile(struct strbuf *tmp, const char *filename)
> +static int create_tmpfile(struct strbuf *tmp, const char *filename,
> +			  unsigned flags)

create_tmpfile() is not mentioned in the commit message, yet it's
changed here.  Hrm.

>  {
>  	int fd, dirlen = directory_size(filename);
>
> @@ -1836,7 +1856,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	strbuf_add(tmp, filename, dirlen);
>  	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
>  	fd = git_mkstemp_mode(tmp->buf, 0444);
> -	if (fd < 0 && dirlen && errno == ENOENT) {
> +	do {
> +		if (fd >= 0 || !dirlen || errno != ENOENT)
> +			break;

Why turn this branch into a loop?  Is this done to mkdir multiple
components, e.g. with filename being "a/b/c/file" to create "a", "a/b",
and "a/b/c"?  It's only used for loose objects, so a fan-out directory
(e.g. ".git/objects/ff") can certainly be missing, but can their parent
be missing as well sometimes?  If that's the point then such a fix
would be worth its own patch.  (Which probably would benefit from using
safe_create_leading_directories()).

>  		/*
>  		 * Make sure the directory exists; note that the contents
>  		 * of the buffer are undefined after mkstemp returns an
> @@ -1846,17 +1868,72 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  		strbuf_reset(tmp);
>  		strbuf_add(tmp, filename, dirlen - 1);
>  		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> -			return -1;
> +			break;
>  		if (adjust_shared_perm(tmp->buf))
> -			return -1;
> +			break;

Or is it just to replace these returns with a jump to the new error
reporting section?

>
>  		/* Try again */
>  		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
>  		fd = git_mkstemp_mode(tmp->buf, 0444);

In that case a break would be missing here.

> +	} while (0);
> +
> +	if (fd < 0 && !(flags & HASH_SILENT)) {
> +		if (errno == EACCES)
> +			return error(_("insufficient permission for adding an "
> +				       "object to repository database %s"),
> +				     get_object_directory());
> +		else
> +			return error_errno(_("unable to create temporary file"));
>  	}

Why move this error reporting code into create_tmpfile()?  This function
has a single caller both before and after your series, so the code could
just as well stay at its call-site, avoiding the need to add the flags
parameter.

> +
>  	return fd;
>  }
>
> +static int start_loose_object_common(struct strbuf *tmp_file,
> +				     const char *filename, unsigned flags,
> +				     git_zstream *stream,
> +				     unsigned char *buf, size_t buflen,
> +				     git_hash_ctx *c,
> +				     enum object_type type, size_t len,

The parameters type and len are not used by this function and thus can
be dropped.

> +				     char *hdr, int hdrlen)
> +{
> +	int fd;
> +
> +	fd = create_tmpfile(tmp_file, filename, flags);
> +	if (fd < 0)
> +		return -1;
> +
> +	/*  Setup zlib stream for compression */
> +	git_deflate_init(stream, zlib_compression_level);
> +	stream->next_out = buf;
> +	stream->avail_out = buflen;
> +	the_hash_algo->init_fn(c);
> +
> +	/*  Start to feed header to zlib stream */
> +	stream->next_in = (unsigned char *)hdr;
> +	stream->avail_in = hdrlen;
> +	while (git_deflate(stream, 0) == Z_OK)
> +		; /* nothing */
> +	the_hash_algo->update_fn(c, hdr, hdrlen);
> +
> +	return fd;
> +}
> +
> +static void end_loose_object_common(int ret, git_hash_ctx *c,
> +				    git_zstream *stream,
> +				    struct object_id *parano_oid,
> +				    const struct object_id *expected_oid,
> +				    const char *die_msg1_fmt,
> +				    const char *die_msg2_fmt)

Hmm, the signature needs as many lines as the function body.

> +{
> +	if (ret != Z_STREAM_END)
> +		die(_(die_msg1_fmt), ret, expected_oid);
> +	ret = git_deflate_end_gently(stream);
> +	if (ret != Z_OK)
> +		die(_(die_msg2_fmt), ret, expected_oid);

These format strings cannot be checked by the compiler.

Considering those two together I think I'd either unify the error
messages and move their strings here (losing the ability for users
to see if streaming was used) or not extract the function and
duplicate its few shared lines.  Just a feeling, though.

> +	the_hash_algo->final_oid_fn(parano_oid, c);
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>  			      int hdrlen, const void *buf, unsigned long len,
>  			      time_t mtime, unsigned flags)
> @@ -1871,28 +1948,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>  	loose_object_path(the_repository, &filename, oid);
>
> -	fd = create_tmpfile(&tmp_file, filename.buf);
> -	if (fd < 0) {
> -		if (flags & HASH_SILENT)
> -			return -1;
> -		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> -		else
> -			return error_errno(_("unable to create temporary file"));
> -	}
> -
> -	/* Set it up */
> -	git_deflate_init(&stream, zlib_compression_level);
> -	stream.next_out = compressed;
> -	stream.avail_out = sizeof(compressed);
> -	the_hash_algo->init_fn(&c);
> -
> -	/* First header.. */
> -	stream.next_in = (unsigned char *)hdr;
> -	stream.avail_in = hdrlen;
> -	while (git_deflate(&stream, 0) == Z_OK)
> -		; /* nothing */
> -	the_hash_algo->update_fn(&c, hdr, hdrlen);
> +	/* Common steps for write_loose_object and stream_loose_object to
> +	 * start writing loose oject:
> +	 *
> +	 *  - Create tmpfile for the loose object.
> +	 *  - Setup zlib stream for compression.
> +	 *  - Start to feed header to zlib stream.
> +	 */
> +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
> +				       &stream, compressed, sizeof(compressed),
> +				       &c, OBJ_NONE, 0, hdr, hdrlen);
> +	if (fd < 0)
> +		return -1;
>
>  	/* Then the data itself.. */
>  	stream.next_in = (void *)buf;
> @@ -1907,30 +1974,24 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		stream.avail_out = sizeof(compressed);
>  	} while (ret == Z_OK);
>
> -	if (ret != Z_STREAM_END)
> -		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> -		    ret);
> -	ret = git_deflate_end_gently(&stream);
> -	if (ret != Z_OK)
> -		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> -		    ret);
> -	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +	/* Common steps for write_loose_object and stream_loose_object to
> +	 * end writing loose oject:
> +	 *
> +	 *  - End the compression of zlib stream.
> +	 *  - Get the calculated oid to "parano_oid".
> +	 */
> +	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
> +				N_("unable to deflate new object %s (%d)"),
> +				N_("deflateEnd on object %s failed (%d)"));
> +
>  	if (!oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>
>  	close_loose_object(fd);
>
> -	if (mtime) {
> -		struct utimbuf utb;
> -		utb.actime = mtime;
> -		utb.modtime = mtime;
> -		if (utime(tmp_file.buf, &utb) < 0 &&
> -		    !(flags & HASH_SILENT))
> -			warning_errno(_("failed utime() on %s"), tmp_file.buf);
> -	}
> -
> -	return finalize_object_file(tmp_file.buf, filename.buf);
> +	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
> +					       mtime, flags);
>  }
>
>  static int freshen_loose_object(const struct object_id *oid)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v8 3/6] object-file.c: remove the slash for directory_size()
  2022-01-08  8:54         ` [PATCH v8 3/6] object-file.c: remove the slash for directory_size() Han Xin
@ 2022-01-08 17:24           ` René Scharfe
  2022-01-11 10:14             ` Han Xin
  0 siblings, 1 reply; 165+ messages in thread
From: René Scharfe @ 2022-01-08 17:24 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 08.01.22 um 09:54 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Since "mkdir foo/" works as well as "mkdir foo", let's remove the end
> slash as many users of it want.
>
> Suggested-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 5d163081b1..4f0127e823 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1831,13 +1831,13 @@ static void close_loose_object(int fd)
>  		die_errno(_("error when closing loose object file"));
>  }
>
> -/* Size of directory component, including the ending '/' */
> +/* Size of directory component, excluding the ending '/' */
>  static inline int directory_size(const char *filename)
>  {
>  	const char *s = strrchr(filename, '/');
>  	if (!s)
>  		return 0;
> -	return s - filename + 1;
> +	return s - filename;

This will return zero both for "filename" and "/filename".  Hmm.  Since
it's only used for loose object files we can assume that at least one
slash is present, so this removal of functionality is not actually a
problem.  But I don't understand its benefit.

>  }
>
>  /*
> @@ -1854,7 +1854,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
>
>  	strbuf_reset(tmp);
>  	strbuf_add(tmp, filename, dirlen);
> -	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
> +	strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
>  	fd = git_mkstemp_mode(tmp->buf, 0444);
>  	do {
>  		if (fd >= 0 || !dirlen || errno != ENOENT)
> @@ -1866,7 +1866,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
>  		 * scratch.
>  		 */
>  		strbuf_reset(tmp);
> -		strbuf_add(tmp, filename, dirlen - 1);
> +		strbuf_add(tmp, filename, dirlen);
>  		if (mkdir(tmp->buf, 0777) && errno != EEXIST)

This code makes sure that mkdir(2) is called without the trailing slash,
both with or without this patch.  From the commit message above I
somehow expected a change in this regard -- but again I wouldn't
understand its benefit.

Is this change really needed?  Is streaming unpack not possible with the
original directory_size() function?

>  			break;
>  		if (adjust_shared_perm(tmp->buf))

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v8 3/6] object-file.c: remove the slash for directory_size()
  2022-01-08 17:24           ` René Scharfe
@ 2022-01-11 10:14             ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-11 10:14 UTC (permalink / raw)
  To: René Scharfe
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee, Han Xin

On Sun, Jan 9, 2022 at 1:24 AM René Scharfe <l.s.r@web.de> wrote:
>
> Am 08.01.22 um 09:54 schrieb Han Xin:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > Since "mkdir foo/" works as well as "mkdir foo", let's remove the end
> > slash as many users of it want.
> >
> > Suggested-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 8 ++++----
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 5d163081b1..4f0127e823 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1831,13 +1831,13 @@ static void close_loose_object(int fd)
> >               die_errno(_("error when closing loose object file"));
> >  }
> >
> > -/* Size of directory component, including the ending '/' */
> > +/* Size of directory component, excluding the ending '/' */
> >  static inline int directory_size(const char *filename)
> >  {
> >       const char *s = strrchr(filename, '/');
> >       if (!s)
> >               return 0;
> > -     return s - filename + 1;
> > +     return s - filename;
>
> This will return zero both for "filename" and "/filename".  Hmm.  Since
> it's only used for loose object files we can assume that at least one
> slash is present, so this removal of functionality is not actually a
> problem.  But I don't understand its benefit.
>
> >  }
> >
> >  /*
> > @@ -1854,7 +1854,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
> >
> >       strbuf_reset(tmp);
> >       strbuf_add(tmp, filename, dirlen);
> > -     strbuf_addstr(tmp, "tmp_obj_XXXXXX");
> > +     strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
> >       fd = git_mkstemp_mode(tmp->buf, 0444);
> >       do {
> >               if (fd >= 0 || !dirlen || errno != ENOENT)
> > @@ -1866,7 +1866,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
> >                * scratch.
> >                */
> >               strbuf_reset(tmp);
> > -             strbuf_add(tmp, filename, dirlen - 1);
> > +             strbuf_add(tmp, filename, dirlen);
> >               if (mkdir(tmp->buf, 0777) && errno != EEXIST)
>
> This code makes sure that mkdir(2) is called without the trailing slash,
> both with or without this patch.  From the commit message above I
> somehow expected a change in this regard -- but again I wouldn't
> understand its benefit.
>
> Is this change really needed?  Is streaming unpack not possible with the
> original directory_size() function?
>

*nod*
Streaming unpacking still works with the original directory_size().

This patch is more of a code cleanup that reduces the extra handling of
directory size first increasing and then decreasing. I'll seriously consider
if I should remove this patch, or move it to the tail.

Thanks
-Han Xin

> >                       break;
> >               if (adjust_shared_perm(tmp->buf))

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps
  2022-01-08 12:28           ` René Scharfe
@ 2022-01-11 10:33             ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-11 10:33 UTC (permalink / raw)
  To: René Scharfe
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee, Han Xin

On Sat, Jan 8, 2022 at 8:28 PM René Scharfe <l.s.r@web.de> wrote:
>
> Am 08.01.22 um 09:54 schrieb Han Xin:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When writing a large blob using "write_loose_object()", we have to pass
> > a buffer with the whole content of the blob, and this behavior will
> > consume lots of memory and may cause OOM. We will introduce a stream
> > version function ("stream_loose_object()") in latter commit to resolve
> > this issue.
> >
> > Before introducing a stream vesion function for writing loose object,
> > do some refactoring on "write_loose_object()" to reuse code for both
> > versions.
> >
> > Rewrite "write_loose_object()" as follows:
> >
> >  1. Figure out a path for the (temp) object file. This step is only
> >     used in "write_loose_object()".
> >
> >  2. Move common steps for starting to write loose objects into a new
> >     function "start_loose_object_common()".
> >
> >  3. Compress data.
> >
> >  4. Move common steps for ending zlib stream into a new funciton
> >     "end_loose_object_common()".
> >
> >  5. Close fd and finalize the object file.
> >
> > Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 149 +++++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 105 insertions(+), 44 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index eb1426f98c..5d163081b1 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1743,6 +1743,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
> >       algo->final_oid_fn(oid, &c);
> >  }
> >
> > +/*
> > + * Move the just written object with proper mtime into its final resting place.
> > + */
> > +static int finalize_object_file_with_mtime(const char *tmpfile,
> > +                                        const char *filename,
> > +                                        time_t mtime,
> > +                                        unsigned flags)
>
> This function is called only once after your series.  Should it be used by
> stream_loose_object()?  Probably not -- the latter doesn't have a way to
> force a certain modification time and its caller doesn't need one.  So
> creating finalize_object_file_with_mtime() seems unnecessary for this
> series.
>

After accepting the suggestion by Ævar Arnfjörð Bjarmason[1] to remove
finalize_object_file_with_mtime() from stream_loose_object(), it now seems
to be overkill for write_loose_object(). I'll fold it back into
write_loose_object().

1. https://lore.kernel.org/git/211221.86pmpqq9aj.gmgdl@evledraar.gmail.com/

Thanks
-Han Xin

> > +{
> > +     struct utimbuf utb;
> > +
> > +     if (mtime) {
> > +             utb.actime = mtime;
> > +             utb.modtime = mtime;
> > +             if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
> > +                     warning_errno(_("failed utime() on %s"), tmpfile);
> > +     }
> > +     return finalize_object_file(tmpfile, filename);
> > +}
> > +
> >  /*
> >   * Move the just written object into its final resting place.
> >   */
> > @@ -1828,7 +1847,8 @@ static inline int directory_size(const char *filename)
> >   * We want to avoid cross-directory filename renames, because those
> >   * can have problems on various filesystems (FAT, NFS, Coda).
> >   */
> > -static int create_tmpfile(struct strbuf *tmp, const char *filename)
> > +static int create_tmpfile(struct strbuf *tmp, const char *filename,
> > +                       unsigned flags)
>
> create_tmpfile() is not mentioned in the commit message, yet it's
> changed here.  Hrm.
>
> >  {
> >       int fd, dirlen = directory_size(filename);
> >
> > @@ -1836,7 +1856,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       strbuf_add(tmp, filename, dirlen);
> >       strbuf_addstr(tmp, "tmp_obj_XXXXXX");
> >       fd = git_mkstemp_mode(tmp->buf, 0444);
> > -     if (fd < 0 && dirlen && errno == ENOENT) {
> > +     do {
> > +             if (fd >= 0 || !dirlen || errno != ENOENT)
> > +                     break;
>
> Why turn this branch into a loop?  Is this done to mkdir multiple
> components, e.g. with filename being "a/b/c/file" to create "a", "a/b",
> and "a/b/c"?  It's only used for loose objects, so a fan-out directory
> (e.g. ".git/objects/ff") can certainly be missing, but can their parent
> be missing as well sometimes?  If that's the point then such a fix
> would be worth its own patch.  (Which probably would benefit from using
> safe_create_leading_directories()).
>
> >               /*
> >                * Make sure the directory exists; note that the contents
> >                * of the buffer are undefined after mkstemp returns an
> > @@ -1846,17 +1868,72 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >               strbuf_reset(tmp);
> >               strbuf_add(tmp, filename, dirlen - 1);
> >               if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> > -                     return -1;
> > +                     break;
> >               if (adjust_shared_perm(tmp->buf))
> > -                     return -1;
> > +                     break;
>
> Or is it just to replace these returns with a jump to the new error
> reporting section?
>
> >
> >               /* Try again */
> >               strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
> >               fd = git_mkstemp_mode(tmp->buf, 0444);
>
> In that case a break would be missing here.
>
> > +     } while (0);
> > +
> > +     if (fd < 0 && !(flags & HASH_SILENT)) {
> > +             if (errno == EACCES)
> > +                     return error(_("insufficient permission for adding an "
> > +                                    "object to repository database %s"),
> > +                                  get_object_directory());
> > +             else
> > +                     return error_errno(_("unable to create temporary file"));
> >       }
>
> Why move this error reporting code into create_tmpfile()?  This function
> has a single caller both before and after your series, so the code could
> just as well stay at its call-site, avoiding the need to add the flags
> parameter.
>

This is a leftover from v7: there is no longer a step called "Figure out a
path for the (temp) object file", and this error reporting is only used in
start_loose_object_common(). I will restore it to the way it was.

Thanks
-Han Xin
> > +
> >       return fd;
> >  }
> >
> > +static int start_loose_object_common(struct strbuf *tmp_file,
> > +                                  const char *filename, unsigned flags,
> > +                                  git_zstream *stream,
> > +                                  unsigned char *buf, size_t buflen,
> > +                                  git_hash_ctx *c,
> > +                                  enum object_type type, size_t len,
>
> The parameters type and len are not used by this function and thus can
> be dropped.
>

*nod*

> > +                                  char *hdr, int hdrlen)
> > +{
> > +     int fd;
> > +
> > +     fd = create_tmpfile(tmp_file, filename, flags);
> > +     if (fd < 0)
> > +             return -1;
> > +
> > +     /*  Setup zlib stream for compression */
> > +     git_deflate_init(stream, zlib_compression_level);
> > +     stream->next_out = buf;
> > +     stream->avail_out = buflen;
> > +     the_hash_algo->init_fn(c);
> > +
> > +     /*  Start to feed header to zlib stream */
> > +     stream->next_in = (unsigned char *)hdr;
> > +     stream->avail_in = hdrlen;
> > +     while (git_deflate(stream, 0) == Z_OK)
> > +             ; /* nothing */
> > +     the_hash_algo->update_fn(c, hdr, hdrlen);
> > +
> > +     return fd;
> > +}
> > +
> > +static void end_loose_object_common(int ret, git_hash_ctx *c,
> > +                                 git_zstream *stream,
> > +                                 struct object_id *parano_oid,
> > +                                 const struct object_id *expected_oid,
> > +                                 const char *die_msg1_fmt,
> > +                                 const char *die_msg2_fmt)
>
> Hmm, the signature needs as many lines as the function body.
>
> > +{
> > +     if (ret != Z_STREAM_END)
> > +             die(_(die_msg1_fmt), ret, expected_oid);
> > +     ret = git_deflate_end_gently(stream);
> > +     if (ret != Z_OK)
> > +             die(_(die_msg2_fmt), ret, expected_oid);
>
> These format strings cannot be checked by the compiler.
>
> Considering those two together I think I'd either unify the error
> messages and move their strings here (losing the ability for users
> to see if streaming was used) or not extract the function and
> duplicate its few shared lines.  Just a feeling, though.
>
> > +     the_hash_algo->final_oid_fn(parano_oid, c);
> > +}
> > +
> >  static int write_loose_object(const struct object_id *oid, char *hdr,
> >                             int hdrlen, const void *buf, unsigned long len,
> >                             time_t mtime, unsigned flags)
> > @@ -1871,28 +1948,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >
> >       loose_object_path(the_repository, &filename, oid);
> >
> > -     fd = create_tmpfile(&tmp_file, filename.buf);
> > -     if (fd < 0) {
> > -             if (flags & HASH_SILENT)
> > -                     return -1;
> > -             else if (errno == EACCES)
> > -                     return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> > -             else
> > -                     return error_errno(_("unable to create temporary file"));
> > -     }
> > -
> > -     /* Set it up */
> > -     git_deflate_init(&stream, zlib_compression_level);
> > -     stream.next_out = compressed;
> > -     stream.avail_out = sizeof(compressed);
> > -     the_hash_algo->init_fn(&c);
> > -
> > -     /* First header.. */
> > -     stream.next_in = (unsigned char *)hdr;
> > -     stream.avail_in = hdrlen;
> > -     while (git_deflate(&stream, 0) == Z_OK)
> > -             ; /* nothing */
> > -     the_hash_algo->update_fn(&c, hdr, hdrlen);
> > +     /* Common steps for write_loose_object and stream_loose_object to
> > +      * start writing loose oject:
> > +      *
> > +      *  - Create tmpfile for the loose object.
> > +      *  - Setup zlib stream for compression.
> > +      *  - Start to feed header to zlib stream.
> > +      */
> > +     fd = start_loose_object_common(&tmp_file, filename.buf, flags,
> > +                                    &stream, compressed, sizeof(compressed),
> > +                                    &c, OBJ_NONE, 0, hdr, hdrlen);
> > +     if (fd < 0)
> > +             return -1;
> >
> >       /* Then the data itself.. */
> >       stream.next_in = (void *)buf;
> > @@ -1907,30 +1974,24 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >               stream.avail_out = sizeof(compressed);
> >       } while (ret == Z_OK);
> >
> > -     if (ret != Z_STREAM_END)
> > -             die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> > -                 ret);
> > -     ret = git_deflate_end_gently(&stream);
> > -     if (ret != Z_OK)
> > -             die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> > -                 ret);
> > -     the_hash_algo->final_oid_fn(&parano_oid, &c);
> > +     /* Common steps for write_loose_object and stream_loose_object to
> > +      * end writing loose oject:
> > +      *
> > +      *  - End the compression of zlib stream.
> > +      *  - Get the calculated oid to "parano_oid".
> > +      */
> > +     end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
> > +                             N_("unable to deflate new object %s (%d)"),
> > +                             N_("deflateEnd on object %s failed (%d)"));
> > +
> >       if (!oideq(oid, &parano_oid))
> >               die(_("confused by unstable object source data for %s"),
> >                   oid_to_hex(oid));
> >
> >       close_loose_object(fd);
> >
> > -     if (mtime) {
> > -             struct utimbuf utb;
> > -             utb.actime = mtime;
> > -             utb.modtime = mtime;
> > -             if (utime(tmp_file.buf, &utb) < 0 &&
> > -                 !(flags & HASH_SILENT))
> > -                     warning_errno(_("failed utime() on %s"), tmp_file.buf);
> > -     }
> > -
> > -     return finalize_object_file(tmp_file.buf, filename.buf);
> > +     return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
> > +                                            mtime, flags);
> >  }
> >
> >  static int freshen_loose_object(const struct object_id *oid)

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-01-08 12:28           ` René Scharfe
@ 2022-01-11 10:41             ` Han Xin
  0 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-11 10:41 UTC (permalink / raw)
  To: René Scharfe
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee, Han Xin

On Sat, Jan 8, 2022 at 8:28 PM René Scharfe <l.s.r@web.de> wrote:
>
>  Am 08.01.22 um 09:54 schrieb Han Xin:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > +assert_no_loose () {
> > +     glob=dest.git/objects/?? &&
> > +     echo "$glob" >expect &&
> > +     eval "echo $glob" >actual &&
> > +     test_cmp expect actual
> > +}
> > +
> > +assert_no_pack () {
> > +     rmdir dest.git/objects/pack
>
> I would expect a function whose name starts with "assert" to have no
> side effects.  It doesn't matter here, because it's called only at the
> very end, but that might change.  You can use test_dir_is_empty instead
> of rmdir.
>

*nod*
I think it would be better to rename "assert_no_loose()" to "test_no_loose()".
I will remove "assert_no_pack()" and use "test_dir_is_empty()" instead.

Thanks
-Han Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v9 0/5] unpack large blobs in stream
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-02-01 21:24             ` Ævar Arnfjörð Bjarmason
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
  2022-01-20 11:21           ` [PATCH v9 1/5] " Han Xin
                             ` (4 subsequent siblings)
  5 siblings, 2 replies; 165+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v8:
* Rename "assert_no_loose ()" into "test_no_loose ()" in
  "t5329-unpack-large-objects.sh". Remove "assert_no_pack ()" and use
  "test_dir_is_empty" instead.

* Revert changes to "create_tmpfile()" and error handling is now in
  "start_loose_object_common()".

* Remove "finalize_object_file_with_mtime()" which seems to be an overkill
  for "write_loose_object()" now. 

* Remove the commit "object-file.c: remove the slash for directory_size()",
  it can be in a separate patch if necessary.

Han Xin (4):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: unpack_non_delta_entry() read data in a stream

Ævar Arnfjörð Bjarmason (1):
  object-file API: add a format_object_header() function

 builtin/index-pack.c            |   3 +-
 builtin/unpack-objects.c        | 110 ++++++++++++++--
 bulk-checkin.c                  |   4 +-
 cache.h                         |  21 +++
 http-push.c                     |   2 +-
 object-file.c                   | 220 +++++++++++++++++++++++++++-----
 object-store.h                  |   9 ++
 t/t5328-unpack-large-objects.sh |  65 ++++++++++
 8 files changed, 384 insertions(+), 50 deletions(-)
 create mode 100755 t/t5328-unpack-large-objects.sh

Range-diff against v8:
1:  bd34da5816 ! 1:  6a6c11ba93 unpack-objects: low memory footprint for get_data() in dry_run mode
    @@ builtin/unpack-objects.c: static void unpack_delta_entry(enum object_type type,
      		hi = nr;
      		while (lo < hi) {
     
    - ## t/t5329-unpack-large-objects.sh (new) ##
    + ## t/t5328-unpack-large-objects.sh (new) ##
     @@
     +#!/bin/sh
     +#
    -+# Copyright (c) 2021 Han Xin
    ++# Copyright (c) 2022 Han Xin
     +#
     +
     +test_description='git unpack-objects with large objects'
    @@ t/t5329-unpack-large-objects.sh (new)
     +	git init --bare dest.git
     +}
     +
    -+assert_no_loose () {
    ++test_no_loose () {
     +	glob=dest.git/objects/?? &&
     +	echo "$glob" >expect &&
     +	eval "echo $glob" >actual &&
     +	test_cmp expect actual
     +}
     +
    -+assert_no_pack () {
    -+	rmdir dest.git/objects/pack
    -+}
    -+
     +test_expect_success "create large objects (1.5 MB) and PACK" '
     +	test-tool genrandom foo 1500000 >big-blob &&
     +	test_commit --append foo big-blob &&
    @@ t/t5329-unpack-large-objects.sh (new)
     +test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
     +	prepare_dest &&
     +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    -+	assert_no_loose &&
    -+	assert_no_pack
    ++	test_no_loose &&
    ++	test_dir_is_empty dest.git/objects/pack
     +'
     +
     +test_done
2:  f9a4365a7d ! 2:  bab9e0402f object-file.c: refactor write_loose_object() to several steps
    @@ Commit message
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
    -@@ object-file.c: static void write_object_file_prepare(const struct git_hash_algo *algo,
    - 	algo->final_oid_fn(oid, &c);
    - }
    - 
    -+/*
    -+ * Move the just written object with proper mtime into its final resting place.
    -+ */
    -+static int finalize_object_file_with_mtime(const char *tmpfile,
    -+					   const char *filename,
    -+					   time_t mtime,
    -+					   unsigned flags)
    -+{
    -+	struct utimbuf utb;
    -+
    -+	if (mtime) {
    -+		utb.actime = mtime;
    -+		utb.modtime = mtime;
    -+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
    -+			warning_errno(_("failed utime() on %s"), tmpfile);
    -+	}
    -+	return finalize_object_file(tmpfile, filename);
    -+}
    -+
    - /*
    -  * Move the just written object into its final resting place.
    -  */
    -@@ object-file.c: static inline int directory_size(const char *filename)
    -  * We want to avoid cross-directory filename renames, because those
    -  * can have problems on various filesystems (FAT, NFS, Coda).
    -  */
    --static int create_tmpfile(struct strbuf *tmp, const char *filename)
    -+static int create_tmpfile(struct strbuf *tmp, const char *filename,
    -+			  unsigned flags)
    - {
    - 	int fd, dirlen = directory_size(filename);
    - 
    -@@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filename)
    - 	strbuf_add(tmp, filename, dirlen);
    - 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
    - 	fd = git_mkstemp_mode(tmp->buf, 0444);
    --	if (fd < 0 && dirlen && errno == ENOENT) {
    -+	do {
    -+		if (fd >= 0 || !dirlen || errno != ENOENT)
    -+			break;
    - 		/*
    - 		 * Make sure the directory exists; note that the contents
    - 		 * of the buffer are undefined after mkstemp returns an
     @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filename)
    - 		strbuf_reset(tmp);
    - 		strbuf_add(tmp, filename, dirlen - 1);
    - 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
    --			return -1;
    -+			break;
    - 		if (adjust_shared_perm(tmp->buf))
    --			return -1;
    -+			break;
    - 
    - 		/* Try again */
    - 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
    - 		fd = git_mkstemp_mode(tmp->buf, 0444);
    -+	} while (0);
    -+
    -+	if (fd < 0 && !(flags & HASH_SILENT)) {
    -+		if (errno == EACCES)
    -+			return error(_("insufficient permission for adding an "
    -+				       "object to repository database %s"),
    -+				     get_object_directory());
    -+		else
    -+			return error_errno(_("unable to create temporary file"));
    - 	}
    -+
      	return fd;
      }
      
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     +				     git_zstream *stream,
     +				     unsigned char *buf, size_t buflen,
     +				     git_hash_ctx *c,
    -+				     enum object_type type, size_t len,
     +				     char *hdr, int hdrlen)
     +{
     +	int fd;
     +
    -+	fd = create_tmpfile(tmp_file, filename, flags);
    -+	if (fd < 0)
    -+		return -1;
    ++	fd = create_tmpfile(tmp_file, filename);
    ++	if (fd < 0) {
    ++		if (flags & HASH_SILENT)
    ++			return -1;
    ++		else if (errno == EACCES)
    ++			return error(_("insufficient permission for adding "
    ++				       "an object to repository database %s"),
    ++				     get_object_directory());
    ++		else
    ++			return error_errno(
    ++				_("unable to create temporary file"));
    ++	}
     +
     +	/*  Setup zlib stream for compression */
     +	git_deflate_init(stream, zlib_compression_level);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     +	 */
     +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
     +				       &stream, compressed, sizeof(compressed),
    -+				       &c, OBJ_NONE, 0, hdr, hdrlen);
    ++				       &c, hdr, hdrlen);
     +	if (fd < 0)
     +		return -1;
      
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	if (!oideq(oid, &parano_oid))
      		die(_("confused by unstable object source data for %s"),
      		    oid_to_hex(oid));
    - 
    - 	close_loose_object(fd);
    - 
    --	if (mtime) {
    --		struct utimbuf utb;
    --		utb.actime = mtime;
    --		utb.modtime = mtime;
    --		if (utime(tmp_file.buf, &utb) < 0 &&
    --		    !(flags & HASH_SILENT))
    --			warning_errno(_("failed utime() on %s"), tmp_file.buf);
    --	}
    --
    --	return finalize_object_file(tmp_file.buf, filename.buf);
    -+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
    -+					       mtime, flags);
    - }
    - 
    - static int freshen_loose_object(const struct object_id *oid)
3:  18dd21122d < -:  ---------- object-file.c: remove the slash for directory_size()
4:  964715451b ! 3:  dd13614985 object-file.c: add "stream_loose_object()" to handle large object
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	 */
     +	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
     +				       &stream, compressed, sizeof(compressed),
    -+				       &c, OBJ_BLOB, len, hdr, hdrlen);
    ++				       &c, hdr, hdrlen);
     +	if (fd < 0) {
     +		err = -1;
     +		goto cleanup;
5:  3f620466fe ! 4:  cd84e27b08 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      		write_object(nr, type, buf, size);
      }
     
    - ## t/t5329-unpack-large-objects.sh ##
    -@@ t/t5329-unpack-large-objects.sh: test_description='git unpack-objects with large objects'
    + ## t/t5328-unpack-large-objects.sh ##
    +@@ t/t5328-unpack-large-objects.sh: test_description='git unpack-objects with large objects'
      
      prepare_dest () {
      	test_when_finished "rm -rf dest.git" &&
    @@ t/t5329-unpack-large-objects.sh: test_description='git unpack-objects with large
     +	fi
      }
      
    - assert_no_loose () {
    -@@ t/t5329-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1MB' '
    + test_no_loose () {
    +@@ t/t5328-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1MB' '
      '
      
      test_expect_success 'unpack-objects failed under memory limitation' '
    @@ t/t5329-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1
     -	prepare_dest &&
     +	prepare_dest 2m &&
      	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    - 	assert_no_loose &&
    - 	assert_no_pack
    + 	test_no_loose &&
    + 	test_dir_is_empty dest.git/objects/pack
      '
      
     +test_expect_success 'unpack big object in stream' '
     +	prepare_dest 1m &&
     +	git -C dest.git unpack-objects <test-$PACK.pack &&
    -+	assert_no_pack
    ++	test_dir_is_empty dest.git/objects/pack
     +'
     +
     +test_expect_success 'do not unpack existing large objects' '
     +	prepare_dest 1m &&
     +	git -C dest.git index-pack --stdin <test-$PACK.pack &&
     +	git -C dest.git unpack-objects <test-$PACK.pack &&
    -+	assert_no_loose
    ++	test_no_loose
     +'
     +
      test_done
6:  8073a3888d = 5:  59f0ad95c7 object-file API: add a format_object_header() function
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v9 1/5] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 2/5] object-file.c: refactor write_loose_object() to several steps Han Xin
                             ` (3 subsequent siblings)
  5 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
size of memory. Allocating memory for a large blob object may cause the
system to run out of memory. Before replacing the call to "get_data()"
for unpacking large blob objects in later commits, refactor
"get_data()" to reduce memory footprint for dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 39 +++++++++++++++++++--------
 t/t5328-unpack-large-objects.sh | 48 +++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 11 deletions(-)
 create mode 100755 t/t5328-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..c6d6c17072 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,31 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from stdin and return specific size of data.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +140,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5328-unpack-large-objects.sh b/t/t5328-unpack-large-objects.sh
new file mode 100755
index 0000000000..45a3316e06
--- /dev/null
+++ b/t/t5328-unpack-large-objects.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_no_loose () {
+	glob=dest.git/objects/?? &&
+	echo "$glob" >expect &&
+	eval "echo $glob" >actual &&
+	test_cmp expect actual
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs test)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	test_no_loose &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v9 2/5] object-file.c: refactor write_loose_object() to several steps
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
  2022-01-20 11:21           ` [PATCH v9 1/5] " Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 3/5] object-file.c: add "stream_loose_object()" to handle large object Han Xin
                             ` (2 subsequent siblings)
  5 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a stream
version function ("stream_loose_object()") in a later commit to resolve
this issue.

Before introducing a stream version function for writing loose objects,
do some refactoring on "write_loose_object()" to reuse code for both
versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending the zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 105 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 75 insertions(+), 30 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb1426f98c..422b43212a 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1857,6 +1857,59 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+static void end_loose_object_common(int ret, git_hash_ctx *c,
+				    git_zstream *stream,
+				    struct object_id *parano_oid,
+				    const struct object_id *expected_oid,
+				    const char *die_msg1_fmt,
+				    const char *die_msg2_fmt)
+{
+	if (ret != Z_STREAM_END)
+		die(_(die_msg1_fmt), ret, expected_oid);
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		die(_(die_msg2_fmt), ret, expected_oid);
+	the_hash_algo->final_oid_fn(parano_oid, c);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1871,28 +1924,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose oject:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1907,14 +1950,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-		    ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose oject:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid to "parano_oid".
+	 */
+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
+				N_("unable to deflate new object %s (%d)"),
+				N_("deflateEnd on object %s failed (%d)"));
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v9 3/5] object-file.c: add "stream_loose_object()" to handle large object
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
                             ` (2 preceding siblings ...)
  2022-01-20 11:21           ` [PATCH v9 2/5] object-file.c: refactor write_loose_object() to several steps Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 4/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2022-01-20 11:21           ` [PATCH v9 5/5] object-file API: add a format_object_header() function Han Xin
  5 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using "write_loose_object",
we have to feed it with a buffer of the same size as the object, which
will consume lots of memory and may cause OOM. This can be improved by
feeding data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", which is a stream version of
"write_loose_object()" but with a low memory footprint. We will use this
function to unpack large blob objects in a later commit.

Another difference with "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in ".git/objects/"
directory instead.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   9 +++++
 2 files changed, 110 insertions(+)

diff --git a/object-file.c b/object-file.c
index 422b43212a..a738f47cb2 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1996,6 +1996,107 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose oject:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose oject:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	end_loose_object_common(ret, &c, &stream, oid, NULL,
+				N_("unable to stream deflate new object (%d)"),
+				N_("deflateEnd on stream object failed (%d)"));
+
+	close_loose_object(fd);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..cc41c64d69 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -232,6 +238,9 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v9 4/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
                             ` (3 preceding siblings ...)
  2022-01-20 11:21           ` [PATCH v9 3/5] object-file.c: add "stream_loose_object()" to handle large object Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 5/5] object-file API: add a format_object_header() function Han Xin
  5 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()". However, unpacking
non-delta objects from a stream instead of from an entire buffer will
incur a 10% performance penalty.

    $ hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare
       https://github.com/microsoft/scalar.git;
       cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      ...

    Summary
      './git -C dest.git -c core.bigFileThreshold=512m
      unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~1'

Therefore, only unpack objects larger than the "core.bigFileThreshold"
in zstream. Until now, the config variable has been used in the
following cases, and our new case belongs to the packfile category.

 * Archive:

   + archive.c: write_entry(): write large blob entries to archive in
     stream.

 * Loose objects:

   + object-file.c: index_fd(): when hashing large files in worktree,
     read files in a stream, and create one packfile per large blob if
     want to save files to git object store.

   + object-file.c: read_loose_object(): when checking loose objects
     using "git-fsck", do not read full content of large loose objects.

 * Packfile:

   + fast-import.c: parse_and_store_blob(): streaming large blob from
     foreign source to packfile.

   + index-pack.c: check_collison(): read and check large blob in stream.

   + index-pack.c: unpack_entry_data(): do not return the entire
     contents of the big blob from packfile, but uses a fixed buf to
     perform some integrity checks on the object.

   + pack-check.c: verify_packfile(): used by "git-fsck" and will call
     check_object_signature() to check large blob in pack with the
     streaming interface.

   + pack-objects.c: get_object_details(): set "no_try_delta" for large
     blobs when counting objects.

   + pack-objects.c: write_no_reuse_object(): streaming large blob to
     pack.

   + unpack-objects.c: unpack_non_delta_entry(): unpack large blob in
     stream from packfile.

 * Others:

   + diff.c: diff_populate_filespec(): treat large blob file as binary.

   + streaming.c: istream_source(): as a helper of "open_istream()" to
     select proper streaming interface to read large blob from packfile.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 71 ++++++++++++++++++++++++++++++++-
 t/t5328-unpack-large-objects.sh | 23 +++++++++--
 2 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index c6d6c17072..e9ec2b349d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -343,11 +343,80 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (buf)
 		write_object(nr, type, buf, size);
 }
diff --git a/t/t5328-unpack-large-objects.sh b/t/t5328-unpack-large-objects.sh
index 45a3316e06..f4129979f9 100755
--- a/t/t5328-unpack-large-objects.sh
+++ b/t/t5328-unpack-large-objects.sh
@@ -9,7 +9,11 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 test_no_loose () {
@@ -33,16 +37,29 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
 	test_no_loose &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_no_loose
+'
+
 test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* [PATCH v9 5/5] object-file API: add a format_object_header() function
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
                             ` (4 preceding siblings ...)
  2022-01-20 11:21           ` [PATCH v9 4/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2022-01-20 11:21           ` Han Xin
  5 siblings, 0 replies; 165+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Add a convenience function to wrap the xsnprintf() command that
generates loose object headers. This code was copy/pasted in various
parts of the codebase, let's define it in one place and re-use it from
there.

All except one caller of it had a valid "enum object_type" for us,
it's only write_object_file_prepare() which might need to deal with
"git hash-object --literally" and a potential garbage type. Let's have
the primary API use an "enum object_type", and define an *_extended()
function that can take an arbitrary "const char *" for the type.

See [1] for the discussion that prompted this patch, i.e. new code in
object-file.c that wanted to copy/paste the xsnprintf() invocation.

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/index-pack.c |  3 +--
 bulk-checkin.c       |  4 ++--
 cache.h              | 21 +++++++++++++++++++++
 http-push.c          |  2 +-
 object-file.c        | 16 ++++++++++++----
 5 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c23d01de7d..8a6ce77940 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
 	int hdrlen;
 
 	if (!is_delta_type(type)) {
-		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
-				   type_name(type),(uintmax_t)size) + 1;
+		hdrlen = format_object_header(hdr, sizeof(hdr), type, size);
 		the_hash_algo->init_fn(&c);
 		the_hash_algo->update_fn(&c, hdr, hdrlen);
 	} else
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 8785b2ac80..9e685f0f1a 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
-			       type_name(type), (uintmax_t)size) + 1;
+	header_len = format_object_header((char *)obuf, sizeof(obuf),
+					 type, size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 
diff --git a/cache.h b/cache.h
index cfba463aa9..64071a8d80 100644
--- a/cache.h
+++ b/cache.h
@@ -1310,6 +1310,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 						    unsigned long bufsiz,
 						    struct strbuf *hdrbuf);
 
+/**
+ * format_object_header() is a thin wrapper around s xsnprintf() that
+ * writes the initial "<type> <obj-len>" part of the loose object
+ * header. It returns the size that snprintf() returns + 1.
+ *
+ * The format_object_header_extended() function allows for writing a
+ * type_name that's not one of the "enum object_type" types. This is
+ * used for "git hash-object --literally". Pass in a OBJ_NONE as the
+ * type, and a non-NULL "type_str" to do that.
+ *
+ * format_object_header() is a convenience wrapper for
+ * format_object_header_extended().
+ */
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *type_str, size_t objsize);
+static inline int format_object_header(char *str, size_t size,
+				      enum object_type type, size_t objsize)
+{
+	return format_object_header_extended(str, size, type, NULL, objsize);
+}
+
 /**
  * parse_loose_header() parses the starting "<type> <len>\0" of an
  * object. If it doesn't follow that format -1 is returned. To check
diff --git a/http-push.c b/http-push.c
index 3309aaf004..f0c044dcf7 100644
--- a/http-push.c
+++ b/http-push.c
@@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
 	git_zstream stream;
 
 	unpacked = read_object_file(&request->obj->oid, &type, &len);
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 
 	/* Set it up */
 	git_deflate_init(&stream, zlib_compression_level);
diff --git a/object-file.c b/object-file.c
index a738f47cb2..0dce5d2fec 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *typestr, size_t objsize)
+{
+	const char *s = type == OBJ_NONE ? typestr : type_name(type);
+
+	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = format_object_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2011,7 +2019,7 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
 
 	/* Since oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
 
 	/* Common steps for write_loose_object and stream_loose_object to
 	 * start writing loose oject:
@@ -2152,7 +2160,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 165+ messages in thread

* C99 %z (was: [PATCH v7 2/5] object-file API: add a format_object_header() function)
  2021-12-21 14:30           ` René Scharfe
@ 2022-02-01 14:28             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-01 14:28 UTC (permalink / raw)
  To: René Scharfe
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Derrick Stolee, Han Xin


On Tue, Dec 21 2021, René Scharfe wrote:

> Am 21.12.21 um 12:51 schrieb Han Xin:
>> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>> [...]
>>  		the_hash_algo->init_fn(&c);
>>  		the_hash_algo->update_fn(&c, hdr, hdrlen);
>>  	} else
>> diff --git a/bulk-checkin.c b/bulk-checkin.c
>> index 8785b2ac80..1733a1de4f 100644
>> --- a/bulk-checkin.c
>> +++ b/bulk-checkin.c
>> @@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
>>  	if (seekback == (off_t) -1)
>>  		return error("cannot find the current offset");
>>
>> -	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
>> -			       type_name(type), (uintmax_t)size) + 1;
>> +	header_len = format_object_header((char *)obuf, sizeof(obuf),
>> +					 type, (uintmax_t)size);
>                                                ^^^^^^^^^^^
> Same here, just that size is already of type size_t, so a cast makes
> even less sense.

Thanks, this and the below is something I made sure to include in a
re-roll I'm about to send (to do these cleanups in object-file.c
separately from Han Xin's series).

>> +int format_object_header_extended(char *str, size_t size, enum object_type type,
>> +				 const char *typestr, size_t objsize)
>> +{
>> +	const char *s = type == OBJ_NONE ? typestr : type_name(type);
>> +
>> +	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
>                                                       ^^^^^^^^^^^
> This cast is necessary to match PRIuMAX.  And that is used because the z
> modifier (as in e.g. printf("%zu", sizeof(size_t));) was only added in
> C99 and not all platforms may have it.  (Perhaps this cautious approach
> is worth revisiting separately, now that some time has passed, but this
> patch series should still use PRIuMAX, as it does.)

I tried to use %z recently and found that the CI breaks on Windows, but
this was a few months ago. But I think the status of that particular C99
feature is that we can't use it freely, unfortunately. I may be wrong
about that, I haven't looked it any detail beyond running those CI
errors.

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v9 0/5] unpack large blobs in stream
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
@ 2022-02-01 21:24             ` Ævar Arnfjörð Bjarmason
  2022-02-02  8:32               ` Han Xin
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-01 21:24 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Jiang Xin, Han Xin


On Thu, Jan 20 2022, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Changes since v8:
> * Rename "assert_no_loose ()" into "test_no_loose ()" in
>   "t5329-unpack-large-objects.sh". Remove "assert_no_pack ()" and use
>   "test_dir_is_empty" instead.
>
> * Revert changes to "create_tmpfile()" and error handling is now in
>   "start_loose_object_common()".
>
> * Remove "finalize_object_file_with_mtime()" which seems to be an overkill
>   for "write_loose_object()" now. 
>
> * Remove the commit "object-file.c: remove the slash for directory_size()",
>   it can be in a separate patch if necessary.
>
> Han Xin (4):
>   unpack-objects: low memory footprint for get_data() in dry_run mode
>   object-file.c: refactor write_loose_object() to several steps
>   object-file.c: add "stream_loose_object()" to handle large object
>   unpack-objects: unpack_non_delta_entry() read data in a stream
>
> Ævar Arnfjörð Bjarmason (1):
>   object-file API: add a format_object_header() function

I sent
https://lore.kernel.org/git/cover-00.10-00000000000-20220201T144803Z-avarab@gmail.com/
today which suggests splitting out the 5/5 cleanup you'd integrated.

I then rebased these patches of yours on top of that, the result is
here:
https://github.com/avar/git/tree/han-xin-avar/unpack-loose-object-streaming-9

The range-diff to your version is below. There's a few unrelated
fixes/nits in it.

I think with/without basing this on top of my series above your patches
here look good with the nits pointed out in the diff below addressed
(and some don't need to be). I.e. the dependency on it is rather
trivial, and the two could be split up.

What do you think is a good way to proceed? I could just submit the
below as a proposed v10 if you'd like & agree...

1:  553a9377eb3 ! 1:  61fcfe7b840 unpack-objects: low memory footprint for get_data() in dry_run mode
    @@ Commit message
         unpack-objects: low memory footprint for get_data() in dry_run mode
     
         As the name implies, "get_data(size)" will allocate and return a given
    -    size of memory. Allocating memory for a large blob object may cause the
    +    amount of memory. Allocating memory for a large blob object may cause the
         system to run out of memory. Before preparing to replace calling of
         "get_data()" to unpack large blob objects in latter commits, refactor
         "get_data()" to reduce memory footprint for dry_run mode.
    @@ Commit message
     
         Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## builtin/unpack-objects.c ##
     @@ builtin/unpack-objects.c: static void use(int bytes)
    @@ t/t5328-unpack-large-objects.sh (new)
     +
     +test_no_loose () {
     +	glob=dest.git/objects/?? &&
    -+	echo "$glob" >expect &&
    -+	eval "echo $glob" >actual &&
    ++	echo $glob >expect &&
    ++	echo "$glob" >actual &&
     +	test_cmp expect actual
     +}
     +
-:  ----------- > 2:  c6b0437db03 object-file.c: do fsync() and close() before post-write die()
2:  88c91affd61 ! 3:  77bcfe3da6f object-file.c: refactor write_loose_object() to several steps
    @@ Commit message
         When writing a large blob using "write_loose_object()", we have to pass
         a buffer with the whole content of the blob, and this behavior will
         consume lots of memory and may cause OOM. We will introduce a stream
    -    version function ("stream_loose_object()") in latter commit to resolve
    +    version function ("stream_loose_object()") in later commit to resolve
         this issue.
     
    -    Before introducing a stream vesion function for writing loose object,
    -    do some refactoring on "write_loose_object()" to reuse code for both
    -    versions.
    +    Before introducing that streaming function, do some refactoring on
    +    "write_loose_object()" to reuse code for both versions.
     
         Rewrite "write_loose_object()" as follows:
     
    @@ Commit message
     
          3. Compress data.
     
    -     4. Move common steps for ending zlib stream into a new funciton
    +     4. Move common steps for ending zlib stream into a new function
             "end_loose_object_common()".
     
          5. Close fd and finalize the object file.
    @@ Commit message
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## object-file.c ##
     @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filename)
      	return fd;
      }
      
    ++/**
    ++ * Common steps for loose object writers to start writing loose
    ++ * objects:
    ++ *
    ++ * - Create tmpfile for the loose object.
    ++ * - Setup zlib stream for compression.
    ++ * - Start to feed header to zlib stream.
    ++ *
    ++ * Returns a "fd", which should later be provided to
    ++ * end_loose_object_common().
    ++ */
     +static int start_loose_object_common(struct strbuf *tmp_file,
     +				     const char *filename, unsigned flags,
     +				     git_zstream *stream,
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     +	return fd;
     +}
     +
    -+static void end_loose_object_common(int ret, git_hash_ctx *c,
    ++/**
    ++ * Common steps for loose object writers to end writing loose objects:
    ++ *
    ++ * - End the compression of zlib stream.
    ++ * - Get the calculated oid to "parano_oid".
    ++ * - fsync() and close() the "fd"
    ++ */
    ++static void end_loose_object_common(int fd, int ret, git_hash_ctx *c,
     +				    git_zstream *stream,
     +				    struct object_id *parano_oid,
     +				    const struct object_id *expected_oid,
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     +	if (ret != Z_OK)
     +		die(_(die_msg2_fmt), ret, expected_oid);
     +	the_hash_algo->final_oid_fn(parano_oid, c);
    ++
    ++	/*
    ++	 * We already did a write_buffer() to the "fd", let's fsync()
    ++	 * and close().
    ++	 *
    ++	 * We might still die() on a subsequent sanity check, but
    ++	 * let's not add to that confusion by not flushing any
    ++	 * outstanding writes to disk first.
    ++	 */
    ++	close_loose_object(fd);
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -	while (git_deflate(&stream, 0) == Z_OK)
     -		; /* nothing */
     -	the_hash_algo->update_fn(&c, hdr, hdrlen);
    -+	/* Common steps for write_loose_object and stream_loose_object to
    -+	 * start writing loose oject:
    -+	 *
    -+	 *  - Create tmpfile for the loose object.
    -+	 *  - Setup zlib stream for compression.
    -+	 *  - Start to feed header to zlib stream.
    -+	 */
     +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
     +				       &stream, compressed, sizeof(compressed),
     +				       &c, hdr, hdrlen);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
     -		    ret);
     -	the_hash_algo->final_oid_fn(&parano_oid, &c);
    -+	/* Common steps for write_loose_object and stream_loose_object to
    -+	 * end writing loose oject:
    -+	 *
    -+	 *  - End the compression of zlib stream.
    -+	 *  - Get the calculated oid to "parano_oid".
    -+	 */
    -+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
    +-
    +-	/*
    +-	 * We already did a write_buffer() to the "fd", let's fsync()
    +-	 * and close().
    +-	 *
    +-	 * We might still die() on a subsequent sanity check, but
    +-	 * let's not add to that confusion by not flushing any
    +-	 * outstanding writes to disk first.
    +-	 */
    +-	close_loose_object(fd);
    ++	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
     +				N_("unable to deflate new object %s (%d)"),
     +				N_("deflateEnd on object %s failed (%d)"));
    -+
    + 
      	if (!oideq(oid, &parano_oid))
      		die(_("confused by unstable object source data for %s"),
    - 		    oid_to_hex(oid));
3:  054a00ed21d ! 4:  71c10e734d1 object-file.c: add "stream_loose_object()" to handle large object
    @@ Commit message
     
         Add a new function "stream_loose_object()", which is a stream version of
         "write_loose_object()" but with a low memory footprint. We will use this
    -    function to unpack large blob object in latter commit.
    +    function to unpack large blob object in later commit.
     
         Another difference with "write_loose_object()" is that we have no chance
         to run "write_object_file_prepare()" to calculate the oid in advance.
         In "write_loose_object()", we know the oid and we can write the
         temporary file in the same directory as the final object, but for an
         object with an undetermined oid, we don't know the exact directory for
    -    the object, so we have to save the temporary file in ".git/objects/"
    -    directory instead.
    +    the object.
    +
    +    Still, we need to save the temporary file we're preparing
    +    somewhere. We'll do that in the top-level ".git/objects/"
    +    directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
    +    streamed it we'll know the OID, and will move it to its canonical
    +    path.
     
         "freshen_packed_object()" or "freshen_loose_object()" will be called
         inside "stream_loose_object()" after obtaining the "oid".
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +
     +	/* Since oid is not determined, save tmp file to odb path. */
     +	strbuf_addf(&filename, "%s/", get_object_directory());
    -+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
    ++	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
     +
     +	/* Common steps for write_loose_object and stream_loose_object to
     +	 * start writing loose oject:
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	 *  - End the compression of zlib stream.
     +	 *  - Get the calculated oid.
     +	 */
    -+	end_loose_object_common(ret, &c, &stream, oid, NULL,
    ++	end_loose_object_common(fd, ret, &c, &stream, oid, NULL,
     +				N_("unable to stream deflate new object (%d)"),
     +				N_("deflateEnd on stream object failed (%d)"));
     +
    -+	close_loose_object(fd);
    -+
     +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
     +		unlink_or_warn(tmp_file.buf);
     +		goto cleanup;
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +}
     +
      int write_object_file_flags(const void *buf, unsigned long len,
    - 			    const char *type, struct object_id *oid,
    + 			    enum object_type type, struct object_id *oid,
      			    unsigned flags)
     
      ## object-store.h ##
    @@ object-store.h: static inline int write_object_file(const void *buf, unsigned lo
      
     +int stream_loose_object(struct input_stream *in_stream, size_t len,
     +			struct object_id *oid);
    -+
    - int hash_object_file_literally(const void *buf, unsigned long len,
    - 			       const char *type, struct object_id *oid,
    - 			       unsigned flags);
    + int hash_write_object_file_literally(const void *buf, unsigned long len,
    + 				     const char *type, struct object_id *oid,
    + 				     unsigned flags);
-:  ----------- > 5:  3c1d788d69d core doc: modernize core.bigFileThreshold documentation
4:  6bcba6bce66 ! 6:  8b83f6d6b83 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    unpack-objects: unpack_non_delta_entry() read data in a stream
    +    unpack-objects: use stream_loose_object() to unpack large objects
     
    -    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    -    entire contents of a blob object, no matter how big it is. This
    -    implementation may consume all the memory and cause OOM.
    +    Make use of the stream_loose_object() function introduced in the
    +    preceding commit to unpack large objects. Before this we'd need to
    +    malloc() the size of the blob before unpacking it, which could cause
    +    OOM with very large blobs.
     
    -    By implementing a zstream version of input_stream interface, we can use
    -    a small fixed buffer for "unpack_non_delta_entry()". However, unpack
    -    non-delta objects from a stream instead of from an entrie buffer will
    -    have 10% performance penalty.
    +    We could use this new interface to unpack all blobs, but doing so
    +    would result in a performance penalty of around 10%, as the below
    +    "hyperfine" benchmark will show. We therefore limit this to files
    +    larger than "core.bigFileThreshold":
     
             $ hyperfine \
               --setup \
    @@ Commit message
                         -c core.bigFileThreshold=16k unpack-objects
                         <small.pack' in 'HEAD~1'
     
    -    Therefore, only unpack objects larger than the "core.bigFileThreshold"
    -    in zstream. Until now, the config variable has been used in the
    -    following cases, and our new case belongs to the packfile category.
    +    An earlier version of this patch introduced a new
    +    "core.bigFileStreamingThreshold" instead of re-using the existing
    +    "core.bigFileThreshold" variable[1]. As noted in a detailed overview
    +    of its users in [2] using it has several different meanings.
     
    -     * Archive:
    +    Still, we consider it good enough to simply re-use it. While it's
    +    possible that someone might want to e.g. consider objects "small" for
    +    the purposes of diffing but "big" for the purposes of writing them
    +    such use-cases are probably too obscure to worry about. We can always
    +    split up "core.bigFileThreshold" in the future if there's a need for
    +    that.
     
    -       + archive.c: write_entry(): write large blob entries to archive in
    -         stream.
    -
    -     * Loose objects:
    -
    -       + object-file.c: index_fd(): when hashing large files in worktree,
    -         read files in a stream, and create one packfile per large blob if
    -         want to save files to git object store.
    -
    -       + object-file.c: read_loose_object(): when checking loose objects
    -         using "git-fsck", do not read full content of large loose objects.
    -
    -     * Packfile:
    -
    -       + fast-import.c: parse_and_store_blob(): streaming large blob from
    -         foreign source to packfile.
    -
    -       + index-pack.c: check_collison(): read and check large blob in stream.
    -
    -       + index-pack.c: unpack_entry_data(): do not return the entire
    -         contents of the big blob from packfile, but uses a fixed buf to
    -         perform some integrity checks on the object.
    -
    -       + pack-check.c: verify_packfile(): used by "git-fsck" and will call
    -         check_object_signature() to check large blob in pack with the
    -         streaming interface.
    -
    -       + pack-objects.c: get_object_details(): set "no_try_delta" for large
    -         blobs when counting objects.
    -
    -       + pack-objects.c: write_no_reuse_object(): streaming large blob to
    -         pack.
    -
    -       + unpack-objects.c: unpack_non_delta_entry(): unpack large blob in
    -         stream from packfile.
    -
    -     * Others:
    -
    -       + diff.c: diff_populate_filespec(): treat large blob file as binary.
    -
    -       + streaming.c: istream_source(): as a helper of "open_istream()" to
    -         select proper streaming interface to read large blob from packfile.
    +    1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
    +    2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
     
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    + ## Documentation/config/core.txt ##
    +@@ Documentation/config/core.txt: usage, at the slight expense of increased disk usage.
    + * Will be generally be streamed when written, which avoids excessive
    + memory usage, at the cost of some fixed overhead. Commands that make
    + use of this include linkgit:git-archive[1],
    +-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
    +-linkgit:git-fsck[1].
    ++linkgit:git-fast-import[1], linkgit:git-index-pack[1],
    ++linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
    + 
    + core.excludesFile::
    + 	Specifies the pathname to the file that contains patterns to
    +
      ## builtin/unpack-objects.c ##
     @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type type,
      	}
5:  1bfaf89ee0b < -:  ----------- object-file API: add a format_object_header() function

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v9 0/5] unpack large blobs in stream
  2022-02-01 21:24             ` Ævar Arnfjörð Bjarmason
@ 2022-02-02  8:32               ` Han Xin
  2022-02-02 10:59                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 165+ messages in thread
From: Han Xin @ 2022-02-02  8:32 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Jiang Xin, Han Xin

On Wed, Feb 2, 2022 at 5:28 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Thu, Jan 20 2022, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > Changes since v8:
> > * Rename "assert_no_loose ()" into "test_no_loose ()" in
> >   "t5329-unpack-large-objects.sh". Remove "assert_no_pack ()" and use
> >   "test_dir_is_empty" instead.
> >
> > * Revert changes to "create_tmpfile()" and error handling is now in
> >   "start_loose_object_common()".
> >
> > * Remove "finalize_object_file_with_mtime()" which seems to be an overkill
> >   for "write_loose_object()" now.
> >
> > * Remove the commit "object-file.c: remove the slash for directory_size()",
> >   it can be in a separate patch if necessary.
> >
> > Han Xin (4):
> >   unpack-objects: low memory footprint for get_data() in dry_run mode
> >   object-file.c: refactor write_loose_object() to several steps
> >   object-file.c: add "stream_loose_object()" to handle large object
> >   unpack-objects: unpack_non_delta_entry() read data in a stream
> >
> > Ævar Arnfjörð Bjarmason (1):
> >   object-file API: add a format_object_header() function
>
> I sent
> https://lore.kernel.org/git/cover-00.10-00000000000-20220201T144803Z-avarab@gmail.com/
> today which suggests splitting out the 5/5 cleanup you'd integrated.
>
> I then rebased these patches of yours on top of that, the result is
> here:
> https://github.com/avar/git/tree/han-xin-avar/unpack-loose-object-streaming-9
>
> The range-diff to your version is below. There's a few unrelated
> fixes/nits in it.
>
> I think with/without basing this on top of my series above your patches
> here look good with the nits pointed out in the diff below addressed
> (and some don't need to be). I.e. the dependency on it is rather
> trivial, and the two could be split up.
>
> What do you think is a good way to proceed? I could just submit the
> below as a proposed v10 if you'd like & agree...
>

Yes, thanks for the suggestions, and I'm glad you're happy to do so.

Thanks.
-Han Xin

> 1:  553a9377eb3 ! 1:  61fcfe7b840 unpack-objects: low memory footprint for get_data() in dry_run mode
>     @@ Commit message
>          unpack-objects: low memory footprint for get_data() in dry_run mode
>
>          As the name implies, "get_data(size)" will allocate and return a given
>     -    size of memory. Allocating memory for a large blob object may cause the
>     +    amount of memory. Allocating memory for a large blob object may cause the
>          system to run out of memory. Before preparing to replace calling of
>          "get_data()" to unpack large blob objects in latter commits, refactor
>          "get_data()" to reduce memory footprint for dry_run mode.
>     @@ Commit message
>
>          Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>          Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>     +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>
>       ## builtin/unpack-objects.c ##
>      @@ builtin/unpack-objects.c: static void use(int bytes)
>     @@ t/t5328-unpack-large-objects.sh (new)
>      +
>      +test_no_loose () {
>      +  glob=dest.git/objects/?? &&
>     -+  echo "$glob" >expect &&
>     -+  eval "echo $glob" >actual &&
>     ++  echo $glob >expect &&
>     ++  echo "$glob" >actual &&
>      +  test_cmp expect actual
>      +}
>      +

I have a small doubt about this: it works fine with dash, but not with
others like zsh. Wouldn't it be better to make it compatible, or would
that introduce other issues that I'm not aware of?

Thanks.
-Han Xin

^ permalink raw reply	[flat|nested] 165+ messages in thread

* Re: [PATCH v9 0/5] unpack large blobs in stream
  2022-02-02  8:32               ` Han Xin
@ 2022-02-02 10:59                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 165+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-02 10:59 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Jiang Xin, Han Xin


On Wed, Feb 02 2022, Han Xin wrote:

> On Wed, Feb 2, 2022 at 5:28 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>>
>>
>> On Thu, Jan 20 2022, Han Xin wrote:
>>
>> > From: Han Xin <hanxin.hx@alibaba-inc.com>
>> >
>> > Changes since v8:
>> > * Rename "assert_no_loose ()" into "test_no_loose ()" in
>> >   "t5329-unpack-large-objects.sh". Remove "assert_no_pack ()" and use
>> >   "test_dir_is_empty" instead.
>> >
>> > * Revert changes to "create_tmpfile()" and error handling is now in
>> >   "start_loose_object_common()".
>> >
>> > * Remove "finalize_object_file_with_mtime()" which seems to be an overkill
>> >   for "write_loose_object()" now.
>> >
>> > * Remove the commit "object-file.c: remove the slash for directory_size()",
>> >   it can be in a separate patch if necessary.
>> >
>> > Han Xin (4):
>> >   unpack-objects: low memory footprint for get_data() in dry_run mode
>> >   object-file.c: refactor write_loose_object() to several steps
>> >   object-file.c: add "stream_loose_object()" to handle large object
>> >   unpack-objects: unpack_non_delta_entry() read data in a stream
>> >
>> > Ævar Arnfjörð Bjarmason (1):
>> >   object-file API: add a format_object_header() function
>>
>> I sent
>> https://lore.kernel.org/git/cover-00.10-00000000000-20220201T144803Z-avarab@gmail.com/
>> today which suggests splitting out the 5/5 cleanup you'd integrated.
>>
>> I then rebased these patches of yours on top of that, the result is
>> here:
>> https://github.com/avar/git/tree/han-xin-avar/unpack-loose-object-streaming-9
>>
>> The range-diff to your version is below. There's a few unrelated
>> fixes/nits in it.
>>
>> I think with/without basing this on top of my series above your patches
>> here look good with the nits pointed out in the diff below addressed
>> (and some don't need to be). I.e. the dependency on it is rather
>> trivial, and the two could be split up.
>>
>> What do you think is a good way to proceed? I could just submit the
>> below as a proposed v10 if you'd like & agree...
>>
>
> Yes, thanks for the suggestions, and I'm glad you're happy to do so.

Willdo.

>> 1:  553a9377eb3 ! 1:  61fcfe7b840 unpack-objects: low memory footprint for get_data() in dry_run mode
>>     @@ Commit message
>>          unpack-objects: low memory footprint for get_data() in dry_run mode
>>
>>          As the name implies, "get_data(size)" will allocate and return a given
>>     -    size of memory. Allocating memory for a large blob object may cause the
>>     +    amount of memory. Allocating memory for a large blob object may cause the
>>          system to run out of memory. Before preparing to replace calling of
>>          "get_data()" to unpack large blob objects in latter commits, refactor
>>          "get_data()" to reduce memory footprint for dry_run mode.
>>     @@ Commit message
>>
>>          Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>>          Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>>     +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>>
>>       ## builtin/unpack-objects.c ##
>>      @@ builtin/unpack-objects.c: static void use(int bytes)
>>     @@ t/t5328-unpack-large-objects.sh (new)
>>      +
>>      +test_no_loose () {
>>      +  glob=dest.git/objects/?? &&
>>     -+  echo "$glob" >expect &&
>>     -+  eval "echo $glob" >actual &&
>>     ++  echo $glob >expect &&
>>     ++  echo "$glob" >actual &&
>>      +  test_cmp expect actual
>>      +}
>>      +
>
> I have a small doubt with this, it works fine with dash, but not
> others like zsh. Wouldn't
> it be better to do compatibility, or would it introduce other issues
> that I don't know?

Ah, I hadn't spotted that zsh issue. I don't think the test suite will
run on it in general, but in any case I'll fix this.

There's a few other tests that do this just by piping "find" to "wc -l",
it's probably better to ju