* [PATCH] unpack-objects: unpack large object in stream
@ 2021-10-09  8:20 Han Xin
  2021-10-19  7:37 ` Han Xin
                   ` (14 more replies)
  0 siblings, 15 replies; 211+ messages in thread
From: Han Xin @ 2021-10-09  8:20 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When calling "unpack_non_delta_entry()", we will allocate memory for
the whole size of the unpacked object and write the buffer to a loose
file on disk. This may lead to OOM for the git-unpack-objects process
when unpacking a very large object.

In the function "unpack_delta_entry()", we will also allocate memory to
buffer the whole delta, but since there will be no delta for an object
larger than "core.bigFileThreshold", this issue is moderate.

To resolve the OOM issue in "git-unpack-objects", we can unpack a large
object to a file in a stream, and use the setting of
"core.bigFileThreshold" as the threshold for large objects.
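
The shape of the change is the classic bounded-buffer zlib loop. For
illustration only (raw zlib rather than our git_zstream wrappers,
arbitrary buffer sizes, and the deflating and hashing of the output
elided):

	#include <zlib.h>
	#include <unistd.h>

	/* Inflate from in_fd to out_fd with fixed-size buffers. */
	static int inflate_in_stream(int in_fd, int out_fd)
	{
		unsigned char in[4096], out[8192];
		z_stream z = { 0 };
		int status = Z_OK;

		if (inflateInit(&z) != Z_OK)
			return -1;
		do {
			if (!z.avail_in) {
				/* like fill(): refill zlib's input window */
				ssize_t n = read(in_fd, in, sizeof(in));
				if (n <= 0)
					break;
				z.next_in = in;
				z.avail_in = (uInt)n;
			}
			z.next_out = out;
			z.avail_out = sizeof(out);
			status = inflate(&z, Z_NO_FLUSH);
			/* the patch deflates and hashes out[] here */
			if (write(out_fd, out, sizeof(out) - z.avail_out) < 0)
				break;
		} while (status == Z_OK);
		inflateEnd(&z);
		return status == Z_STREAM_END ? 0 : -1;
	}

Memory use stays bounded by the two stack buffers regardless of the
object size, where get_data() grows with the object.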

Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c          |  41 +++++++-
 object-file.c                     | 149 +++++++++++++++++++++++++++---
 object-store.h                    |   9 ++
 t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
 4 files changed, 279 insertions(+), 12 deletions(-)
 create mode 100755 t/t5590-receive-unpack-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8ac77e60a8 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+static void fill_stream(struct git_zstream *stream)
+{
+	stream->next_in = fill(1);
+	stream->avail_in = len;
+}
+
+static void use_stream(struct git_zstream *stream)
+{
+	use(len - stream->avail_in);
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	struct git_zstream_reader reader;
+	struct object_id *oid = &obj_list[nr].oid;
+
+	reader.fill = &fill_stream;
+	reader.use = &use_stream;
+
+	if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
+				     oid, dry_run))
+		die("failed to write object in stream");
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index a8be899481..06c1693675 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
+			       int fd, unsigned char *compressed,
+			       int compressed_len, const void *buf,
+			       size_t len, int flush)
+{
+	int ret;
+
+	stream->next_in = (void *)buf;
+	stream->avail_in = len;
+	do {
+		unsigned char *in0 = stream->next_in;
+		ret = git_deflate(stream, flush);
+		the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+		if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream->next_out = compressed;
+		stream->avail_out = compressed_len;
+	} while (ret == Z_OK);
+
+	return ret;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime)
@@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
-	do {
-		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
-	} while (ret == Z_OK);
+	ret = write_object_buffer(&stream, &c, fd, compressed,
+				  sizeof(compressed), buf, len,
+				  Z_FINISH);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
@@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
 	return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
 }
 
+int write_stream_object_file(struct git_zstream_reader *reader,
+			     unsigned long len, const char *type,
+			     struct object_id *oid,
+			     int dry_run)
+{
+	git_zstream istream, ostream;
+	unsigned char buf[8192], compressed[4096];
+	char hdr[MAX_HEADER_LEN];
+	int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
+	int ret = 0;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+
+	/* Write tmpfile in objects dir, because oid is unknown */
+	if (!dry_run) {
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+		fd = create_tmpfile(&tmp_file, filename.buf);
+		if (fd < 0) {
+			if (errno == EACCES)
+				ret = error(_("insufficient permission for adding an object to repository database %s"),
+					get_object_directory());
+			else
+				ret = error_errno(_("unable to create temporary file"));
+			goto cleanup;
+		}
+	}
+
+	memset(&istream, 0, sizeof(istream));
+	istream.next_out = buf;
+	istream.avail_out = sizeof(buf);
+	git_inflate_init(&istream);
+
+	if (!dry_run) {
+		/* Set it up */
+		git_deflate_init(&ostream, zlib_compression_level);
+		ostream.next_out = compressed;
+		ostream.avail_out = sizeof(compressed);
+		the_hash_algo->init_fn(&c);
+
+		/* First header */
+		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
+				(uintmax_t)len) + 1;
+		ostream.next_in = (unsigned char *)hdr;
+		ostream.avail_in = hdrlen;
+		while (git_deflate(&ostream, 0) == Z_OK)
+			; /* nothing */
+		the_hash_algo->update_fn(&c, hdr, hdrlen);
+	}
+
+	/* Then the data itself */
+	do {
+		unsigned char *last_out = istream.next_out;
+		reader->fill(&istream);
+		istatus = git_inflate(&istream, 0);
+		if (istatus == Z_STREAM_END)
+			flush = Z_FINISH;
+		reader->use(&istream);
+		if (!dry_run)
+			ostatus = write_object_buffer(&ostream, &c, fd, compressed,
+						      sizeof(compressed), last_out,
+						      istream.next_out - last_out,
+						      flush);
+		istream.next_out = buf;
+		istream.avail_out = sizeof(buf);
+	} while (istatus == Z_OK);
+
+	if (istream.total_out != len || istatus != Z_STREAM_END)
+		die( _("inflate returned %d"), istatus);
+	git_inflate_end(&istream);
+
+	if (dry_run)
+		goto cleanup;
+
+	if (ostatus != Z_STREAM_END)
+		die(_("unable to deflate new object (%d)"), ostatus);
+	ostatus = git_deflate_end_gently(&ostream);
+	if (ostatus != Z_OK)
+		die(_("deflateEnd on object failed (%d)"), ostatus);
+	the_hash_algo->final_fn(oid->hash, &c);
+	close_loose_object(fd);
+
+	/* We get the oid now */
+	loose_object_path(the_repository, &filename, oid);
+
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		/*
+		 * Make sure the directory exists; note that the contents
+		 * of the buffer are undefined after mkstemp returns an
+		 * error, so we have to rewrite the whole buffer from
+		 * scratch.
+		 */
+		strbuf_add(&dir, filename.buf, dirlen - 1);
+		if (mkdir(dir.buf, 0777) && errno != EEXIST) {
+			unlink_or_warn(tmp_file.buf);
+			strbuf_release(&dir);
+			ret = -1;
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	ret = finalize_object_file(tmp_file.buf, filename.buf);
+
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return ret;
+}
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags)
diff --git a/object-store.h b/object-store.h
index d24915ced1..12b113ef93 100644
--- a/object-store.h
+++ b/object-store.h
@@ -33,6 +33,11 @@ struct object_directory {
 	char *path;
 };
 
+struct git_zstream_reader {
+	void (*fill)(struct git_zstream *);
+	void (*use)(struct git_zstream *);
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 int write_object_file(const void *buf, unsigned long len,
 		      const char *type, struct object_id *oid);
 
+int write_stream_object_file(struct git_zstream_reader *reader,
+			     unsigned long len, const char *type,
+			     struct object_id *oid, int dry_run);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
new file mode 100755
index 0000000000..7e63dfc0db
--- /dev/null
+++ b/t/t5590-receive-unpack-objects.sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	git repack -ad
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to push: cannot allocate' '
+	test_must_fail git push dest.git HEAD 2>err &&
+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git push dest.git HEAD &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	PACK=$(echo main | git pack-objects --progress --revs test) &&
+	unset GIT_ALLOC_LIMIT &&
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run with large threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 2m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_expect_success 'unpack-objects dry-run with small threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 1m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.33.0.1.g09a6bb964f.dirty



* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
@ 2021-10-19  7:37 ` Han Xin
  2021-10-20 14:42 ` Philip Oakley
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-10-19  7:37 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

Any suggestions?

Han Xin <chiyutianyi@gmail.com> wrote on Sat, Oct 9, 2021 at 4:21 PM:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", we will allocate memory for
> the whole size of the unpacked object and write the buffer to a loose
> file on disk. This may lead to OOM for the git-unpack-objects process
> when unpacking a very large object.
> [...]

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
  2021-10-19  7:37 ` Han Xin
@ 2021-10-20 14:42 ` Philip Oakley
  2021-10-21  3:42   ` Han Xin
  2021-11-03  1:48 ` Han Xin
                   ` (12 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Philip Oakley @ 2021-10-20 14:42 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

On 09/10/2021 09:20, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", we will allocate memory for
> the whole size of the unpacked object and write the buffer to a loose
> file on disk. This may lead to OOM for the git-unpack-objects process
> when unpacking a very large object.
>
> In the function "unpack_delta_entry()", we will also allocate memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack a large
> object to a file in a stream, and use the setting of
> "core.bigFileThreshold" as the threshold for large objects.
>
> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          |  41 +++++++-
>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>  object-store.h                    |   9 ++
>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>  4 files changed, 279 insertions(+), 12 deletions(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..8ac77e60a8 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>  	}
>  }
>  
> +static void fill_stream(struct git_zstream *stream)
> +{
> +	stream->next_in = fill(1);
> +	stream->avail_in = len;
> +}
> +
> +static void use_stream(struct git_zstream *stream)
> +{
> +	use(len - stream->avail_in);
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)

Can we use size_t for the `size`, and possibly `nr`, to improve
compatibility with Windows systems where unsigned long is only 32 bits?

There has been some work in the past on providing large file support on
Windows, which requires numerous long -> size_t changes.
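
A tiny demo of the hazard, with hypothetical numbers (on LLP64 Windows,
`unsigned long` stays 32 bits even in 64-bit builds):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long size = 5ULL << 30;	/* 5 GiB */
		unsigned long l = (unsigned long)size;	/* wraps to 1 GiB on LLP64 */
		size_t s = (size_t)size;		/* keeps 5 GiB on a 64-bit OS */

		printf("%llu -> unsigned long %lu, size_t %zu\n",
		       size, l, s);
		return 0;
	}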

Philip
> [...]

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-20 14:42 ` Philip Oakley
@ 2021-10-21  3:42   ` Han Xin
  2021-10-21 22:47     ` Philip Oakley
  0 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-10-21  3:42 UTC (permalink / raw)
  To: Philip Oakley; +Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Han Xin

Philip Oakley <philipoakley@iee.email> wrote on Wed, Oct 20, 2021 at 10:43 PM:
>
> On 09/10/2021 09:20, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When calling "unpack_non_delta_entry()", we will allocate memory for
> > the whole size of the unpacked object and write the buffer to a loose
> > file on disk. This may lead to OOM for the git-unpack-objects process
> > when unpacking a very large object.
> >
> > In the function "unpack_delta_entry()", we will also allocate memory to
> > buffer the whole delta, but since there will be no delta for an object
> > larger than "core.bigFileThreshold", this issue is moderate.
> >
> > To resolve the OOM issue in "git-unpack-objects", we can unpack a large
> > object to a file in a stream, and use the setting of
> > "core.bigFileThreshold" as the threshold for large objects.
> >
> > Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  builtin/unpack-objects.c          |  41 +++++++-
> >  object-file.c                     | 149 +++++++++++++++++++++++++++---
> >  object-store.h                    |   9 ++
> >  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
> >  4 files changed, 279 insertions(+), 12 deletions(-)
> >  create mode 100755 t/t5590-receive-unpack-objects.sh
> >
> > diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> > index 4a9466295b..8ac77e60a8 100644
> > --- a/builtin/unpack-objects.c
> > +++ b/builtin/unpack-objects.c
> > @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
> >       }
> >  }
> >
> > +static void fill_stream(struct git_zstream *stream)
> > +{
> > +     stream->next_in = fill(1);
> > +     stream->avail_in = len;
> > +}
> > +
> > +static void use_stream(struct git_zstream *stream)
> > +{
> > +     use(len - stream->avail_in);
> > +}
> > +
> > +static void write_stream_blob(unsigned nr, unsigned long size)
>
> Can we use size_t for the `size`, and possibly `nr`, to improve
> compatibility with Windows systems where unsigned long is only 32 bits?
>
> There has been some work in the past on providing large file support on
> Windows, which requires numerous long -> size_t changes.
>
> Philip

Thanks for your review. I'm not sure if I should make this change in
this patch; it would also change the types defined in `unpack_one()`,
`unpack_non_delta_entry()`, `write_object()` and many others.
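
To give a feel for the scope, a sketch (hypothetical, not part of any
patch) of the signatures that would have to move together:

	/* hypothetical unsigned long -> size_t conversion */
	static void write_stream_blob(unsigned nr, size_t size);
	static void unpack_non_delta_entry(enum object_type type, size_t size,
					   unsigned nr);
	static void write_object(unsigned nr, enum object_type type,
				 void *buf, size_t size);
	int write_stream_object_file(struct git_zstream_reader *reader,
				     size_t len, const char *type,
				     struct object_id *oid, int dry_run);

and from there into get_data() and the object-file.c internals.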

> > [...]

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-21  3:42   ` Han Xin
@ 2021-10-21 22:47     ` Philip Oakley
  0 siblings, 0 replies; 211+ messages in thread
From: Philip Oakley @ 2021-10-21 22:47 UTC (permalink / raw)
  To: Han Xin; +Cc: Han Xin, Jiang Xin, Git List

On 21/10/2021 04:42, Han Xin wrote:
>>> +static void write_stream_blob(unsigned nr, unsigned long size)
>> Can we use size_t for the `size`, and possibly `nr`, to improve
>> compatibility with Windows systems where unsigned long is only 32 bits?
>>
>> There has been some work in the past on providing large file support on
>> Windows, which requires numerous long -> size_t changes.
>>
>> Philip
> Thanks for your review. I'm not sure if I should make this change in
> this patch; it would also change the types defined in `unpack_one()`,
> `unpack_non_delta_entry()`, `write_object()` and many others.
>
I was mainly raising the issue regarding the 4GB (sometimes 2GB)
limitations on Windows, which have been a problem for many years.

I had been thinking of not changing `nr` (the number-of-objects limit),
as 2G objects is hopefully already sufficient, even for the largest of
repos (though IIUC their index file size did break the 32-bit size limit).

Staying with the existing types won't make the situation any worse, so
from that perspective the change isn't needed.
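
For illustration, a minimal sketch of the truncation hazard on an LLP64
system (a hypothetical standalone example, not code from this patch):

    #include <stdio.h>

    int main(void)
    {
        /* On LLP64 (64-bit Windows), sizeof(unsigned long) == 4 while
         * sizeof(size_t) == 8, so sizes above 4GB wrap on conversion. */
        size_t real_size = 5368709120u; /* a 5 GB object */
        unsigned long truncated = (unsigned long)real_size;
        printf("%zu -> %lu\n", real_size, truncated);
        /* prints "5368709120 -> 1073741824" */
        return 0;
    }
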
--
Philip

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
  2021-10-19  7:37 ` Han Xin
  2021-10-20 14:42 ` Philip Oakley
@ 2021-11-03  1:48 ` Han Xin
  2021-11-03 10:07   ` Philip Oakley
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
                   ` (11 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-03  1:48 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

Any more suggestions?

On Sat, Oct 9, 2021 at 4:21 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", we will allocate memory for
> the whole size of the unpacked object and write the buffer to a loose
> file on disk. This may lead to OOM for the git-unpack-objects process
> when unpacking a very large object.
>
> The function "unpack_delta_entry()" will also allocate memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack a large
> object to a file in a streaming fashion, using the setting of
> "core.bigFileThreshold" as the threshold for large objects.
>
> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          |  41 +++++++-
>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>  object-store.h                    |   9 ++
>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>  4 files changed, 279 insertions(+), 12 deletions(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..8ac77e60a8 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>         }
>  }
>
> +static void fill_stream(struct git_zstream *stream)
> +{
> +       stream->next_in = fill(1);
> +       stream->avail_in = len;
> +}
> +
> +static void use_stream(struct git_zstream *stream)
> +{
> +       use(len - stream->avail_in);
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +       struct git_zstream_reader reader;
> +       struct object_id *oid = &obj_list[nr].oid;
> +
> +       reader.fill = &fill_stream;
> +       reader.use = &use_stream;
> +
> +       if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
> +                                    oid, dry_run))
> +               die("failed to write object in stream");
> +       if (strict && !dry_run) {
> +               struct blob *blob = lookup_blob(the_repository, oid);
> +               if (blob)
> +                       blob->object.flags |= FLAG_WRITTEN;
> +               else
> +                       die("invalid blob object from stream");
> +       }
> +       obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>                                    unsigned nr)
>  {
> -       void *buf = get_data(size);
> +       void *buf;
> +
> +       /* Write large blob in stream without allocating full buffer. */
> +       if (type == OBJ_BLOB && size > big_file_threshold) {
> +               write_stream_blob(nr, size);
> +               return;
> +       }
>
> +       buf = get_data(size);
>         if (!dry_run && buf)
>                 write_object(nr, type, buf, size);
>         else
> diff --git a/object-file.c b/object-file.c
> index a8be899481..06c1693675 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>         return fd;
>  }
>
> +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
> +                              int fd, unsigned char *compressed,
> +                              int compressed_len, const void *buf,
> +                              size_t len, int flush)
> +{
> +       int ret;
> +
> +       stream->next_in = (void *)buf;
> +       stream->avail_in = len;
> +       do {
> +               unsigned char *in0 = stream->next_in;
> +               ret = git_deflate(stream, flush);
> +               the_hash_algo->update_fn(c, in0, stream->next_in - in0);
> +               if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
> +                       die(_("unable to write loose object file"));
> +               stream->next_out = compressed;
> +               stream->avail_out = compressed_len;
> +       } while (ret == Z_OK);
> +
> +       return ret;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>                               int hdrlen, const void *buf, unsigned long len,
>                               time_t mtime)
> @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> -       stream.next_in = (void *)buf;
> -       stream.avail_in = len;
> -       do {
> -               unsigned char *in0 = stream.next_in;
> -               ret = git_deflate(&stream, Z_FINISH);
> -               the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> -                       die(_("unable to write loose object file"));
> -               stream.next_out = compressed;
> -               stream.avail_out = sizeof(compressed);
> -       } while (ret == Z_OK);
> +       ret = write_object_buffer(&stream, &c, fd, compressed,
> +                                 sizeof(compressed), buf, len,
> +                                 Z_FINISH);
>
>         if (ret != Z_STREAM_END)
>                 die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
>         return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
>  }
>
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +                            unsigned long len, const char *type,
> +                            struct object_id *oid,
> +                            int dry_run)
> +{
> +       git_zstream istream, ostream;
> +       unsigned char buf[8192], compressed[4096];
> +       char hdr[MAX_HEADER_LEN];
> +       int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
> +       int ret = 0;
> +       git_hash_ctx c;
> +       struct strbuf tmp_file = STRBUF_INIT;
> +       struct strbuf filename = STRBUF_INIT;
> +
> +       /* Write tmpfile in objects dir, because oid is unknown */
> +       if (!dry_run) {
> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
> +               strbuf_addch(&filename, '/');
> +               fd = create_tmpfile(&tmp_file, filename.buf);
> +               if (fd < 0) {
> +                       if (errno == EACCES)
> +                               ret = error(_("insufficient permission for adding an object to repository database %s"),
> +                                       get_object_directory());
> +                       else
> +                               ret = error_errno(_("unable to create temporary file"));
> +                       goto cleanup;
> +               }
> +       }
> +
> +       memset(&istream, 0, sizeof(istream));
> +       istream.next_out = buf;
> +       istream.avail_out = sizeof(buf);
> +       git_inflate_init(&istream);
> +
> +       if (!dry_run) {
> +               /* Set it up */
> +               git_deflate_init(&ostream, zlib_compression_level);
> +               ostream.next_out = compressed;
> +               ostream.avail_out = sizeof(compressed);
> +               the_hash_algo->init_fn(&c);
> +
> +               /* First header */
> +               hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
> +                               (uintmax_t)len) + 1;
> +               ostream.next_in = (unsigned char *)hdr;
> +               ostream.avail_in = hdrlen;
> +               while (git_deflate(&ostream, 0) == Z_OK)
> +                       ; /* nothing */
> +               the_hash_algo->update_fn(&c, hdr, hdrlen);
> +       }
> +
> +       /* Then the data itself */
> +       do {
> +               unsigned char *last_out = istream.next_out;
> +               reader->fill(&istream);
> +               istatus = git_inflate(&istream, 0);
> +               if (istatus == Z_STREAM_END)
> +                       flush = Z_FINISH;
> +               reader->use(&istream);
> +               if (!dry_run)
> +                       ostatus = write_object_buffer(&ostream, &c, fd, compressed,
> +                                                     sizeof(compressed), last_out,
> +                                                     istream.next_out - last_out,
> +                                                     flush);
> +               istream.next_out = buf;
> +               istream.avail_out = sizeof(buf);
> +       } while (istatus == Z_OK);
> +
> +       if (istream.total_out != len || istatus != Z_STREAM_END)
> +               die( _("inflate returned %d"), istatus);
> +       git_inflate_end(&istream);
> +
> +       if (dry_run)
> +               goto cleanup;
> +
> +       if (ostatus != Z_STREAM_END)
> +               die(_("unable to deflate new object (%d)"), ostatus);
> +       ostatus = git_deflate_end_gently(&ostream);
> +       if (ostatus != Z_OK)
> +               die(_("deflateEnd on object failed (%d)"), ostatus);
> +       the_hash_algo->final_fn(oid->hash, &c);
> +       close_loose_object(fd);
> +
> +       /* We get the oid now */
> +       loose_object_path(the_repository, &filename, oid);
> +
> +       dirlen = directory_size(filename.buf);
> +       if (dirlen) {
> +               struct strbuf dir = STRBUF_INIT;
> +               /*
> +                * Make sure the directory exists; note that the contents
> +                * of the buffer are undefined after mkstemp returns an
> +                * error, so we have to rewrite the whole buffer from
> +                * scratch.
> +                */
> +               strbuf_add(&dir, filename.buf, dirlen - 1);
> +               if (mkdir(dir.buf, 0777) && errno != EEXIST) {
> +                       unlink_or_warn(tmp_file.buf);
> +                       strbuf_release(&dir);
> +                       ret = -1;
> +                       goto cleanup;
> +               }
> +               strbuf_release(&dir);
> +       }
> +
> +       ret = finalize_object_file(tmp_file.buf, filename.buf);
> +
> +cleanup:
> +       strbuf_release(&tmp_file);
> +       strbuf_release(&filename);
> +       return ret;
> +}
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>                                const char *type, struct object_id *oid,
>                                unsigned flags)
> diff --git a/object-store.h b/object-store.h
> index d24915ced1..12b113ef93 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -33,6 +33,11 @@ struct object_directory {
>         char *path;
>  };
>
> +struct git_zstream_reader {
> +       void (*fill)(struct git_zstream *);
> +       void (*use)(struct git_zstream *);
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>         struct object_directory *, 1, fspathhash, fspatheq)
>
> @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
>  int write_object_file(const void *buf, unsigned long len,
>                       const char *type, struct object_id *oid);
>
> +int write_stream_object_file(struct git_zstream_reader *reader,
> +                            unsigned long len, const char *type,
> +                            struct object_id *oid, int dry_run);
> +
>  int hash_object_file_literally(const void *buf, unsigned long len,
>                                const char *type, struct object_id *oid,
>                                unsigned flags);
> diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
> new file mode 100755
> index 0000000000..7e63dfc0db
> --- /dev/null
> +++ b/t/t5590-receive-unpack-objects.sh
> @@ -0,0 +1,92 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects when receive pack'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +test_expect_success "create commit with big blobs (1.5 MB)" '
> +       test-tool genrandom foo 1500000 >big-blob &&
> +       test_commit --append foo big-blob &&
> +       test-tool genrandom bar 1500000 >big-blob &&
> +       test_commit --append bar big-blob &&
> +       (
> +               cd .git &&
> +               find objects/?? -type f | sort
> +       ) >expect &&
> +       git repack -ad
> +'
> +
> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> +       GIT_ALLOC_LIMIT=1m &&
> +       export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'prepare dest repository' '
> +       git init --bare dest.git &&
> +       git -C dest.git config core.bigFileThreshold 2m &&
> +       git -C dest.git config receive.unpacklimit 100
> +'
> +
> +test_expect_success 'fail to push: cannot allocate' '
> +       test_must_fail git push dest.git HEAD 2>err &&
> +       test_i18ngrep "remote: fatal: attempting to allocate" err &&
> +       (
> +               cd dest.git &&
> +               find objects/?? -type f | sort
> +       ) >actual &&
> +       ! test_cmp expect actual
> +'
> +
> +test_expect_success 'set a lower bigfile threshold' '
> +       git -C dest.git config core.bigFileThreshold 1m
> +'
> +
> +test_expect_success 'unpack big object in stream' '
> +       git push dest.git HEAD &&
> +       git -C dest.git fsck &&
> +       (
> +               cd dest.git &&
> +               find objects/?? -type f | sort
> +       ) >actual &&
> +       test_cmp expect actual
> +'
> +
> +test_expect_success 'setup for unpack-objects dry-run test' '
> +       PACK=$(echo main | git pack-objects --progress --revs test) &&
> +       unset GIT_ALLOC_LIMIT &&
> +       git init --bare unpack-test.git
> +'
> +
> +test_expect_success 'unpack-objects dry-run with large threshold' '
> +       (
> +               cd unpack-test.git &&
> +               git config core.bigFileThreshold 2m &&
> +               git unpack-objects -n <../test-$PACK.pack
> +       ) &&
> +       (
> +               cd unpack-test.git &&
> +               find objects/ -type f
> +       ) >actual &&
> +       test_must_be_empty actual
> +'
> +
> +test_expect_success 'unpack-objects dry-run with small threshold' '
> +       (
> +               cd unpack-test.git &&
> +               git config core.bigFileThreshold 1m &&
> +               git unpack-objects -n <../test-$PACK.pack
> +       ) &&
> +       (
> +               cd unpack-test.git &&
> +               find objects/ -type f
> +       ) >actual &&
> +       test_must_be_empty actual
> +'
> +
> +test_done
> --
> 2.33.0.1.g09a6bb964f.dirty
>

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH] unpack-objects: unpack large object in stream
  2021-11-03  1:48 ` Han Xin
@ 2021-11-03 10:07   ` Philip Oakley
  0 siblings, 0 replies; 211+ messages in thread
From: Philip Oakley @ 2021-11-03 10:07 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin; +Cc: Han Xin

(replies to the alibaba-inc.com address aren't getting through for me)

On 03/11/2021 01:48, Han Xin wrote:
> Any more suggestions?
>
On Sat, Oct 9, 2021 at 4:21 PM Han Xin <chiyutianyi@gmail.com> wrote:
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>
>> When calling "unpack_non_delta_entry()", we will allocate memory for
>> the whole size of the unpacked object and write the buffer to a loose
>> file on disk. This may lead to OOM for the git-unpack-objects process
>> when unpacking a very large object.

Is it possible to split the patch into smaller pieces, taking each item
separately?

For large files (as above), it should be possible to stream the
unpacking directly to disk, in the same way that the zlib reading is
chunked. However, having the same code in two places would need to be
addressed (the DRY principle).
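
Roughly the shape I have in mind, as a sketch only (it reuses the
fill()/use() helpers and the global "len" from builtin/unpack-objects.c;
error handling and the actual write-out are elided):

    unsigned char out[8192];
    git_zstream zs;
    int status;

    memset(&zs, 0, sizeof(zs));
    git_inflate_init(&zs);
    do {
        zs.next_out = out;
        zs.avail_out = sizeof(out);
        zs.next_in = fill(1);   /* refill the input window */
        zs.avail_in = len;
        status = git_inflate(&zs, 0);
        use(len - zs.avail_in); /* consume what inflate actually read */
        /* write (sizeof(out) - zs.avail_out) bytes to the tmpfile here */
    } while (status == Z_OK);
    git_inflate_end(&zs);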

At the moment, on LLP64 systems (Windows), there is already a long
(32-bit) vs size_t (64-bit) problem in the zlib stream code, and the
size_t problem then permeates the wider codebase.

The normal Git file operations do tend to memory-map whole files, but
here it looks like you can bypass that.
>>
>> The function "unpack_delta_entry()" will also allocate memory to
>> buffer the whole delta, but since there will be no delta for an object
>> larger than "core.bigFileThreshold", this issue is moderate.

What does 'moderate' mean here? Does it mean there is a simple test that
allows you to sidestep the whole problem?

>>
>> To resolve the OOM issue in "git-unpack-objects", we can unpack a large
>> object to a file in a streaming fashion, using the setting of
>> "core.bigFileThreshold" as the threshold for large objects.

Is this "core.bigFileThreshold" the core element? If so, it is too far
down the commit message. The readers have already (potentially) misread
the message and reacted too soon.  Perhaps: "use `core.bigFileThreshold`
to avoid mmap OOM limits when unpacking".

--
Philip
>>
>> Reviewed-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>> ---
>>  builtin/unpack-objects.c          |  41 +++++++-
>>  object-file.c                     | 149 +++++++++++++++++++++++++++---
>>  object-store.h                    |   9 ++
>>  t/t5590-receive-unpack-objects.sh |  92 ++++++++++++++++++
>>  4 files changed, 279 insertions(+), 12 deletions(-)
>>  create mode 100755 t/t5590-receive-unpack-objects.sh
>>
>> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>> index 4a9466295b..8ac77e60a8 100644
>> --- a/builtin/unpack-objects.c
>> +++ b/builtin/unpack-objects.c
>> @@ -320,11 +320,50 @@ static void added_object(unsigned nr, enum object_type type,
>>         }
>>  }
>>
>> +static void fill_stream(struct git_zstream *stream)
>> +{
>> +       stream->next_in = fill(1);
>> +       stream->avail_in = len;
>> +}
>> +
>> +static void use_stream(struct git_zstream *stream)
>> +{
>> +       use(len - stream->avail_in);
>> +}
>> +
>> +static void write_stream_blob(unsigned nr, unsigned long size)
>> +{
>> +       struct git_zstream_reader reader;
>> +       struct object_id *oid = &obj_list[nr].oid;
>> +
>> +       reader.fill = &fill_stream;
>> +       reader.use = &use_stream;
>> +
>> +       if (write_stream_object_file(&reader, size, type_name(OBJ_BLOB),
>> +                                    oid, dry_run))
>> +               die("failed to write object in stream");
>> +       if (strict && !dry_run) {
>> +               struct blob *blob = lookup_blob(the_repository, oid);
>> +               if (blob)
>> +                       blob->object.flags |= FLAG_WRITTEN;
>> +               else
>> +                       die("invalid blob object from stream");
>> +       }
>> +       obj_list[nr].obj = NULL;
>> +}
>> +
>>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>>                                    unsigned nr)
>>  {
>> -       void *buf = get_data(size);
>> +       void *buf;
>> +
>> +       /* Write large blob in stream without allocating full buffer. */
>> +       if (type == OBJ_BLOB && size > big_file_threshold) {
>> +               write_stream_blob(nr, size);
>> +               return;
>> +       }
>>
>> +       buf = get_data(size);
>>         if (!dry_run && buf)
>>                 write_object(nr, type, buf, size);
>>         else
>> diff --git a/object-file.c b/object-file.c
>> index a8be899481..06c1693675 100644
>> --- a/object-file.c
>> +++ b/object-file.c
>> @@ -1913,6 +1913,28 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>>         return fd;
>>  }
>>
>> +static int write_object_buffer(struct git_zstream *stream, git_hash_ctx *c,
>> +                              int fd, unsigned char *compressed,
>> +                              int compressed_len, const void *buf,
>> +                              size_t len, int flush)
>> +{
>> +       int ret;
>> +
>> +       stream->next_in = (void *)buf;
>> +       stream->avail_in = len;
>> +       do {
>> +               unsigned char *in0 = stream->next_in;
>> +               ret = git_deflate(stream, flush);
>> +               the_hash_algo->update_fn(c, in0, stream->next_in - in0);
>> +               if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
>> +                       die(_("unable to write loose object file"));
>> +               stream->next_out = compressed;
>> +               stream->avail_out = compressed_len;
>> +       } while (ret == Z_OK);
>> +
>> +       return ret;
>> +}
>> +
>>  static int write_loose_object(const struct object_id *oid, char *hdr,
>>                               int hdrlen, const void *buf, unsigned long len,
>>                               time_t mtime)
>> @@ -1949,17 +1971,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>>
>>         /* Then the data itself.. */
>> -       stream.next_in = (void *)buf;
>> -       stream.avail_in = len;
>> -       do {
>> -               unsigned char *in0 = stream.next_in;
>> -               ret = git_deflate(&stream, Z_FINISH);
>> -               the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
>> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>> -                       die(_("unable to write loose object file"));
>> -               stream.next_out = compressed;
>> -               stream.avail_out = sizeof(compressed);
>> -       } while (ret == Z_OK);
>> +       ret = write_object_buffer(&stream, &c, fd, compressed,
>> +                                 sizeof(compressed), buf, len,
>> +                                 Z_FINISH);
>>
>>         if (ret != Z_STREAM_END)
>>                 die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
>> @@ -2020,6 +2034,119 @@ int write_object_file(const void *buf, unsigned long len, const char *type,
>>         return write_loose_object(oid, hdr, hdrlen, buf, len, 0);
>>  }
>>
>> +int write_stream_object_file(struct git_zstream_reader *reader,
>> +                            unsigned long len, const char *type,
>> +                            struct object_id *oid,
>> +                            int dry_run)
>> +{
>> +       git_zstream istream, ostream;
>> +       unsigned char buf[8192], compressed[4096];
>> +       char hdr[MAX_HEADER_LEN];
>> +       int istatus, ostatus, fd = 0, hdrlen, dirlen, flush = 0;
>> +       int ret = 0;
>> +       git_hash_ctx c;
>> +       struct strbuf tmp_file = STRBUF_INIT;
>> +       struct strbuf filename = STRBUF_INIT;
>> +
>> +       /* Write tmpfile in objects dir, because oid is unknown */
>> +       if (!dry_run) {
>> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
>> +               strbuf_addch(&filename, '/');
>> +               fd = create_tmpfile(&tmp_file, filename.buf);
>> +               if (fd < 0) {
>> +                       if (errno == EACCES)
>> +                               ret = error(_("insufficient permission for adding an object to repository database %s"),
>> +                                       get_object_directory());
>> +                       else
>> +                               ret = error_errno(_("unable to create temporary file"));
>> +                       goto cleanup;
>> +               }
>> +       }
>> +
>> +       memset(&istream, 0, sizeof(istream));
>> +       istream.next_out = buf;
>> +       istream.avail_out = sizeof(buf);
>> +       git_inflate_init(&istream);
>> +
>> +       if (!dry_run) {
>> +               /* Set it up */
>> +               git_deflate_init(&ostream, zlib_compression_level);
>> +               ostream.next_out = compressed;
>> +               ostream.avail_out = sizeof(compressed);
>> +               the_hash_algo->init_fn(&c);
>> +
>> +               /* First header */
>> +               hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %" PRIuMAX, type,
>> +                               (uintmax_t)len) + 1;
>> +               ostream.next_in = (unsigned char *)hdr;
>> +               ostream.avail_in = hdrlen;
>> +               while (git_deflate(&ostream, 0) == Z_OK)
>> +                       ; /* nothing */
>> +               the_hash_algo->update_fn(&c, hdr, hdrlen);
>> +       }
>> +
>> +       /* Then the data itself */
>> +       do {
>> +               unsigned char *last_out = istream.next_out;
>> +               reader->fill(&istream);
>> +               istatus = git_inflate(&istream, 0);
>> +               if (istatus == Z_STREAM_END)
>> +                       flush = Z_FINISH;
>> +               reader->use(&istream);
>> +               if (!dry_run)
>> +                       ostatus = write_object_buffer(&ostream, &c, fd, compressed,
>> +                                                     sizeof(compressed), last_out,
>> +                                                     istream.next_out - last_out,
>> +                                                     flush);
>> +               istream.next_out = buf;
>> +               istream.avail_out = sizeof(buf);
>> +       } while (istatus == Z_OK);
>> +
>> +       if (istream.total_out != len || istatus != Z_STREAM_END)
>> +               die( _("inflate returned %d"), istatus);
>> +       git_inflate_end(&istream);
>> +
>> +       if (dry_run)
>> +               goto cleanup;
>> +
>> +       if (ostatus != Z_STREAM_END)
>> +               die(_("unable to deflate new object (%d)"), ostatus);
>> +       ostatus = git_deflate_end_gently(&ostream);
>> +       if (ostatus != Z_OK)
>> +               die(_("deflateEnd on object failed (%d)"), ostatus);
>> +       the_hash_algo->final_fn(oid->hash, &c);
>> +       close_loose_object(fd);
>> +
>> +       /* We get the oid now */
>> +       loose_object_path(the_repository, &filename, oid);
>> +
>> +       dirlen = directory_size(filename.buf);
>> +       if (dirlen) {
>> +               struct strbuf dir = STRBUF_INIT;
>> +               /*
>> +                * Make sure the directory exists; note that the contents
>> +                * of the buffer are undefined after mkstemp returns an
>> +                * error, so we have to rewrite the whole buffer from
>> +                * scratch.
>> +                */
>> +               strbuf_add(&dir, filename.buf, dirlen - 1);
>> +               if (mkdir(dir.buf, 0777) && errno != EEXIST) {
>> +                       unlink_or_warn(tmp_file.buf);
>> +                       strbuf_release(&dir);
>> +                       ret = -1;
>> +                       goto cleanup;
>> +               }
>> +               strbuf_release(&dir);
>> +       }
>> +
>> +       ret = finalize_object_file(tmp_file.buf, filename.buf);
>> +
>> +cleanup:
>> +       strbuf_release(&tmp_file);
>> +       strbuf_release(&filename);
>> +       return ret;
>> +}
>> +
>>  int hash_object_file_literally(const void *buf, unsigned long len,
>>                                const char *type, struct object_id *oid,
>>                                unsigned flags)
>> diff --git a/object-store.h b/object-store.h
>> index d24915ced1..12b113ef93 100644
>> --- a/object-store.h
>> +++ b/object-store.h
>> @@ -33,6 +33,11 @@ struct object_directory {
>>         char *path;
>>  };
>>
>> +struct git_zstream_reader {
>> +       void (*fill)(struct git_zstream *);
>> +       void (*use)(struct git_zstream *);
>> +};
>> +
>>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>>         struct object_directory *, 1, fspathhash, fspatheq)
>>
>> @@ -225,6 +230,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
>>  int write_object_file(const void *buf, unsigned long len,
>>                       const char *type, struct object_id *oid);
>>
>> +int write_stream_object_file(struct git_zstream_reader *reader,
>> +                            unsigned long len, const char *type,
>> +                            struct object_id *oid, int dry_run);
>> +
>>  int hash_object_file_literally(const void *buf, unsigned long len,
>>                                const char *type, struct object_id *oid,
>>                                unsigned flags);
>> diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
>> new file mode 100755
>> index 0000000000..7e63dfc0db
>> --- /dev/null
>> +++ b/t/t5590-receive-unpack-objects.sh
>> @@ -0,0 +1,92 @@
>> +#!/bin/sh
>> +#
>> +# Copyright (c) 2021 Han Xin
>> +#
>> +
>> +test_description='Test unpack-objects when receive pack'
>> +
>> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
>> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
>> +
>> +. ./test-lib.sh
>> +
>> +test_expect_success "create commit with big blobs (1.5 MB)" '
>> +       test-tool genrandom foo 1500000 >big-blob &&
>> +       test_commit --append foo big-blob &&
>> +       test-tool genrandom bar 1500000 >big-blob &&
>> +       test_commit --append bar big-blob &&
>> +       (
>> +               cd .git &&
>> +               find objects/?? -type f | sort
>> +       ) >expect &&
>> +       git repack -ad
>> +'
>> +
>> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
>> +       GIT_ALLOC_LIMIT=1m &&
>> +       export GIT_ALLOC_LIMIT
>> +'
>> +
>> +test_expect_success 'prepare dest repository' '
>> +       git init --bare dest.git &&
>> +       git -C dest.git config core.bigFileThreshold 2m &&
>> +       git -C dest.git config receive.unpacklimit 100
>> +'
>> +
>> +test_expect_success 'fail to push: cannot allocate' '
>> +       test_must_fail git push dest.git HEAD 2>err &&
>> +       test_i18ngrep "remote: fatal: attempting to allocate" err &&
>> +       (
>> +               cd dest.git &&
>> +               find objects/?? -type f | sort
>> +       ) >actual &&
>> +       ! test_cmp expect actual
>> +'
>> +
>> +test_expect_success 'set a lower bigfile threshold' '
>> +       git -C dest.git config core.bigFileThreshold 1m
>> +'
>> +
>> +test_expect_success 'unpack big object in stream' '
>> +       git push dest.git HEAD &&
>> +       git -C dest.git fsck &&
>> +       (
>> +               cd dest.git &&
>> +               find objects/?? -type f | sort
>> +       ) >actual &&
>> +       test_cmp expect actual
>> +'
>> +
>> +test_expect_success 'setup for unpack-objects dry-run test' '
>> +       PACK=$(echo main | git pack-objects --progress --revs test) &&
>> +       unset GIT_ALLOC_LIMIT &&
>> +       git init --bare unpack-test.git
>> +'
>> +
>> +test_expect_success 'unpack-objects dry-run with large threshold' '
>> +       (
>> +               cd unpack-test.git &&
>> +               git config core.bigFileThreshold 2m &&
>> +               git unpack-objects -n <../test-$PACK.pack
>> +       ) &&
>> +       (
>> +               cd unpack-test.git &&
>> +               find objects/ -type f
>> +       ) >actual &&
>> +       test_must_be_empty actual
>> +'
>> +
>> +test_expect_success 'unpack-objects dry-run with small threshold' '
>> +       (
>> +               cd unpack-test.git &&
>> +               git config core.bigFileThreshold 1m &&
>> +               git unpack-objects -n <../test-$PACK.pack
>> +       ) &&
>> +       (
>> +               cd unpack-test.git &&
>> +               find objects/ -type f
>> +       ) >actual &&
>> +       test_must_be_empty actual
>> +'
>> +
>> +test_done
>> --
>> 2.33.0.1.g09a6bb964f.dirty
>>


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (2 preceding siblings ...)
  2021-11-03  1:48 ` Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  4:59   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
                   ` (10 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Refactor write_loose_object() to support an input stream, in the same
way that zlib reading is chunked.

Using "in_stream" instead of "void *buf", we need not allocate all the
memory in advance; only part of the contents is read on each call to
"in_stream.read()".

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  5 +++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index 02b7970274..1ad2cb579c 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct input_data_from_buffer {
+	const char *buf;
+	unsigned long len;
+};
+
+static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
+{
+	struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
+
+	if (input->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = input->len;
+	input->len = 0;
+	return input->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const char *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream->data, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = (void *)&(struct input_data_from_buffer) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = (void *)&(struct input_data_from_buffer) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct input_data_from_buffer data;
+	struct input_stream in_stream = {
+		.read = read_input_stream_from_buffer,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..f1b67e9100 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const char *(*read)(void* data, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (3 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:42   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
                   ` (9 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We will use "write_loose_object()" later to handle large blob object,
which needs to work in dry_run mode.
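
For reference, the dry-run mode corresponds to "git unpack-objects -n",
which checks a pack without writing any loose objects, e.g. with an
arbitrary pack file:

    $ git unpack-objects -n <test.pack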

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/object-file.c b/object-file.c
index 1ad2cb579c..b0838c847e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1880,9 +1880,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
 
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, struct input_stream *in_stream,
+			      int dry_run,
 			      time_t mtime, unsigned flags)
 {
-	int fd, ret;
+	int fd, ret = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1894,14 +1895,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
+	if (!dry_run) {
+		fd = create_tmpfile(&tmp_file, filename.buf);
+		if (fd < 0) {
+			if (flags & HASH_SILENT)
+				return -1;
+			else if (errno == EACCES)
+				return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			else
+				return error_errno(_("unable to create temporary file"));
+		}
 	}
 
 	/* Set it up */
@@ -1925,7 +1928,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		unsigned char *in0 = stream.next_in;
 		ret = git_deflate(&stream, Z_FINISH);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
 		stream.next_out = compressed;
 		stream.avail_out = sizeof(compressed);
@@ -1943,6 +1946,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
+	if (dry_run)
+		return 0;
+
 	close_loose_object(fd);
 
 	if (mtime) {
@@ -1996,7 +2002,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -2023,7 +2029,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0, 0);
 
 cleanup:
 	free(header);
@@ -2052,7 +2058,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	data.buf = buf;
 	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, mtime, 0);
 	free(buf);
 
 	return ret;
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (4 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:49   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
                   ` (8 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When reading from an input stream, the oid cannot be computed until all
of the data has been read, so it is filled in afterwards.
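
In sketch form, the calling convention this enables (using the
write_loose_object() signature after patch 2/6, error handling
simplified):

    struct object_id oid = *null_oid(); /* unknown until stream is drained */

    if (write_loose_object(&oid, hdr, hdrlen, &in_stream, 0, 0, 0))
        die("failed to write object");
    /* "oid" now holds the hash computed while streaming */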

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index b0838c847e..8393659f0d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1893,7 +1893,13 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const char *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else
+		loose_object_path(the_repository, &filename, oid);
 
 	if (!dry_run) {
 		fd = create_tmpfile(&tmp_file, filename.buf);
@@ -1942,7 +1948,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
@@ -1951,6 +1957,30 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		/* copy oid */
+		oidcpy((struct object_id *)oid, &parano_oid);
+		/* We get the oid now */
+		loose_object_path(the_repository, &filename, oid);
+
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			/*
+			 * Make sure the directory exists; note that the
+			 * contents of the buffer are undefined after mkstemp
+			 * returns an error, so we have to rewrite the whole
+			 * buffer from scratch.
+			 */
+			strbuf_reset(&dir);
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v2 4/6] object-file.c: read input stream repeatedly in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (5 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  5:56   ` Jiang Xin
  2021-11-12  9:40 ` [PATCH v2 5/6] object-store.h: add write_loose_object() Han Xin
                   ` (7 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Read the input stream repeatedly in write_loose_object() until it
reaches the end, so that we can divide a large blob write into many
small blocks.
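
The resulting loop shape, simplified from the hunk below:

    int flush = 0;
    do {
        if (!stream.avail_in) {
            buf = in_stream->read(in_stream->data, &stream.avail_in);
            if (buf)
                stream.next_in = (void *)buf;
            else
                flush = Z_FINISH; /* the input stream is drained */
        }
        ret = git_deflate(&stream, flush);
        /* ...update the hash and write out the compressed bytes... */
    } while (ret == Z_OK);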

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/object-file.c b/object-file.c
index 8393659f0d..e333448c54 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1891,7 +1891,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const char *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1927,12 +1927,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream->data, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {
+				stream.next_in = (void *)buf;
+				in0 = (unsigned char *)buf;
+			} else
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v2 5/6] object-store.h: add write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (6 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

For large loose object files, it should be possible to stream them
directly to disk with "write_loose_object()". Unlike with
"write_object_file()", the caller needs to implement an "input_stream"
instead of passing a "void *buf".
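
A minimal sketch of such a caller, feeding from an in-memory buffer
(hypothetical names; this mirrors the read_input_stream_from_buffer()
helper from patch 1/6):

    struct my_data { const char *buf; unsigned long len; };

    static const char *my_read(void *data, unsigned long *len)
    {
        struct my_data *d = data;
        *len = d->len;
        d->len = 0; /* hand over a single chunk, then signal end of stream */
        return *len ? d->buf : NULL;
    }

    ...
    struct my_data d = { buf, size };
    struct input_stream in = { .read = my_read, .data = &d };
    ret = write_loose_object(oid, hdr, hdrlen, &in, 0, 0, 0);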

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 8 ++++----
 object-store.h | 5 +++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index e333448c54..60eb29db97 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,10 +1878,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
 	return input->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      int dry_run,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       int dry_run,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret = 0;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index f1b67e9100..f6faa8d6d3 100644
--- a/object-store.h
+++ b/object-store.h
@@ -228,6 +228,11 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       int dry_run,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v2 6/6] unpack-objects: unpack large object in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (7 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 5/6] object-store.h: add write_loose_object() Han Xin
@ 2021-11-12  9:40 ` Han Xin
  2021-11-18  7:14   ` Jiang Xin
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                   ` (5 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-12  9:40 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When calling "unpack_non_delta_entry()", will allocate full memory for
the whole size of the unpacked object and write the buffer to loose file
on disk. This may lead to OOM for the git-unpack-objects process when
unpacking a very large object.

In function "unpack_delta_entry()", will also allocate full memory to
buffer the whole delta, but since there will be no delta for an object
larger than "core.bigFileThreshold", this issue is moderate.

To resolve the OOM issue in "git-unpack-objects", we can unpack large
object to file in stream, and use "core.bigFileThreshold" to avoid OOM
limits when called "get_data()".
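
For example, a receiving repository can lower the threshold so that more
incoming blobs take the streaming path (the repository name and value
mirror the test below):

    $ git -C dest.git config core.bigFileThreshold 1m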

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c          | 76 ++++++++++++++++++++++++-
 t/t5590-receive-unpack-objects.sh | 92 +++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100755 t/t5590-receive-unpack-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..6c757d823b 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -320,11 +320,85 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_data_from_zstream {
+	git_zstream *zstream;
+	unsigned char buf[4096];
+	int status;
+};
+
+static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
+{
+	struct input_data_from_zstream *input = data;
+	git_zstream *zstream = input->zstream;
+	void *in = fill(1);
+
+	if (!len || input->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = input->buf;
+	zstream->avail_out = sizeof(input->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	input->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(input->buf) - zstream->avail_out;
+
+	return (const char *)input->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_data_from_zstream data;
+	struct input_stream in_stream = {
+		.read = read_inflate_in_stream,
+		.data = &data,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/t/t5590-receive-unpack-objects.sh b/t/t5590-receive-unpack-objects.sh
new file mode 100755
index 0000000000..7e63dfc0db
--- /dev/null
+++ b/t/t5590-receive-unpack-objects.sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	git repack -ad
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to push: cannot allocate' '
+	test_must_fail git push dest.git HEAD 2>err &&
+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git push dest.git HEAD &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	PACK=$(echo main | git pack-objects --progress --revs test) &&
+	unset GIT_ALLOC_LIMIT &&
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run with large threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 2m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_expect_success 'unpack-objects dry-run with small threshold' '
+	(
+		cd unpack-test.git &&
+		git config core.bigFileThreshold 1m &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.33.1.44.g9344627884.agit.6.5.4


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
@ 2021-11-18  4:59   ` Jiang Xin
  2021-11-18  6:45     ` Junio C Hamano
  0 siblings, 1 reply; 211+ messages in thread
From: Jiang Xin @ 2021-11-18  4:59 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>

It would be better to provide a cover letter describing changes in v2, such as:

* Make "write_loose_object()" a public method, so we can
   reuse it in "unpack_non_delta_entry()".
   (But I doubt we can use "write_object_file_flags()" public
     function, without make this change.)

* Add a new interface "input_stream" as an argument for
   "write_loose_object()", so that we can feed data to
   "write_loose_object()" from buffer or from zlib stream.

> Refactor write_loose_object() to support inputstream, in the same way
> that zlib reading is chunked.

At the beginning of your commit log, you should describe the problem, such as:

We used to read the full content of a blob into a buffer in
"unpack_non_delta_entry()" by calling:

    void *buf = get_data(size);

This will consume lots of memory for a very big blob object.

> Using "in_stream" instead of "void *buf", we needn't to allocate enough
> memory in advance, and only part of the contents will be read when
> called "in_stream.read()".
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  5 +++++
>  2 files changed, 51 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 02b7970274..1ad2cb579c 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>         return fd;
>  }
>
> +struct input_data_from_buffer {
> +       const char *buf;
> +       unsigned long len;
> +};
> +
> +static const char *read_input_stream_from_buffer(void *data, unsigned long *len)

Use "const void *" for the type of return variable, just like input
argument for write_loose_object()?
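
That is, something like:

    static const void *read_input_stream_from_buffer(void *data, unsigned long *len);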

> +{
> +       struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
> +
> +       if (input->len == 0) {
> +               *len = 0;
> +               return NULL;
> +       }
> +       *len = input->len;
> +       input->len = 0;
> +       return input->buf;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
> -                             int hdrlen, const void *buf, unsigned long len,
> +                             int hdrlen, struct input_stream *in_stream,
>                               time_t mtime, unsigned flags)
>  {
>         int fd, ret;
> @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         struct object_id parano_oid;
>         static struct strbuf tmp_file = STRBUF_INIT;
>         static struct strbuf filename = STRBUF_INIT;
> +       const char *buf;

Can we use the same type as the original: "const void *buf"?

> +       unsigned long len;
>
>         loose_object_path(the_repository, &filename, oid);
>
> @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> +       buf = in_stream->read(in_stream->data, &len);
>         stream.next_in = (void *)buf;
>         stream.avail_in = len;
>         do {
> @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  {
>         char hdr[MAX_HEADER_LEN];
>         int hdrlen = sizeof(hdr);
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = (void *)&(struct input_data_from_buffer) {
> +                       .buf = buf,
> +                       .len = len,
> +               },
> +       };
>
>         /* Normally if we have it in the pack then we do not bother writing
>          * it out into .git/objects/??/?{38} file.
> @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>                                   &hdrlen);
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 return 0;
> -       return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> +       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
>  }
>
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>         char *header;
>         int hdrlen, status = 0;
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = (void *)&(struct input_data_from_buffer) {
> +                       .buf = buf,
> +                       .len = len,
> +               },
> +       };
>
>         /* type string, SP, %lu of the length plus NUL must fit this */
>         hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>                 goto cleanup;
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 goto cleanup;
> -       status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>
>  cleanup:
>         free(header);
> @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>         char hdr[MAX_HEADER_LEN];
>         int hdrlen;
>         int ret;
> +       struct input_data_from_buffer data;
> +       struct input_stream in_stream = {
> +               .read = read_input_stream_from_buffer,
> +               .data = &data,
> +       };
>
>         if (has_loose_object(oid))
>                 return 0;
>         buf = read_object(the_repository, oid, &type, &len);
>         if (!buf)
>                 return error(_("cannot read object for %s"), oid_to_hex(oid));
> +       data.buf = buf;
> +       data.len = len;
>         hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -       ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
>         free(buf);
>
>         return ret;
> diff --git a/object-store.h b/object-store.h
> index 952efb6a4b..f1b67e9100 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -34,6 +34,11 @@ struct object_directory {
>         char *path;
>  };
>
> +struct input_stream {
> +       const char *(*read)(void* data, unsigned long *len);
> +       void *data;
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>         struct object_directory *, 1, fspathhash, fspatheq)
>
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
@ 2021-11-18  5:42   ` Jiang Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-11-18  5:42 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We will use "write_loose_object()" later to handle large blob object,
> which needs to work in dry_run mode.

The dry_run mode comes from "builtin/unpack-objects.c", which throws
away the buffer read from "get_data()".
So why not add "dry_run" to "get_data()" instead?

If we have a dry_run version of get_data(), such as "get_data(size,
dry_run)", we do not have to add a dry_run mode to
"write_loose_object()".

See: git grep -A5 get_data builtin/unpack-objects.c
builtin/unpack-objects.c:       void *buf = get_data(size);
builtin/unpack-objects.c-
builtin/unpack-objects.c-       if (!dry_run && buf)
builtin/unpack-objects.c-               write_object(nr, type, buf, size);
builtin/unpack-objects.c-       else
builtin/unpack-objects.c-               free(buf);
--
builtin/unpack-objects.c:               delta_data = get_data(delta_size);
builtin/unpack-objects.c-               if (dry_run || !delta_data) {
builtin/unpack-objects.c-                       free(delta_data);
builtin/unpack-objects.c-                       return;
builtin/unpack-objects.c-               }
--
builtin/unpack-objects.c:               delta_data = get_data(delta_size);
builtin/unpack-objects.c-               if (dry_run || !delta_data) {
builtin/unpack-objects.c-                       free(delta_data);
builtin/unpack-objects.c-                       return;
builtin/unpack-objects.c-               }
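
A rough sketch of that direction, for illustration only (error recovery
is simplified to die(); fill(), use() and "len" are the existing helpers
in builtin/unpack-objects.c; in dry_run mode we inflate into a small
scratch buffer and throw the bytes away):

    static void *get_data(unsigned long size, int dry_run)
    {
        git_zstream stream;
        unsigned long bufsize = dry_run && size > 4096 ? 4096 : size;
        void *buf = xmallocz(bufsize);

        memset(&stream, 0, sizeof(stream));
        stream.next_out = buf;
        stream.avail_out = bufsize;
        stream.next_in = fill(1);
        stream.avail_in = len;
        git_inflate_init(&stream);

        for (;;) {
            int ret = git_inflate(&stream, 0);
            use(len - stream.avail_in);
            if (stream.total_out == size && ret == Z_STREAM_END)
                break;
            if (ret != Z_OK)
                die(_("inflate returned %d"), ret);
            stream.next_in = fill(1);
            stream.avail_in = len;
            if (dry_run) {
                /* Reuse the scratch buffer; the inflated bytes are discarded. */
                stream.next_out = buf;
                stream.avail_out = bufsize < size - stream.total_out ?
                                   bufsize : size - stream.total_out;
            }
        }
        git_inflate_end(&stream);
        if (dry_run)
            FREE_AND_NULL(buf);
        return buf;
    }

Then unpack_non_delta_entry() can keep its existing "if (!dry_run &&
buf)" logic unchanged.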


> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 32 +++++++++++++++++++-------------
>  1 file changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 1ad2cb579c..b0838c847e 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1880,9 +1880,10 @@ static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
>
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>                               int hdrlen, struct input_stream *in_stream,
> +                             int dry_run,
>                               time_t mtime, unsigned flags)
>  {
> -       int fd, ret;
> +       int fd, ret = 0;
>         unsigned char compressed[4096];
>         git_zstream stream;
>         git_hash_ctx c;
> @@ -1894,14 +1895,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>         loose_object_path(the_repository, &filename, oid);
>
> -       fd = create_tmpfile(&tmp_file, filename.buf);
> -       if (fd < 0) {
> -               if (flags & HASH_SILENT)
> -                       return -1;
> -               else if (errno == EACCES)
> -                       return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> -               else
> -                       return error_errno(_("unable to create temporary file"));
> +       if (!dry_run) {
> +               fd = create_tmpfile(&tmp_file, filename.buf);
> +               if (fd < 0) {
> +                       if (flags & HASH_SILENT)
> +                               return -1;
> +                       else if (errno == EACCES)
> +                               return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +                       else
> +                               return error_errno(_("unable to create temporary file"));
> +               }
>         }
>
>         /* Set it up */
> @@ -1925,7 +1928,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 unsigned char *in0 = stream.next_in;
>                 ret = git_deflate(&stream, Z_FINISH);
>                 the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> -               if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> +               if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>                         die(_("unable to write loose object file"));
>                 stream.next_out = compressed;
>                 stream.avail_out = sizeof(compressed);
> @@ -1943,6 +1946,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 die(_("confused by unstable object source data for %s"),
>                     oid_to_hex(oid));
>
> +       if (dry_run)
> +               return 0;
> +
>         close_loose_object(fd);
>
>         if (mtime) {
> @@ -1996,7 +2002,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>                                   &hdrlen);
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 return 0;
> -       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
> +       return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0, flags);
>  }
>
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -2023,7 +2029,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>                 goto cleanup;
>         if (freshen_packed_object(oid) || freshen_loose_object(oid))
>                 goto cleanup;
> -       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> +       status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0, 0);
>
>  cleanup:
>         free(header);
> @@ -2052,7 +2058,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>         data.buf = buf;
>         data.len = len;
>         hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
> +       ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, mtime, 0);
>         free(buf);
>
>         return ret;
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
@ 2021-11-18  5:49   ` Jiang Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-11-18  5:49 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When reading from an input stream, the oid cannot be determined before all
> of the data has been read; it will be filled in afterwards.

Under what circumstances is the oid a null oid?  Can we get the oid
from "obj_list[nr].oid"?
See unpack_non_delta_entry() of builtin/unpack-objects.c.

> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 34 ++++++++++++++++++++++++++++++++--
>  1 file changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index b0838c847e..8393659f0d 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1893,7 +1893,13 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         const char *buf;
>         unsigned long len;
>
> -       loose_object_path(the_repository, &filename, oid);
> +       if (is_null_oid(oid)) {
> +               /* When oid is not determined, save tmp file to odb path. */
> +               strbuf_reset(&filename);
> +               strbuf_addstr(&filename, the_repository->objects->odb->path);
> +               strbuf_addch(&filename, '/');
> +       } else
> +               loose_object_path(the_repository, &filename, oid);
>
>         if (!dry_run) {
>                 fd = create_tmpfile(&tmp_file, filename.buf);
> @@ -1942,7 +1948,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>                 die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>                     ret);
>         the_hash_algo->final_oid_fn(&parano_oid, &c);
> -       if (!oideq(oid, &parano_oid))
> +       if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>                 die(_("confused by unstable object source data for %s"),
>                     oid_to_hex(oid));
>
> @@ -1951,6 +1957,30 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>         close_loose_object(fd);
>
> +       if (is_null_oid(oid)) {
> +               int dirlen;
> +
> +               /* copy oid */
> +               oidcpy((struct object_id *)oid, &parano_oid);
> +               /* We get the oid now */
> +               loose_object_path(the_repository, &filename, oid);
> +
> +               dirlen = directory_size(filename.buf);
> +               if (dirlen) {
> +                       struct strbuf dir = STRBUF_INIT;
> +                       /*
> +                        * Make sure the directory exists; note that the
> +                        * contents of the buffer are undefined after mkstemp
> +                        * returns an error, so we have to rewrite the whole
> +                        * buffer from scratch.
> +                        */
> +                       strbuf_reset(&dir);
> +                       strbuf_add(&dir, filename.buf, dirlen - 1);
> +                       if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +                               return -1;
> +               }
> +       }
> +
>         if (mtime) {
>                 struct utimbuf utb;
>                 utb.actime = mtime;
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v2 4/6] object-file.c: read input stream repeatedly in write_loose_object()
  2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
@ 2021-11-18  5:56   ` Jiang Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-11-18  5:56 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Read the input stream repeatedly in write_loose_object() until the end is
> reached, so that we can divide the write of a large blob into many small
> blocks.

In order to prepare the stream version of "write_loose_object()", we need ...

>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 8393659f0d..e333448c54 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1891,7 +1891,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         static struct strbuf tmp_file = STRBUF_INIT;
>         static struct strbuf filename = STRBUF_INIT;
>         const char *buf;
> -       unsigned long len;
> +       int flush = 0;
>
>         if (is_null_oid(oid)) {
>                 /* When oid is not determined, save tmp file to odb path. */
> @@ -1927,12 +1927,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>         the_hash_algo->update_fn(&c, hdr, hdrlen);
>
>         /* Then the data itself.. */
> -       buf = in_stream->read(in_stream->data, &len);
> -       stream.next_in = (void *)buf;
> -       stream.avail_in = len;
>         do {
>                 unsigned char *in0 = stream.next_in;
> -               ret = git_deflate(&stream, Z_FINISH);
> +               if (!stream.avail_in) {
> +                       if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {

if ((buf = in_stream->read(in_stream->data, &stream.avail_in)) != NULL) {

Or split this long line into:

    buf = in_stream->read(in_stream->data, &stream.avail_in);
    if (buf) {

> +                               stream.next_in = (void *)buf;
> +                               in0 = (unsigned char *)buf;
> +                       } else
> +                               flush = Z_FINISH;

Add {} around this single line, see:

  https://github.com/git/git/blob/master/Documentation/CodingGuidelines#L279-L289

> +               }
> +               ret = git_deflate(&stream, flush);
>                 the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
>                 if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>                         die(_("unable to write loose object file"));
> --
> 2.33.1.44.g9344627884.agit.6.5.4
>

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream
  2021-11-18  4:59   ` Jiang Xin
@ 2021-11-18  6:45     ` Junio C Hamano
  0 siblings, 0 replies; 211+ messages in thread
From: Junio C Hamano @ 2021-11-18  6:45 UTC (permalink / raw)
  To: Jiang Xin; +Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Jiang Xin <worldhello.net@gmail.com> writes:

> On Fri, Nov 12, 2021 at 5:43 PM Han Xin <chiyutianyi@gmail.com> wrote:
>>
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> It would be better to provide a cover letter describing changes in v2, such as:
>
> * Make "write_loose_object()" a public method, so we can
>    reuse it in "unpack_non_delta_entry()".
>    (But I doubt we can use "write_object_file_flags()" public
>      function, without make this change.)
>
> * Add an new interface "input_stream" as an argument for
>    "write_loose_object()", so that we can feed data to
>    "write_loose_object()" from buffer or from zlib stream.
>
>> Refactor write_loose_object() to support inputstream, in the same way
>> that zlib reading is chunked.
>
> In the beginning of your commit log, you should describe the problem, such as:
>
> We used to read the full content of a blob into buffer in
> "unpack_non_delta_entry()" by calling:
>
>     void *buf = get_data(size);
>
> This will consume lots of memory for a very big blob object.

I was not sure where "in_stream" came from---"use X instead of Y",
when X is what these patches invent and introduce, does not make a
good explanation without explaining what X is, what problem X is
attempting to solve and how.

Thanks for helping to clarify the proposed log message.  

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v2 6/6] unpack-objects: unpack large object in stream
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
@ 2021-11-18  7:14   ` Jiang Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-11-18  7:14 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On Fri, Nov 12, 2021 at 5:42 PM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When calling "unpack_non_delta_entry()", will allocate full memory for
> the whole size of the unpacked object and write the buffer to loose file
> on disk. This may lead to OOM for the git-unpack-objects process when
> unpacking a very large object.
>
> In function "unpack_delta_entry()", will also allocate full memory to
> buffer the whole delta, but since there will be no delta for an object
> larger than "core.bigFileThreshold", this issue is moderate.
>
> To resolve the OOM issue in "git-unpack-objects", we can unpack large
> object to file in stream, and use "core.bigFileThreshold" to avoid OOM
> limits when called "get_data()".
>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c          | 76 ++++++++++++++++++++++++-
>  t/t5590-receive-unpack-objects.sh | 92 +++++++++++++++++++++++++++++++
>  2 files changed, 167 insertions(+), 1 deletion(-)
>  create mode 100755 t/t5590-receive-unpack-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..6c757d823b 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -320,11 +320,85 @@ static void added_object(unsigned nr, enum object_type type,
>         }
>  }
>
> +struct input_data_from_zstream {
> +       git_zstream *zstream;
> +       unsigned char buf[4096];
> +       int status;
> +};
> +
> +static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
> +{
> +       struct input_data_from_zstream *input = data;
> +       git_zstream *zstream = input->zstream;
> +       void *in = fill(1);
> +
> +       if (!len || input->status == Z_STREAM_END) {
> +               *readlen = 0;
> +               return NULL;
> +       }
> +
> +       zstream->next_out = input->buf;
> +       zstream->avail_out = sizeof(input->buf);
> +       zstream->next_in = in;
> +       zstream->avail_in = len;
> +
> +       input->status = git_inflate(zstream, 0);
> +       use(len - zstream->avail_in);
> +       *readlen = sizeof(input->buf) - zstream->avail_out;
> +
> +       return (const char *)input->buf;
> +}
> +
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +       char hdr[32];
> +       int hdrlen;
> +       git_zstream zstream;
> +       struct input_data_from_zstream data;
> +       struct input_stream in_stream = {
> +               .read = read_inflate_in_stream,
> +               .data = &data,
> +       };
> +       struct object_id *oid = &obj_list[nr].oid;
> +       int ret;
> +
> +       memset(&zstream, 0, sizeof(zstream));
> +       memset(&data, 0, sizeof(data));
> +       data.zstream = &zstream;
> +       git_inflate_init(&zstream);
> +
> +       /* Generate the header */
> +       hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> +
> +       if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
> +               die(_("failed to write object in stream %d"), ret);
> +
> +       if (zstream.total_out != size || data.status != Z_STREAM_END)
> +               die(_("inflate returned %d"), data.status);
> +       git_inflate_end(&zstream);
> +
> +       if (strict && !dry_run) {
> +               struct blob *blob = lookup_blob(the_repository, oid);
> +               if (blob)
> +                       blob->object.flags |= FLAG_WRITTEN;
> +               else
> +                       die("invalid blob object from stream");
> +       }
> +       obj_list[nr].obj = NULL;
> +}
> +
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>                                    unsigned nr)
>  {
> -       void *buf = get_data(size);
> +       void *buf;
> +
> +       /* Write large blob in stream without allocating full buffer. */
> +       if (type == OBJ_BLOB && size > big_file_threshold) {

The default value of big_file_threshold is 512m.  Can we use
"write_stream_blob" for all objects?  Can we find a more suitable
threshold through some benchmark data?

> +               write_stream_blob(nr, size);
> +               return;
> +       }

--
Jiang Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v3 0/5] unpack large objects in stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (8 preceding siblings ...)
  2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29  7:01   ` Han Xin
                     ` (6 more replies)
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
                   ` (4 subsequent siblings)
  14 siblings, 7 replies; 211+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Although we do not recommend that users push large binary files to git
repositories, it is difficult to prevent them from doing so. We once found
a surge in memory usage on a server; the source of the problem was that a
user had pushed a single 15GB object. From then on, whenever someone
initiated a git push, the git process would immediately allocate 15GB of
memory, resulting in an OOM risk.

Through further analysis, we found that when git unpack-objects runs,
"void *buf = get_data(size);" in unpack_non_delta_entry() directly
allocates memory equal to the size of the object. This is quite scary,
because the pre-receive hook has not yet run at that point, so we cannot
avoid the allocation with hooks.

I got inspiration from zlib's deflate process: maybe it would be a good
idea to change unpack-objects to deflate in a streaming fashion.
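
The rough shape of that idea, as a minimal sketch (not the patches
themselves; read_chunk() is a made-up placeholder for the input_stream
callback added by this series, fd is an already-opened tmpfile, and
write_buffer() is the existing helper in object-file.c; hashing and the
object header are left out):

    static void stream_deflate_to_fd(int fd)
    {
        git_zstream stream;
        unsigned char out[4096];
        const void *in;
        int flush = 0, ret;

        memset(&stream, 0, sizeof(stream));
        git_deflate_init(&stream, zlib_compression_level);

        do {
            if (!stream.avail_in && flush != Z_FINISH) {
                /* Refill the fixed-size input window. */
                in = read_chunk(&stream.avail_in);
                if (in)
                    stream.next_in = (void *)in;
                else
                    flush = Z_FINISH; /* input exhausted, finish the stream */
            }
            stream.next_out = out;
            stream.avail_out = sizeof(out);
            ret = git_deflate(&stream, flush);
            if (write_buffer(fd, out, sizeof(out) - stream.avail_out) < 0)
                die(_("unable to write loose object file"));
        } while (ret == Z_OK || ret == Z_BUF_ERROR);

        if (ret != Z_STREAM_END)
            die(_("unable to deflate object (%d)"), ret);
        git_deflate_end(&stream);
    }

This keeps the peak memory for one object at the window size instead of
the object size.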

Changes since v2:
* Rewrite commit messages and make changes suggested by Jiang Xin.
* Remove the commit "object-file.c: add dry_run mode for write_loose_object()" and
  use a new commit "unpack-objects.c: add dry_run mode for get_data()" instead.

Han Xin (5):
  object-file: refactor write_loose_object() to read buffer from stream
  object-file.c: handle undetermined oid in write_loose_object()
  object-file.c: read stream in a loop in write_loose_object()
  unpack-objects.c: add dry_run mode for get_data()
  unpack-objects: unpack_non_delta_entry() read data in a stream

 builtin/unpack-objects.c            | 92 +++++++++++++++++++++++++--
 object-file.c                       | 98 +++++++++++++++++++++++++----
 object-store.h                      |  9 +++
 t/t5590-unpack-non-delta-objects.sh | 76 ++++++++++++++++++++++
 4 files changed, 257 insertions(+), 18 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v2:
1:  01672f50a0 ! 1:  8640b04f6d object-file: refactor write_loose_object() to support inputstream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file: refactor write_loose_object() to support inputstream
    +    object-file: refactor write_loose_object() to read buffer from stream
     
    -    Refactor write_loose_object() to support inputstream, in the same way
    -    that zlib reading is chunked.
    +    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    +    entire contents of a blob object, no matter how big it is. This
    +    implementation may consume all the memory and cause OOM.
     
    -    Using "in_stream" instead of "void *buf", we needn't to allocate enough
    -    memory in advance, and only part of the contents will be read when
    -    called "in_stream.read()".
    +    This can be improved by feeding data to "write_loose_object()" in a
    +    stream. The input stream is implemented as an interface. In the first
    +    step, we make a simple implementation, feeding the entire buffer in the
    +    "stream" to "write_loose_object()" as a refactor.
     
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
      	return fd;
      }
      
    -+struct input_data_from_buffer {
    -+	const char *buf;
    ++struct simple_input_stream_data {
    ++	const void *buf;
     +	unsigned long len;
     +};
     +
    -+static const char *read_input_stream_from_buffer(void *data, unsigned long *len)
    ++static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
     +{
    -+	struct input_data_from_buffer *input = (struct input_data_from_buffer *)data;
    ++	struct simple_input_stream_data *data = in_stream->data;
     +
    -+	if (input->len == 0) {
    ++	if (data->len == 0) {
     +		*len = 0;
     +		return NULL;
     +	}
    -+	*len = input->len;
    -+	input->len = 0;
    -+	return input->buf;
    ++	*len = data->len;
    ++	data->len = 0;
    ++	return data->buf;
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	struct object_id parano_oid;
      	static struct strbuf tmp_file = STRBUF_INIT;
      	static struct strbuf filename = STRBUF_INIT;
    -+	const char *buf;
    ++	const void *buf;
     +	unsigned long len;
      
      	loose_object_path(the_repository, &filename, oid);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	the_hash_algo->update_fn(&c, hdr, hdrlen);
      
      	/* Then the data itself.. */
    -+	buf = in_stream->read(in_stream->data, &len);
    ++	buf = in_stream->read(in_stream, &len);
      	stream.next_in = (void *)buf;
      	stream.avail_in = len;
      	do {
    @@ object-file.c: int write_object_file_flags(const void *buf, unsigned long len,
      	char hdr[MAX_HEADER_LEN];
      	int hdrlen = sizeof(hdr);
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    -+		.data = (void *)&(struct input_data_from_buffer) {
    ++		.read = feed_simple_input_stream,
    ++		.data = (void *)&(struct simple_input_stream_data) {
     +			.buf = buf,
     +			.len = len,
     +		},
    @@ object-file.c: int hash_object_file_literally(const void *buf, unsigned long len
      	char *header;
      	int hdrlen, status = 0;
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    -+		.data = (void *)&(struct input_data_from_buffer) {
    ++		.read = feed_simple_input_stream,
    ++		.data = (void *)&(struct simple_input_stream_data) {
     +			.buf = buf,
     +			.len = len,
     +		},
    @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime)
      	char hdr[MAX_HEADER_LEN];
      	int hdrlen;
      	int ret;
    -+	struct input_data_from_buffer data;
    ++	struct simple_input_stream_data data;
     +	struct input_stream in_stream = {
    -+		.read = read_input_stream_from_buffer,
    ++		.read = feed_simple_input_stream,
     +		.data = &data,
     +	};
      
    @@ object-store.h: struct object_directory {
      };
      
     +struct input_stream {
    -+	const char *(*read)(void* data, unsigned long *len);
    ++	const void *(*read)(struct input_stream *, unsigned long *len);
     +	void *data;
     +};
     +
2:  a309b7e391 < -:  ---------- object-file.c: add dry_run mode for write_loose_object()
3:  b0a5b53710 ! 2:  d4a2caf2bd object-file.c: handle nil oid in write_loose_object()
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: handle nil oid in write_loose_object()
    +    object-file.c: handle undetermined oid in write_loose_object()
     
    -    When read input stream, oid can't get before reading all, and it will be
    -    filled after reading.
    +    When streaming a large blob object to "write_loose_object()", we have no
    +    chance to run "write_object_file_prepare()" to calculate the oid in
    +    advance. So we need to handle undetermined oid in function
    +    "write_loose_object()".
    +
    +    In the original implementation, we know the oid and we can write the
    +    temporary file in the same directory as the final object, but for an
    +    object with an undetermined oid, we don't know the exact directory for
    +    the object, so we have to save the temporary file in ".git/objects/"
    +    directory instead.
     
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    - 	const char *buf;
    + 	const void *buf;
      	unsigned long len;
      
     -	loose_object_path(the_repository, &filename, oid);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     +		strbuf_reset(&filename);
     +		strbuf_addstr(&filename, the_repository->objects->odb->path);
     +		strbuf_addch(&filename, '/');
    -+	} else
    ++	} else {
     +		loose_object_path(the_repository, &filename, oid);
    ++	}
      
    - 	if (!dry_run) {
    - 		fd = create_tmpfile(&tmp_file, filename.buf);
    + 	fd = create_tmpfile(&tmp_file, filename.buf);
    + 	if (fd < 0) {
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
      		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
      		    ret);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      		die(_("confused by unstable object source data for %s"),
      		    oid_to_hex(oid));
      
    -@@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    - 
      	close_loose_object(fd);
      
     +	if (is_null_oid(oid)) {
     +		int dirlen;
     +
    -+		/* copy oid */
     +		oidcpy((struct object_id *)oid, &parano_oid);
    -+		/* We get the oid now */
     +		loose_object_path(the_repository, &filename, oid);
     +
    ++		/* We finally know the object path, and create the missing dir. */
     +		dirlen = directory_size(filename.buf);
     +		if (dirlen) {
     +			struct strbuf dir = STRBUF_INIT;
    -+			/*
    -+			 * Make sure the directory exists; note that the
    -+			 * contents of the buffer are undefined after mkstemp
    -+			 * returns an error, so we have to rewrite the whole
    -+			 * buffer from scratch.
    -+			 */
    -+			strbuf_reset(&dir);
     +			strbuf_add(&dir, filename.buf, dirlen - 1);
     +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
     +				return -1;
    ++			if (adjust_shared_perm(dir.buf))
    ++				return -1;
    ++			strbuf_release(&dir);
     +		}
     +	}
     +
4:  09d438b692 ! 3:  2575900449 object-file.c: read input stream repeatedly in write_loose_object()
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: read input stream repeatedly in write_loose_object()
    +    object-file.c: read stream in a loop in write_loose_object()
     
    -    Read input stream repeatedly in write_loose_object() unless reach the
    -    end, so that we can divide the large blob write into many small blocks.
    +    In order to prepare the stream version of "write_loose_object()", read
    +    the input stream in a loop in "write_loose_object()", so that we can
    +    feed the contents of large blob object to "write_loose_object()" using
    +    a small fixed buffer.
     
    +    Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
      	static struct strbuf tmp_file = STRBUF_INIT;
      	static struct strbuf filename = STRBUF_INIT;
    - 	const char *buf;
    + 	const void *buf;
     -	unsigned long len;
     +	int flush = 0;
      
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	the_hash_algo->update_fn(&c, hdr, hdrlen);
      
      	/* Then the data itself.. */
    --	buf = in_stream->read(in_stream->data, &len);
    +-	buf = in_stream->read(in_stream, &len);
     -	stream.next_in = (void *)buf;
     -	stream.avail_in = len;
      	do {
      		unsigned char *in0 = stream.next_in;
     -		ret = git_deflate(&stream, Z_FINISH);
     +		if (!stream.avail_in) {
    -+			if ((buf = in_stream->read(in_stream->data, &stream.avail_in))) {
    ++			buf = in_stream->read(in_stream, &stream.avail_in);
    ++			if (buf) {
     +				stream.next_in = (void *)buf;
     +				in0 = (unsigned char *)buf;
    -+			} else
    ++			} else {
     +				flush = Z_FINISH;
    ++			}
     +		}
     +		ret = git_deflate(&stream, flush);
      		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
    - 		if (!dry_run && write_buffer(fd, compressed, stream.next_out - compressed) < 0)
    + 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
      			die(_("unable to write loose object file"));
5:  9fb188d437 < -:  ---------- object-store.h: add write_loose_object()
-:  ---------- > 4:  ca93ecc780 unpack-objects.c: add dry_run mode for get_data()
6:  80468a6fbc ! 5:  39a072ee2a unpack-objects: unpack large object in stream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    unpack-objects: unpack large object in stream
    +    unpack-objects: unpack_non_delta_entry() read data in a stream
     
    -    When calling "unpack_non_delta_entry()", will allocate full memory for
    -    the whole size of the unpacked object and write the buffer to loose file
    -    on disk. This may lead to OOM for the git-unpack-objects process when
    -    unpacking a very large object.
    +    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    +    entire contents of a blob object, no matter how big it is. This
    +    implementation may consume all the memory and cause OOM.
     
    -    In function "unpack_delta_entry()", will also allocate full memory to
    -    buffer the whole delta, but since there will be no delta for an object
    -    larger than "core.bigFileThreshold", this issue is moderate.
    +    By implementing a zstream version of input_stream interface, we can use
    +    a small fixed buffer for "unpack_non_delta_entry()".
     
    -    To resolve the OOM issue in "git-unpack-objects", we can unpack large
    -    object to file in stream, and use "core.bigFileThreshold" to avoid OOM
    -    limits when called "get_data()".
    +    However, unpacking non-delta objects from a stream instead of from
    +    an entire buffer incurs about a 10% performance penalty. Therefore,
    +    only unpack objects larger than the "big_file_threshold" in a
    +    zstream. See the following benchmarks:
     
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -C dest.git unpack-objects <binary_320M.pack'
    +        Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
    +          Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
    +          Range (min … max):    9.786 s … 10.603 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    +        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
    +          Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
    +          Range (min … max):    9.884 s … 12.192 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -C dest.git unpack-objects <binary_96M.pack'
    +        Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
    +          Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
    +          Range (min … max):    2.639 s …  2.743 s    10 runs
    +
    +        $ hyperfine \
    +        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    +        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
    +          Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
    +          Range (min … max):    2.679 s …  3.125 s    10 runs
    +
    +    Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## builtin/unpack-objects.c ##
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      	}
      }
      
    -+struct input_data_from_zstream {
    ++struct input_zstream_data {
     +	git_zstream *zstream;
     +	unsigned char buf[4096];
     +	int status;
     +};
     +
    -+static const char *read_inflate_in_stream(void *data, unsigned long *readlen)
    ++static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
     +{
    -+	struct input_data_from_zstream *input = data;
    -+	git_zstream *zstream = input->zstream;
    ++	struct input_zstream_data *data = in_stream->data;
    ++	git_zstream *zstream = data->zstream;
     +	void *in = fill(1);
     +
    -+	if (!len || input->status == Z_STREAM_END) {
    ++	if (!len || data->status == Z_STREAM_END) {
     +		*readlen = 0;
     +		return NULL;
     +	}
     +
    -+	zstream->next_out = input->buf;
    -+	zstream->avail_out = sizeof(input->buf);
    ++	zstream->next_out = data->buf;
    ++	zstream->avail_out = sizeof(data->buf);
     +	zstream->next_in = in;
     +	zstream->avail_in = len;
     +
    -+	input->status = git_inflate(zstream, 0);
    ++	data->status = git_inflate(zstream, 0);
     +	use(len - zstream->avail_in);
    -+	*readlen = sizeof(input->buf) - zstream->avail_out;
    ++	*readlen = sizeof(data->buf) - zstream->avail_out;
     +
    -+	return (const char *)input->buf;
    ++	return data->buf;
     +}
     +
     +static void write_stream_blob(unsigned nr, unsigned long size)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	char hdr[32];
     +	int hdrlen;
     +	git_zstream zstream;
    -+	struct input_data_from_zstream data;
    ++	struct input_zstream_data data;
     +	struct input_stream in_stream = {
    -+		.read = read_inflate_in_stream,
    ++		.read = feed_input_zstream,
     +		.data = &data,
     +	};
     +	struct object_id *oid = &obj_list[nr].oid;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	/* Generate the header */
     +	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
     +
    -+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, dry_run, 0, 0)))
    ++	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
     +		die(_("failed to write object in stream %d"), ret);
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      static void unpack_non_delta_entry(enum object_type type, unsigned long size,
      				   unsigned nr)
      {
    --	void *buf = get_data(size);
    +-	void *buf = get_data(size, dry_run);
     +	void *buf;
     +
     +	/* Write large blob in stream without allocating full buffer. */
    -+	if (type == OBJ_BLOB && size > big_file_threshold) {
    ++	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
     +		write_stream_blob(nr, size);
     +		return;
     +	}
      
    -+	buf = get_data(size);
    ++	buf = get_data(size, dry_run);
      	if (!dry_run && buf)
      		write_object(nr, type, buf, size);
      	else
     
    - ## t/t5590-receive-unpack-objects.sh (new) ##
    + ## object-file.c ##
    +@@ object-file.c: static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
    + 	return data->buf;
    + }
    + 
    +-static int write_loose_object(const struct object_id *oid, char *hdr,
    +-			      int hdrlen, struct input_stream *in_stream,
    +-			      time_t mtime, unsigned flags)
    ++int write_loose_object(const struct object_id *oid, char *hdr,
    ++		       int hdrlen, struct input_stream *in_stream,
    ++		       time_t mtime, unsigned flags)
    + {
    + 	int fd, ret;
    + 	unsigned char compressed[4096];
    +
    + ## object-store.h ##
    +@@ object-store.h: int hash_object_file(const struct git_hash_algo *algo, const void *buf,
    + 		     unsigned long len, const char *type,
    + 		     struct object_id *oid);
    + 
    ++int write_loose_object(const struct object_id *oid, char *hdr,
    ++		       int hdrlen, struct input_stream *in_stream,
    ++		       time_t mtime, unsigned flags);
    ++
    + int write_object_file_flags(const void *buf, unsigned long len,
    + 			    const char *type, struct object_id *oid,
    + 			    unsigned flags);
    +
    + ## t/t5590-unpack-non-delta-objects.sh (new) ##
     @@
     +#!/bin/sh
     +#
    @@ t/t5590-receive-unpack-objects.sh (new)
     +		cd .git &&
     +		find objects/?? -type f | sort
     +	) >expect &&
    -+	git repack -ad
    ++	PACK=$(echo main | git pack-objects --progress --revs test)
     +'
     +
     +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
    @@ t/t5590-receive-unpack-objects.sh (new)
     +	git -C dest.git config receive.unpacklimit 100
     +'
     +
    -+test_expect_success 'fail to push: cannot allocate' '
    -+	test_must_fail git push dest.git HEAD 2>err &&
    -+	test_i18ngrep "remote: fatal: attempting to allocate" err &&
    ++test_expect_success 'fail to unpack-objects: cannot allocate' '
    ++	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    ++	test_i18ngrep "fatal: attempting to allocate" err &&
     +	(
     +		cd dest.git &&
     +		find objects/?? -type f | sort
    @@ t/t5590-receive-unpack-objects.sh (new)
     +'
     +
     +test_expect_success 'unpack big object in stream' '
    -+	git push dest.git HEAD &&
    ++	git -C dest.git unpack-objects <test-$PACK.pack &&
     +	git -C dest.git fsck &&
     +	(
     +		cd dest.git &&
    @@ t/t5590-receive-unpack-objects.sh (new)
     +'
     +
     +test_expect_success 'setup for unpack-objects dry-run test' '
    -+	PACK=$(echo main | git pack-objects --progress --revs test) &&
    -+	unset GIT_ALLOC_LIMIT &&
     +	git init --bare unpack-test.git
     +'
     +
    -+test_expect_success 'unpack-objects dry-run with large threshold' '
    -+	(
    -+		cd unpack-test.git &&
    -+		git config core.bigFileThreshold 2m &&
    -+		git unpack-objects -n <../test-$PACK.pack
    -+	) &&
    -+	(
    -+		cd unpack-test.git &&
    -+		find objects/ -type f
    -+	) >actual &&
    -+	test_must_be_empty actual
    -+'
    -+
    -+test_expect_success 'unpack-objects dry-run with small threshold' '
    ++test_expect_success 'unpack-objects dry-run' '
     +	(
     +		cd unpack-test.git &&
    -+		git config core.bigFileThreshold 1m &&
     +		git unpack-objects -n <../test-$PACK.pack
     +	) &&
     +	(
-- 
2.34.0.6.g676eedc724


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (9 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-23 23:24   ` Junio C Hamano
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
                   ` (3 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface. In the first
step, we make a simple implementation, feeding the entire buffer in the
"stream" to "write_loose_object()" as a refactor.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  5 +++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index c3d866a287..227f53a0de 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct simple_input_stream_data {
+	const void *buf;
+	unsigned long len;
+};
+
+static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
+{
+	struct simple_input_stream_data *data = in_stream->data;
+
+	if (data->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = data->len;
+	data->len = 0;
+	return data->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const void *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct simple_input_stream_data data;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..ccc1fc9c1a 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.0.6.g676eedc724


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (10 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29 15:10   ` Derrick Stolee
  2021-11-22  3:32 ` [PATCH v3 3/5] object-file.c: read stream in a loop " Han Xin
                   ` (2 subsequent siblings)
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance. So we need to handle undetermined oid in function
"write_loose_object()".

In the original implementation, we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in the ".git/objects/"
directory instead.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 227f53a0de..78fd2a5d39 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const void *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else {
+		loose_object_path(the_repository, &filename, oid);
+	}
 
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
@@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		oidcpy((struct object_id *)oid, &parano_oid);
+		loose_object_path(the_repository, &filename, oid);
+
+		/* We finally know the object path, and create the missing dir. */
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+			if (adjust_shared_perm(dir.buf))
+				return -1;
+			strbuf_release(&dir);
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.34.0.6.g676eedc724


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v3 3/5] object-file.c: read stream in a loop in write_loose_object()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (11 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-22  3:32 ` [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  14 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In order to prepare the stream version of "write_loose_object()", read
the input stream in a loop in "write_loose_object()", so that we can
feed the contents of a large blob object to "write_loose_object()" using
a small fixed buffer.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/object-file.c b/object-file.c
index 78fd2a5d39..93bcfaca50 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1890,7 +1890,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const void *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1925,12 +1925,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			buf = in_stream->read(in_stream, &stream.avail_in);
+			if (buf) {
+				stream.next_in = (void *)buf;
+				in0 = (unsigned char *)buf;
+			} else {
+				flush = Z_FINISH;
+			}
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
-- 
2.34.0.6.g676eedc724


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (12 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 3/5] object-file.c: read stream in a loop " Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  14 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used only to verify that the data
inflates correctly; the returned buffer is not used at all and is freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, allocate only a
small fixed-size buffer when calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8d68acd662 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,16 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run ? 4096 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +125,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +329,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +363,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +402,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.0.6.g676eedc724


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
                   ` (13 preceding siblings ...)
  2021-11-22  3:32 ` [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-11-22  3:32 ` Han Xin
  2021-11-29 17:37   ` Derrick Stolee
  14 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-22  3:32 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of the input_stream interface, we can
use a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an
entire buffer incurs about a 10% performance penalty. Therefore, only
unpack objects larger than the "big_file_threshold" via zstream. See
the following benchmarks:

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -C dest.git unpack-objects <binary_320M.pack'
    Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
      Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
      Range (min … max):    9.786 s … 10.603 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
      Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
      Range (min … max):    9.884 s … 12.192 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -C dest.git unpack-objects <binary_96M.pack'
    Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
      Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
      Range (min … max):    2.639 s …  2.743 s    10 runs

    $ hyperfine \
    --prepare 'rm -rf dest.git && git init --bare dest.git' \
    'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
      Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
      Range (min … max):    2.679 s …  3.125 s    10 runs

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c            | 76 ++++++++++++++++++++++++++++-
 object-file.c                       |  6 +--
 object-store.h                      |  4 ++
 t/t5590-unpack-non-delta-objects.sh | 76 +++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+), 4 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 8d68acd662..bfc254a236 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -326,11 +326,85 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[4096];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index 93bcfaca50..bd7631f7ef 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,9 +1878,9 @@ static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
 	return data->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index ccc1fc9c1a..cbd95c47e2 100644
--- a/object-store.h
+++ b/object-store.h
@@ -228,6 +228,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..01d950d119
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receive pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --progress --revs test)
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	test_i18ngrep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	(
+		cd unpack-test.git &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.0.6.g676eedc724


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-11-23 23:24   ` Junio C Hamano
  2021-11-24  9:00     ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Junio C Hamano @ 2021-11-23 23:24 UTC (permalink / raw)
  To: Han Xin; +Cc: Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Han Xin <chiyutianyi@gmail.com> writes:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "write_loose_object()" in a
> stream. The input stream is implemented as an interface. In the first
> step, we make a simple implementation, feeding the entire buffer in the
> "stream" to "write_loose_object()" as a refactor.

Possibly a stupid question (not a review).

How does this compare with "struct git_istream" implemented for a
few existing codepaths?  It seems that the existing users are
pack-objects, index-pack and archive and all of them use the
interface to obtain data given an object name without having to grab
everything in core at once.

If we are adding a new streaming interface to go in the opposite
direction, i.e. from the working tree data to object store, I would
understand it as a complementary interface (but then I suspect there
is a half of it already in bulk-checkin API), but I am not sure how
this new thing fits in the larger picture.



> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  5 +++++
>  2 files changed, 51 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index c3d866a287..227f53a0de 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> +struct simple_input_stream_data {
> +	const void *buf;
> +	unsigned long len;
> +};
> +
> +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> +{
> +	struct simple_input_stream_data *data = in_stream->data;
> +
> +	if (data->len == 0) {
> +		*len = 0;
> +		return NULL;
> +	}
> +	*len = data->len;
> +	data->len = 0;
> +	return data->buf;
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
> -			      int hdrlen, const void *buf, unsigned long len,
> +			      int hdrlen, struct input_stream *in_stream,
>  			      time_t mtime, unsigned flags)
>  {
>  	int fd, ret;
> @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	struct object_id parano_oid;
>  	static struct strbuf tmp_file = STRBUF_INIT;
>  	static struct strbuf filename = STRBUF_INIT;
> +	const void *buf;
> +	unsigned long len;
>  
>  	loose_object_path(the_repository, &filename, oid);
>  
> @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	the_hash_algo->update_fn(&c, hdr, hdrlen);
>  
>  	/* Then the data itself.. */
> +	buf = in_stream->read(in_stream, &len);
>  	stream.next_in = (void *)buf;
>  	stream.avail_in = len;
>  	do {
> @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  {
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen = sizeof(hdr);
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +	};
>  
>  	/* Normally if we have it in the pack then we do not bother writing
>  	 * it out into .git/objects/??/?{38} file.
> @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  				  &hdrlen);
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		return 0;
> -	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> +	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
>  }
>  
>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>  	char *header;
>  	int hdrlen, status = 0;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +	};
>  
>  	/* type string, SP, %lu of the length plus NUL must fit this */
>  	hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  		goto cleanup;
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		goto cleanup;
> -	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>  
>  cleanup:
>  	free(header);
> @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen;
>  	int ret;
> +	struct simple_input_stream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = &data,
> +	};
>  
>  	if (has_loose_object(oid))
>  		return 0;
>  	buf = read_object(the_repository, oid, &type, &len);
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
> +	data.buf = buf;
> +	data.len = len;
>  	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
>  	free(buf);
>  
>  	return ret;
> diff --git a/object-store.h b/object-store.h
> index 952efb6a4b..ccc1fc9c1a 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -34,6 +34,11 @@ struct object_directory {
>  	char *path;
>  };
>  
> +struct input_stream {
> +	const void *(*read)(struct input_stream *, unsigned long *len);
> +	void *data;
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>  	struct object_directory *, 1, fspathhash, fspatheq)

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-23 23:24   ` Junio C Hamano
@ 2021-11-24  9:00     ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-11-24  9:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Junio C Hamano <gitster@pobox.com> writes:

>
> Han Xin <chiyutianyi@gmail.com> writes:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > This can be improved by feeding data to "write_loose_object()" in a
> > stream. The input stream is implemented as an interface. In the first
> > step, we make a simple implementation, feeding the entire buffer in the
> > "stream" to "write_loose_object()" as a refactor.
>
> Possibly a stupid question (not a review).
>
> How does this compare with "struct git_istream" implemented for a
> few existing codepaths?  It seems that the existing users are
> pack-objects, index-pack and archive and all of them use the
> interface to obtain data given an object name without having to grab
> everything in core at once.
>
> If we are adding a new streaming interface to go in the opposite
> direction, i.e. from the working tree data to object store, I would
> understand it as a complementary interface (but then I suspect there
> is a half of it already in bulk-checkin API), but I am not sure how
> this new thing fits in the larger picture.
>

Thank you for your reply.

Before making this patch, I did consider reusing "struct git_istream"
to solve the problem, but I found that in the process of git
unpack-objects, the data comes from stdin, and we cannot get an oid
until the whole object data has been read. Also, we can't "lseek()" on
stdin to change the read position.

I compared the implementation of "bulk-checkin", and they do have some
similarities.
I think the difference in the reverse direction is that we do not
always know exactly where the boundary of the target data is. For
example, in the process of "unpack-objects", the "buffer" has already
been partially consumed after calling "fill()", and the remainder of
the "buffer" cannot be discarded because it is the beginning of the
next object.
Perhaps "struct input_stream" could also improve "index_bulk_checkin()",
so that it can read from an inner buffer in addition to reading from
"fd" when necessary.
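
To illustrate, here is a minimal sketch of such a read callback,
assuming the "struct input_stream" from this series; the struct
"buffered_fd_stream_data" and its fields are hypothetical:

	struct buffered_fd_stream_data {
		const unsigned char *buf;	/* leftover bytes, e.g. from fill() */
		unsigned long len;
		int fd;
		unsigned char chunk[4096];
	};

	static const void *feed_buffered_fd_stream(struct input_stream *in_stream,
						   unsigned long *readlen)
	{
		struct buffered_fd_stream_data *data = in_stream->data;
		ssize_t n;

		if (data->len) {
			/* drain the in-memory remainder before touching fd */
			const void *p = data->buf;
			*readlen = data->len;
			data->len = 0;
			return p;
		}
		n = xread(data->fd, data->chunk, sizeof(data->chunk));
		if (n <= 0) {
			*readlen = 0;	/* EOF or read error ends the stream */
			return NULL;
		}
		*readlen = n;
		return data->chunk;
	}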

>
>
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
> >  object-store.h |  5 +++++
> >  2 files changed, 51 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index c3d866a287..227f53a0de 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +struct simple_input_stream_data {
> > +     const void *buf;
> > +     unsigned long len;
> > +};
> > +
> > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> > +{
> > +     struct simple_input_stream_data *data = in_stream->data;
> > +
> > +     if (data->len == 0) {
> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +     *len = data->len;
> > +     data->len = 0;
> > +     return data->buf;
> > +}
> > +
> >  static int write_loose_object(const struct object_id *oid, char *hdr,
> > -                           int hdrlen, const void *buf, unsigned long len,
> > +                           int hdrlen, struct input_stream *in_stream,
> >                             time_t mtime, unsigned flags)
> >  {
> >       int fd, ret;
> > @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       struct object_id parano_oid;
> >       static struct strbuf tmp_file = STRBUF_INIT;
> >       static struct strbuf filename = STRBUF_INIT;
> > +     const void *buf;
> > +     unsigned long len;
> >
> >       loose_object_path(the_repository, &filename, oid);
> >
> > @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       the_hash_algo->update_fn(&c, hdr, hdrlen);
> >
> >       /* Then the data itself.. */
> > +     buf = in_stream->read(in_stream, &len);
> >       stream.next_in = (void *)buf;
> >       stream.avail_in = len;
> >       do {
> > @@ -1960,6 +1981,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
> >  {
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen = sizeof(hdr);
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +     };
> >
> >       /* Normally if we have it in the pack then we do not bother writing
> >        * it out into .git/objects/??/?{38} file.
> > @@ -1968,7 +1996,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
> >                                 &hdrlen);
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               return 0;
> > -     return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> > +     return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
> >  }
> >
> >  int hash_object_file_literally(const void *buf, unsigned long len,
> > @@ -1977,6 +2005,13 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >  {
> >       char *header;
> >       int hdrlen, status = 0;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +     };
> >
> >       /* type string, SP, %lu of the length plus NUL must fit this */
> >       hdrlen = strlen(type) + MAX_HEADER_LEN;
> > @@ -1988,7 +2023,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >               goto cleanup;
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               goto cleanup;
> > -     status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> > +     status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> >
> >  cleanup:
> >       free(header);
> > @@ -2003,14 +2038,21 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >       int ret;
> > +     struct simple_input_stream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = &data,
> > +     };
> >
> >       if (has_loose_object(oid))
> >               return 0;
> >       buf = read_object(the_repository, oid, &type, &len);
> >       if (!buf)
> >               return error(_("cannot read object for %s"), oid_to_hex(oid));
> > +     data.buf = buf;
> > +     data.len = len;
> >       hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> > -     ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> > +     ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
> >       free(buf);
> >
> >       return ret;
> > diff --git a/object-store.h b/object-store.h
> > index 952efb6a4b..ccc1fc9c1a 100644
> > --- a/object-store.h
> > +++ b/object-store.h
> > @@ -34,6 +34,11 @@ struct object_directory {
> >       char *path;
> >  };
> >
> > +struct input_stream {
> > +     const void *(*read)(struct input_stream *, unsigned long *len);
> > +     void *data;
> > +};
> > +
> >  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
> >       struct object_directory *, 1, fspathhash, fspatheq)

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
@ 2021-11-29  7:01   ` Han Xin
  2021-11-29 19:12     ` Jeff King
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-29  7:01 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley; +Cc: Han Xin

Han Xin <chiyutianyi@gmail.com> writes:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Although we do not recommend users push large binary files to the git repositories,
> it's difficult to prevent them from doing so. Once, we found a problem with a surge
> in memory usage on the server. The source of the problem is that a user submitted
> a single object with a size of 15GB. Once someone initiates a git push, the git
> process will immediately allocate 15G of memory, resulting in an OOM risk.
>
> Through further analysis, we found that when we execute git unpack-objects, in
> unpack_non_delta_entry(), "void *buf = get_data(size);" will directly allocate
> memory equal to the size of the object. This is quite a scary thing, because the
> pre-receive hook has not been executed at this time, and we cannot avoid this by hooks.
>
> I got inspiration from the deflate process of zlib; maybe it would be a good idea
> to change unpack-objects to stream deflate.
>

Hi, Jeff.

I hope you can share with me how GitHub solves this problem.

As you said in your reply at:
https://lore.kernel.org/git/YVaw6agcPNclhws8@coredump.intra.peff.net/
"we don't have a match in unpack-objects, but we always run index-pack
on incoming packs".

In the original implementation of "index-pack", for objects larger than
big_file_threshold, a "fixed_buf" of 8192 bytes is used to compute the
"oid" in a streaming fashion.
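
Roughly, it follows the classic streaming-hash loop; a sketch (not
index-pack's exact code, and "read_chunk()" is a hypothetical reader
for the inflated data):

	git_hash_ctx c;
	struct object_id oid;
	unsigned char fixed_buf[8192];
	ssize_t n;

	the_hash_algo->init_fn(&c);
	/* hash the object header first, then the data in fixed-size chunks */
	while ((n = read_chunk(fixed_buf, sizeof(fixed_buf))) > 0)
		the_hash_algo->update_fn(&c, fixed_buf, n);
	the_hash_algo->final_oid_fn(&oid, &c);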

I tried the implementation in jk/no-more-unpack-objects, as you noted:

  /* XXX This will expand too-large objects! */
  if (!data)
          data = new_data = get_data_from_pack(obj_entry);

If --unpack is given, there is a risk here.
When I create an object larger than 1GB and run index-pack, the
result is as follows:

  $ GIT_ALLOC_LIMIT=1024m git index-pack --unpack --stdin <large.pack
  fatal: attempting to allocate 1228800001 over limit 1073741824

Looking forward to your reply.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-11-29 15:10   ` Derrick Stolee
  2021-11-29 20:44     ` Junio C Hamano
  0 siblings, 1 reply; 211+ messages in thread
From: Derrick Stolee @ 2021-11-29 15:10 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley
  Cc: Han Xin

On 11/21/2021 10:32 PM, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle an undetermined oid in
> "write_loose_object()".
> 
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in the ".git/objects/"
> directory instead.

My first reaction is to not write into .git/objects/ directly, but
instead make a .git/objects/tmp/ directory and write within that
directory. The idea is to prevent leaving stale files in the
.git/objects/ directory if the process terminates strangely (say,
a power outage or segfault).

If this was an interesting idea to pursue, it does leave a question:
should we clean up the tmp/ directory when it is empty? That would
require adding a check in finalize_object_file() that is probably
best left unchecked (the lstat() would add a cost per loose object
write that is probably too costly). I would rather leave an empty
tmp/ directory than add that cost per loose object write.

I suppose another way to do it would be to register the check as
an event at the end of the process, so we only check once, and
that only happens if we created a loose object with this streaming
method.

With all of these complications in mind, I think cleaning up the
stale tmp/ directory could (at the very least) be delayed to another
commit or patch series. Hopefully adding the directory is not too
much complication to add here.
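
For what it's worth, a minimal sketch of that end-of-process check,
assuming a hypothetical handler registered (once) after the first
streamed write; rmdir() fails harmlessly when the directory is not
empty:

	static void remove_tmp_objdir_if_empty(void)
	{
		struct strbuf dir = STRBUF_INIT;
		strbuf_addf(&dir, "%s/tmp", the_repository->objects->odb->path);
		rmdir(dir.buf);	/* only succeeds when empty; errors ignored */
		strbuf_release(&dir);
	}

	/* somewhere after the first streamed object write: */
	atexit(remove_tmp_objdir_if_empty);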

> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);
> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');

Here, you could instead of the strbuf_addch() do

	strbuf_add(&filename, "/tmp/", 5);
	if (safe_create_leading_directories(filename.buf)) {
		error(_("failed to create '%s'"));
		strbuf_release(&filename);
		return -1;
	}		

> +	} else {
> +		loose_object_path(the_repository, &filename, oid);
> +	}
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
> @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>  
>  	close_loose_object(fd);
>  
> +	if (is_null_oid(oid)) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);
> +		loose_object_path(the_repository, &filename, oid);
> +
> +		/* We finally know the object path, and create the missing dir. */
> +		dirlen = directory_size(filename.buf);
> +		if (dirlen) {
> +			struct strbuf dir = STRBUF_INIT;
> +			strbuf_add(&dir, filename.buf, dirlen - 1);
> +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +				return -1;
> +			if (adjust_shared_perm(dir.buf))
> +				return -1;
> +			strbuf_release(&dir);
> +		}
> +	}
> +

Upon first reading I was asking "where is the file rename?" but
it is part of finalize_object_file() which is called further down.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-11-29 17:37   ` Derrick Stolee
  2021-11-30 13:49     ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Derrick Stolee @ 2021-11-29 17:37 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley
  Cc: Han Xin

On 11/21/2021 10:32 PM, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
> 
> By implementing a zstream version of the input_stream interface, we can
> use a small fixed buffer for "unpack_non_delta_entry()".
> 
> However, unpacking non-delta objects from a stream instead of from an
> entire buffer incurs about a 10% performance penalty. Therefore, only
> unpack objects larger than the "big_file_threshold" via zstream. See
> the following benchmarks:
> 
>     $ hyperfine \
>     --prepare 'rm -rf dest.git && git init --bare dest.git' \
>     'git -C dest.git unpack-objects <binary_320M.pack'
>     Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
>       Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
>       Range (min … max):    9.786 s … 10.603 s    10 runs
> 
>     $ hyperfine \
>     --prepare 'rm -rf dest.git && git init --bare dest.git' \
>     'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
>     Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
>       Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
>       Range (min … max):    9.884 s … 12.192 s    10 runs

It seems that you want us to compare this pair of results, and
hyperfine can assist with that by including multiple benchmarks
(with labels, using '-n') as follows:

$ hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
        -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
        -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'

Benchmark 1: old
  Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
  Range (min … max):   20.741 s … 20.909 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
  Range (min … max):   26.419 s … 26.611 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
  Range (min … max):   26.416 s … 26.739 s    10 runs
 
Summary
  'old' ran
    1.27 ± 0.00 times faster than 'new'
    1.27 ± 0.01 times faster than 'new (small threshold)'

(Here, 'old' is testing a compiled version of the latest 'master'
branch, while 'new' has your patches applied on top.)

Notice from this example I had a pack with many small objects (mostly
commits and trees) and I see that this change introduces significant
overhead to this case.

It would be nice to understand this overhead and fix it before taking
this change any further.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-29  7:01   ` Han Xin
@ 2021-11-29 19:12     ` Jeff King
  2021-11-30  2:57       ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Jeff King @ 2021-11-29 19:12 UTC (permalink / raw)
  To: Han Xin; +Cc: Junio C Hamano, Git List, Jiang Xin, Philip Oakley, Han Xin

On Mon, Nov 29, 2021 at 03:01:47PM +0800, Han Xin wrote:

> Han Xin <chiyutianyi@gmail.com> writes:
> >
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > Although we do not recommend users push large binary files to the git repositories,
> > it's difficult to prevent them from doing so. Once, we found a problem with a surge
> > in memory usage on the server. The source of the problem is that a user submitted
> > a single object with a size of 15GB. Once someone initiates a git push, the git
> > process will immediately allocate 15G of memory, resulting in an OOM risk.
> >
> > Through further analysis, we found that when we execute git unpack-objects, in
> > unpack_non_delta_entry(), "void *buf = get_data(size);" will directly allocate
> > memory equal to the size of the object. This is quite a scary thing, because the
> > pre-receive hook has not been executed at this time, and we cannot avoid this by hooks.
> >
> > I got inspiration from the deflate process of zlib; maybe it would be a good idea
> > to change unpack-objects to stream deflate.
> >
> 
> Hi, Jeff.
> 
> I hope you can share with me how GitHub solves this problem.
> 
> As you said in your reply at:
> https://lore.kernel.org/git/YVaw6agcPNclhws8@coredump.intra.peff.net/
> "we don't have a match in unpack-objects, but we always run index-pack
> on incoming packs".
> 
> In the original implementation of "index-pack", for objects larger than
> big_file_threshold, a "fixed_buf" of 8192 bytes is used to compute the
> "oid" in a streaming fashion.

We set transfer.unpackLimit to "1", so we never run unpack-objects at
all. We always run index-pack, and every push, no matter how small,
results in a pack.

We also set GIT_ALLOC_LIMIT to limit any single allocation. We also have
custom code in index-pack to detect large objects (where our definition
of "large" is 100MB by default):

  - for large blobs, we do index it as normal, writing the oid out to a
    file which is then processed by a pre-receive hook (since people
    often push up large files accidentally, the hook generates a nice
    error message, including finding the path at which the blob is
    referenced)

  - for other large objects, we die immediately (with an error message).
    100MB commit messages aren't a common user error, and it closes off
    a whole set of possible integer-overflow parsing attacks (e.g.,
    index-pack in strict-mode will run every tree through fsck_tree(),
    so there's otherwise nothing stopping you from having a 4GB filename
    in a tree).
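
For illustration only (this is not our actual code, and
"big_object_limit" and "report_large_blob()" are made-up names), the
checks amount to something like this in index-pack's object callback:

	if (size > big_object_limit) {	/* 100MB by default */
		if (type == OBJ_BLOB)
			/* index as normal; the pre-receive hook reports it */
			report_large_blob(oid);
		else
			die(_("%s object %s exceeds the size limit"),
			    type_name(type), oid_to_hex(oid));
	}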

> I tried the implementation in jk/no-more-unpack-objects, as you noted:
>
>   /* XXX This will expand too-large objects! */
>   if (!data)
>           data = new_data = get_data_from_pack(obj_entry);
>
> If --unpack is given, there is a risk here.
> When I create an object larger than 1GB and run index-pack, the
> result is as follows:
>
>   $ GIT_ALLOC_LIMIT=1024m git index-pack --unpack --stdin <large.pack
>   fatal: attempting to allocate 1228800001 over limit 1073741824

Yeah, that issue was one of the reasons I never sent the "index-pack
--unpack" code to the list. We don't actually use those patches at
GitHub. It was something I was working on for upstream but never
finished.

-Peff

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 15:10   ` Derrick Stolee
@ 2021-11-29 20:44     ` Junio C Hamano
  2021-11-29 22:18       ` Derrick Stolee
  0 siblings, 1 reply; 211+ messages in thread
From: Junio C Hamano @ 2021-11-29 20:44 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

Derrick Stolee <stolee@gmail.com> writes:

> My first reaction is to not write into .git/objects/ directly, but
> instead make a .git/objects/tmp/ directory and write within that
> directory. The idea is to prevent leaving stale files in the
> .git/objects/ directory if the process terminates strangely (say,
> a power outage or segfault).

Even if we know the name of the object we are writing beforehand, I
do not think it is a good idea to open-write-close the final object
file.  The approach we already use everywhere is to write into a
tmpfile/lockfile and rename it to the final name.

object-file.c::write_loose_object() uses create_tmpfile() to prepare
a temporary file whose name begins with "tmp_obj_", so that "gc" can
recognize stale ones and remove them.
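
Simplified, the pattern in write_loose_object() is:

	fd = create_tmpfile(&tmp_file, filename.buf);	/* "tmp_obj_XXXXXX" */
	/* ... deflate the contents and write them to fd ... */
	close_loose_object(fd);
	/* move the complete tmpfile to its final name */
	finalize_object_file(tmp_file.buf, filename.buf);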

> If this was an interesting idea to pursue, it does leave a question:
> should we clean up the tmp/ directory when it is empty? That would
> require adding a check in finalize_object_file() that is probably
> best left unchecked (the lstat() would add a cost per loose object
> write that is probably too costly). I would rather leave an empty
> tmp/ directory than add that cost per loose object write.

I am not sure why we want a new tmp/ directory.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 20:44     ` Junio C Hamano
@ 2021-11-29 22:18       ` Derrick Stolee
  2021-11-30  3:23         ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Derrick Stolee @ 2021-11-29 22:18 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley, Han Xin

On 11/29/2021 3:44 PM, Junio C Hamano wrote:
> Derrick Stolee <stolee@gmail.com> writes:
> 
>> My first reaction is to not write into .git/objects/ directly, but
>> instead make a .git/objects/tmp/ directory and write within that
>> directory. The idea is to prevent leaving stale files in the
>> .git/objects/ directory if the process terminates strangely (say,
>> a power outage or segfault).
> 
> Even if we know the name of the object we are writing beforehand, I
> do not think it is a good idea to open-write-close the final object
> file.  The approach we already use everywhere is to write into a
> tmpfile/lockfile and rename it to the final name.
> 
> object-file.c::write_loose_object() uses create_tmpfile() to prepare
> a temporary file whose name begins with "tmp_obj_", so that "gc" can
> recognize stale ones and remove them.

The only difference is that the tmp_obj_* file would go into the
loose object directory corresponding to the first two hex characters
of the OID, but that no longer happens now.
 
>> If this was an interesting idea to pursue, it does leave a question:
>> should we clean up the tmp/ directory when it is empty? That would
>> require adding a check in finalize_object_file() that is probably
>> best left unchecked (the lstat() would add a cost per loose object
>> write that is probably too costly). I would rather leave an empty
>> tmp/ directory than add that cost per loose object write.
> 
> I am not sure why we want a new tmp/ directory.

I'm just thinking of a case where this fails repeatedly; I would
rather have those failed tmp_obj_* files isolated in their own
directory. It's an extremely minor point, so I'm fine to drop
the recommendation.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 0/5] unpack large objects in stream
  2021-11-29 19:12     ` Jeff King
@ 2021-11-30  2:57       ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-11-30  2:57 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Git List, Jiang Xin, Philip Oakley, Han Xin

On Tue, Nov 30, 2021 at 3:12 AM Jeff King <peff@peff.net> wrote:
> We set transfer.unpackLimit to "1", so we never run unpack-objects at
> all. We always run index-pack, and every push, no matter how small,
> results in a pack.
>
> We also set GIT_ALLOC_LIMIT to limit any single allocation. We also have
> custom code in index-pack to detect large objects (where our definition
> of "large" is 100MB by default):
>
>   - for large blobs, we do index it as normal, writing the oid out to a
>     file which is then processed by a pre-receive hook (since people
>     often push up large files accidentally, the hook generates a nice
>     error message, including finding the path at which the blob is
>     referenced)
>
>   - for other large objects, we die immediately (with an error message).
>     100MB commit messages aren't a common user error, and it closes off
>     a whole set of possible integer-overflow parsing attacks (e.g.,
>     index-pack in strict-mode will run every tree through fsck_tree(),
>     so there's otherwise nothing stopping you from having a 4GB filename
>     in a tree).

Thank you very much for sharing.

The way GitHub handles it reminds me of what Shawn Pearce introduced in
"Scaling up JGit". I guess "multi-pack-index" and "bitmap" must play an
important role in this.

I will seriously consider this solution, thanks a lot.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-29 22:18       ` Derrick Stolee
@ 2021-11-30  3:23         ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-11-30  3:23 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On Tue, Nov 30, 2021 at 6:18 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 11/29/2021 3:44 PM, Junio C Hamano wrote:
> > Derrick Stolee <stolee@gmail.com> writes:
> >
> >> My first reaction is to not write into .git/objects/ directly, but
> >> instead make a .git/objects/tmp/ directory and write within that
> >> directory. The idea is to prevent leaving stale files in the
> >> .git/objects/ directory if the process terminates strangely (say,
> >> a power outage or segfault).
> >
> > Even if we know the name of the object we are writing beforehand, I
> > do not think it is a good idea to open-write-close the final object
> > file.  The approach we already use everywhere is to write into a
> > tmpfile/lockfile and rename it to the final name
> >
> > object-file.c::write_loose_object() uses create_tmpfile() to prepare
> > a temporary file whose name begins with "tmp_obj_", so that "gc" can
> > recognize stale ones and remove them.
>
> The only difference is that the tmp_obj_* file would go into the
> loose object directory corresponding to the first two hex characters
> of the OID, but that no longer happens now.
>

At the beginning of this patch, I did save the temporary object in the
two-hex-character directory derived from "null_oid", but that is also
very strange behavior. "gc" will indeed clean up these tmp_obj_* files,
whether they are in .git/objects/ or .git/objects/xx.

Thanks,
-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-29 17:37   ` Derrick Stolee
@ 2021-11-30 13:49     ` Han Xin
  2021-11-30 18:38       ` Derrick Stolee
  0 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-11-30 13:49 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On Tue, Nov 30, 2021 at 1:37 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 11/21/2021 10:32 PM, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > By implementing a zstream version of the input_stream interface, we can
> > use a small fixed buffer for "unpack_non_delta_entry()".
> >
> > However, unpacking non-delta objects from a stream instead of from an
> > entire buffer incurs about a 10% performance penalty. Therefore, only
> > unpack objects larger than the "big_file_threshold" via zstream. See
> > the following benchmarks:
> >
> >     $ hyperfine \
> >     --prepare 'rm -rf dest.git && git init --bare dest.git' \
> >     'git -C dest.git unpack-objects <binary_320M.pack'
> >     Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
> >       Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
> >       Range (min … max):    9.786 s … 10.603 s    10 runs
> >
> >     $ hyperfine \
> >     --prepare 'rm -rf dest.git && git init --bare dest.git' \
> >     'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
> >     Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
> >       Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
> >       Range (min … max):    9.884 s … 12.192 s    10 runs
>
> It seems that you want us to compare this pair of results, and
> hyperfine can assist with that by including multiple benchmarks
> (with labels, using '-n') as follows:
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
>   Range (min … max):   20.741 s … 20.909 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
>   Range (min … max):   26.419 s … 26.611 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
>   Range (min … max):   26.416 s … 26.739 s    10 runs
>
> Summary
>   'old' ran
>     1.27 ± 0.00 times faster than 'new'
>     1.27 ± 0.01 times faster than 'new (small threshold)'
>
> (Here, 'old' is testing a compiled version of the latest 'master'
> branch, while 'new' has your patches applied on top.)
>
> Notice from this example I had a pack with many small objects (mostly
> commits and trees) and I see that this change introduces significant
> overhead to this case.
>
> It would be nice to understand this overhead and fix it before taking
> this change any further.
>
> Thanks,
> -Stolee

Can you share the details of the repository you tested, so that I can
analyze it further?

I tested this repository, but did not see the problem:

 Unpacking objects: 100% (18345/18345), 43.15 MiB

hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' 'git -C dest.git unpack-objects <big.pack' \
        -n 'new' 'new/git -C dest.git unpack-objects <big.pack' \
        -n 'new (small threshold)' 'new/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
Benchmark 1: old
  Time (mean ± σ):     17.403 s ±  0.880 s    [User: 4.996 s, System: 11.803 s]
  Range (min … max):   15.911 s … 19.368 s    10 runs

Benchmark 2: new
  Time (mean ± σ):     17.788 s ±  0.199 s    [User: 5.054 s, System: 12.257 s]
  Range (min … max):   17.420 s … 18.195 s    10 runs

Benchmark 3: new (small threshold)
  Time (mean ± σ):     18.433 s ±  0.711 s    [User: 4.982 s, System: 12.338 s]
  Range (min … max):   17.518 s … 19.775 s    10 runs

Summary
  'old' ran
    1.02 ± 0.05 times faster than 'new'
    1.06 ± 0.07 times faster than 'new (small threshold)'

Thanks,
- Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-30 13:49     ` Han Xin
@ 2021-11-30 18:38       ` Derrick Stolee
  2021-12-01 20:37         ` "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...]) Ævar Arnfjörð Bjarmason
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  0 siblings, 2 replies; 211+ messages in thread
From: Derrick Stolee @ 2021-11-30 18:38 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On 11/30/2021 8:49 AM, Han Xin wrote:
> On Tue, Nov 30, 2021 at 1:37 AM Derrick Stolee <stolee@gmail.com> wrote:
>> $ hyperfine \
>>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>>
>> Benchmark 1: old
>>   Time (mean ± σ):     20.835 s ±  0.058 s    [User: 14.510 s, System: 6.284 s]
>>   Range (min … max):   20.741 s … 20.909 s    10 runs
>>
>> Benchmark 2: new
>>   Time (mean ± σ):     26.515 s ±  0.072 s    [User: 19.783 s, System: 6.696 s]
>>   Range (min … max):   26.419 s … 26.611 s    10 runs
>>
>> Benchmark 3: new (small threshold)
>>   Time (mean ± σ):     26.523 s ±  0.101 s    [User: 19.805 s, System: 6.680 s]
>>   Range (min … max):   26.416 s … 26.739 s    10 runs
>>
>> Summary
>>   'old' ran
>>     1.27 ± 0.00 times faster than 'new'
>>     1.27 ± 0.01 times faster than 'new (small threshold)'
>>
>> (Here, 'old' is testing a compiled version of the latest 'master'
>> branch, while 'new' has your patches applied on top.)
>>
>> Notice from this example I had a pack with many small objects (mostly
>> commits and trees) and I see that this change introduces significant
>> overhead to this case.
>>
>> It would be nice to understand this overhead and fix it before taking
>> this change any further.
>>
>> Thanks,
>> -Stolee
> 
> Could you share the details of the repository you tested, so that I
> can analyze it further?

I used a pack-file from an internal repo. It happened to be using
partial clone, so here is a repro with the git/git repository
after cloning this way:

$ git clone --no-checkout --filter=blob:none https://github.com/git/git

(copy the large .pack from git/.git/objects/pack/ to big.pack)

$ hyperfine \
	--prepare 'rm -rf dest.git && git init --bare dest.git' \
	-n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
	-n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
	-n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'

Benchmark 1: old
  Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
  Range (min … max):   82.042 s … 83.587 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
  Range (min … max):   100.866 s … 102.633 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
  Range (min … max):   100.639 s … 101.375 s    10 runs
 
Summary
  'old' ran
    1.22 ± 0.01 times faster than 'new (small threshold)'
    1.23 ± 0.01 times faster than 'new'

I'm also able to repro this with a smaller repo (microsoft/scalar)
so the tests complete much faster:

$ hyperfine \
        --prepare 'rm -rf dest.git && git init --bare dest.git' \
        -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <small.pack' \
        -n 'new' '~/_git/git/git -C dest.git unpack-objects <small.pack' \
        -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <small.pack'

Benchmark 1: old
  Time (mean ± σ):      3.295 s ±  0.023 s    [User: 1.063 s, System: 2.228 s]
  Range (min … max):    3.269 s …  3.351 s    10 runs
 
Benchmark 2: new
  Time (mean ± σ):      3.592 s ±  0.105 s    [User: 1.261 s, System: 2.328 s]
  Range (min … max):    3.378 s …  3.679 s    10 runs
 
Benchmark 3: new (small threshold)
  Time (mean ± σ):      3.584 s ±  0.144 s    [User: 1.241 s, System: 2.339 s]
  Range (min … max):    3.359 s …  3.747 s    10 runs
 
Summary
  'old' ran
    1.09 ± 0.04 times faster than 'new (small threshold)'
    1.09 ± 0.03 times faster than 'new'

It's not the same relative overhead, but still significant.

These pack-files contain (mostly) small objects, no large blobs.
I know that's not the target of your efforts, but it would be
good to avoid a regression here.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 211+ messages in thread

* "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...])
  2021-11-30 18:38       ` Derrick Stolee
@ 2021-12-01 20:37         ` Ævar Arnfjörð Bjarmason
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-01 20:37 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Han Xin, David Peter


I hadn't sent a shameless plug for my "git hyperfine" script to the
list; perhaps this is a good time. It's just a thin shellscript wrapper
around "hyperfine" that I wrote the other day, which...

On Tue, Nov 30 2021, Derrick Stolee wrote:

> [...]
> I used a pack-file from an internal repo. It happened to be using
> partial clone, so here is a repro with the git/git repository
> after cloning this way:
>
> $ git clone --no-checkout --filter=blob:none https://github.com/git/git
>
> (copy the large .pack from git/.git/objects/pack/ to big.pack)
>
> $ hyperfine \
> 	--prepare 'rm -rf dest.git && git init --bare dest.git' \
> 	-n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
> 	-n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
> 	-n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
>   Range (min … max):   82.042 s … 83.587 s    10 runs
>  
> Benchmark 2: new
>   Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
>   Range (min … max):   100.866 s … 102.633 s    10 runs
>  
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
>   Range (min … max):   100.639 s … 101.375 s    10 runs
>  
> Summary
>   'old' ran
>     1.22 ± 0.01 times faster than 'new (small threshold)'
>     1.23 ± 0.01 times faster than 'new'

...adds enough sugar around "hyperfine" itself to do this as e.g. (the
"-s" is a feature I submitted to hyperfine itself; it's not in a release
yet[1], but in this case you could also use "-p"):

    git hyperfine -L rev v2.20.0,origin/master \
        -s 'if ! test -d redis.git; then git clone --bare --filter=blob:none https://github.com/redis/redis; fi && make' \
        -p 'rm -rf dest.git; git init --bare dest.git' \
        './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)'

The sugar being that for each named "rev" parameter it'll set up "git
worktree" for you, so under the hood each of those is chdir-ing to the
respective revision of:
    
    $ git worktree list
    [...]
    /run/user/1001/git-hyperfine/origin/master  abe6bb39053 (detached HEAD)
    /run/user/1001/git-hyperfine/v2.33.0        225bc32a989 (detached HEAD)

That they're named revisions and not git-rev-parse'd is intentional,
since you'll benefit from faster incremental "make" (even if using
"ccache"). I'm typically benchmarking HEAD~1,HEAD~0.

The output will then use those "rev" parameters, and be e.g.:
    
    Benchmark 1: ./git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'v2.20.0
      Time (mean ± σ):      6.678 s ±  0.046 s    [User: 4.525 s, System: 2.117 s]
      Range (min … max):    6.619 s …  6.765 s    10 runs
     
    Benchmark 2: ./git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'origin/master
      Time (mean ± σ):      6.756 s ±  0.074 s    [User: 4.586 s, System: 2.134 s]
      Range (min … max):    6.691 s …  6.941 s    10 runs
     
    Summary
      './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'v2.20.0' ran
        1.01 ± 0.01 times faster than './git -C dest.git unpack-objects <$(echo redis.git/objects/pack/*.pack)' in 'origin/master'

I think if you're routinely benchmarking N different git versions you'll
find it handy. It also has configurable hook support (using git config),
so e.g. it's easy to copy your config.mak in-place in the
worktrees. E.g. my config is:

    $ git -P config --get-regexp '^hyperfine'
    hyperfine.run-dir $XDG_RUNTIME_DIR/git-hyperfine
    hyperfine.xargs-options -r
    hyperfine.hook.setup ~/g/git.meta/config.mak.sh

It's hosted at https://github.com/avar/git-hyperfine/ and
https://gitlab.com/avar/git-hyperfine/; It's implemented in (portable)
POSIX shell script.

There are surely some bugs in it; one known issue is that unlike
hyperfine it doesn't accept spaces in the parameters to -L, because
I'm getting some quoting-within-quoting wrong in the (shellscript)
implementation (suggestions for that particular one are most welcome).

I hacked it up after this suggestion from Jeff King[2] of moving t/perf
over to it.

I haven't done any of that legwork, but I think a wrapper like
"git-hyperfine" that prepares worktrees for the N revisions we're
benchmarking is a good direction to go in.

We don't use git-worktrees in t/perf, but probably could for most/all
tests. In any case it would be easy to have the script set up the revs to
be benchmarked in some hookable custom manner to have it do exactly what
t/perf/run is doing now.

1. https://github.com/sharkdp/hyperfine/commit/017d55a
2. https://lore.kernel.org/git/YV+zFqi4VmBVJYex@coredump.intra.peff.net/

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-30 18:38       ` Derrick Stolee
  2021-12-01 20:37         ` "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...]) Ævar Arnfjörð Bjarmason
@ 2021-12-02  7:33         ` Han Xin
  2021-12-02 13:53           ` Derrick Stolee
  1 sibling, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-02  7:33 UTC (permalink / raw)
  To: Derrick Stolee
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On Wed, Dec 1, 2021 at 2:38 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> I used a pack-file from an internal repo. It happened to be using
> partial clone, so here is a repro with the git/git repository
> after cloning this way:
>
> $ git clone --no-checkout --filter=blob:none https://github.com/git/git
>
> (copy the large .pack from git/.git/objects/pack/ to big.pack)
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <big.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <big.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <big.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):     82.748 s ±  0.445 s    [User: 50.512 s, System: 32.049 s]
>   Range (min … max):   82.042 s … 83.587 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):     101.644 s ±  0.524 s    [User: 67.470 s, System: 34.047 s]
>   Range (min … max):   100.866 s … 102.633 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):     101.093 s ±  0.269 s    [User: 67.404 s, System: 33.559 s]
>   Range (min … max):   100.639 s … 101.375 s    10 runs
>
> Summary
>   'old' ran
>     1.22 ± 0.01 times faster than 'new (small threshold)'
>     1.23 ± 0.01 times faster than 'new'
>
> I'm also able to repro this with a smaller repo (microsoft/scalar)
> so the tests complete much faster:
>
> $ hyperfine \
>         --prepare 'rm -rf dest.git && git init --bare dest.git' \
>         -n 'old' '~/_git/git-upstream/git -C dest.git unpack-objects <small.pack' \
>         -n 'new' '~/_git/git/git -C dest.git unpack-objects <small.pack' \
>         -n 'new (small threshold)' '~/_git/git/git -c core.bigfilethreshold=64k -C dest.git unpack-objects <small.pack'
>
> Benchmark 1: old
>   Time (mean ± σ):      3.295 s ±  0.023 s    [User: 1.063 s, System: 2.228 s]
>   Range (min … max):    3.269 s …  3.351 s    10 runs
>
> Benchmark 2: new
>   Time (mean ± σ):      3.592 s ±  0.105 s    [User: 1.261 s, System: 2.328 s]
>   Range (min … max):    3.378 s …  3.679 s    10 runs
>
> Benchmark 3: new (small threshold)
>   Time (mean ± σ):      3.584 s ±  0.144 s    [User: 1.241 s, System: 2.339 s]
>   Range (min … max):    3.359 s …  3.747 s    10 runs
>
> Summary
>   'old' ran
>     1.09 ± 0.04 times faster than 'new (small threshold)'
>     1.09 ± 0.03 times faster than 'new'
>
> It's not the same relative overhead, but still significant.
>
> These pack-files contain (mostly) small objects, no large blobs.
> I know that's not the target of your efforts, but it would be
> good to avoid a regression here.
>
> Thanks,
> -Stolee

With your help, I did catch this performance problem, which was
introduced in this patch:
https://lore.kernel.org/git/20211122033220.32883-4-chiyutianyi@gmail.com/

That patch changes the original whole-buffer reading into stream
reading, but the problem is that even when the whole object data is
read into a single buffer, it still performs an additional
git_deflate() round trip and the associated data transfer.

I will fix it in a follow-up patch.
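
Condensed, as a sketch of my reading: the v3 code only requests
Z_FINISH after one extra read() has returned NULL, so even an object
that arrives in a single buffer pays for an additional git_deflate()
pass, while the fix can mark the last (or only) chunk up front by
comparing sizes:

    /* v3: flush is only set after an extra, empty read() */
    buf = in_stream->read(in_stream, &stream.avail_in);
    if (!buf)
            flush = Z_FINISH;

    /* planned fix: the last (or only) chunk is known immediately */
    if (in_stream->size + hdrlen == stream.total_in + stream.avail_in)
            flush = Z_FINISH;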

Thanks,
-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-02 13:53           ` Derrick Stolee
  0 siblings, 0 replies; 211+ messages in thread
From: Derrick Stolee @ 2021-12-02 13:53 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Han Xin

On 12/2/2021 2:33 AM, Han Xin wrote:
> On Wed, Dec 1, 2021 at 2:38 AM Derrick Stolee <stolee@gmail.com> wrote:
>> These pack-files contain (mostly) small objects, no large blobs.
>> I know that's not the target of your efforts, but it would be
>> good to avoid a regression here.
>>
>> Thanks,
>> -Stolee
> 
> With your help, I did catch this performance problem, which was
> introduced in this patch:
> https://lore.kernel.org/git/20211122033220.32883-4-chiyutianyi@gmail.com/
> 
> This patch changes the original data reading ino to stream reading, but
> its problem is that even for the original reading of the whole object data,
> it still generates an additional git_deflate() and subsequent transfer.

I'm glad you found it!

> I will fix it in a follow-up patch.

Looking forward to it.

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v4 0/5] unpack large objects in stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
  2021-11-29  7:01   ` Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-07 16:18     ` Derrick Stolee
                       ` (7 more replies)
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
                     ` (4 subsequent siblings)
  6 siblings, 8 replies; 211+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v3:
* Add "size" to "struct input_stream" which used by following commits.

* Increase the buffer size of "struct input_zstream_data" from 4096 to
  8192, consistent with the "fixed_buf" in "index-pack.c".

* Refactor "read stream in a loop in write_loose_object()" which
  introduced a performance problem reported by Derrick Stolee[1].

* Rewrite benchmarks in "unpack-objects: unpack_non_delta_entry() read
  data in a stream" with suggestions from Derrick Stolee[1] and
  Ævar Arnfjörð Bjarmason[2].
  Now use "scalar.git" for the benchmark; it contains more than 28000
  objects, 96 of which are larger than 16kB.

1. https://lore.kernel.org/git/8ff89e50-1b80-7932-f0e2-af401ee04bb1@gmail.com/
2. https://lore.kernel.org/git/211201.86r1aw9gbd.gmgdl@evledraar.gmail.com/

Han Xin (5):
  object-file: refactor write_loose_object() to read buffer from stream
  object-file.c: handle undetermined oid in write_loose_object()
  object-file.c: read stream in a loop in write_loose_object()
  unpack-objects.c: add dry_run mode for get_data()
  unpack-objects: unpack_non_delta_entry() read data in a stream

 builtin/unpack-objects.c            |  93 +++++++++++++++++++++++--
 object-file.c                       | 102 ++++++++++++++++++++++++----
 object-store.h                      |  10 +++
 t/t5590-unpack-non-delta-objects.sh |  76 +++++++++++++++++++++
 4 files changed, 262 insertions(+), 19 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v3:
1:  8640b04f6d ! 1:  af707ef304 object-file: refactor write_loose_object() to read buffer from stream
    @@ object-file.c: int write_object_file_flags(const void *buf, unsigned long len,
     +			.buf = buf,
     +			.len = len,
     +		},
    ++		.size = len,
     +	};
      
      	/* Normally if we have it in the pack then we do not bother writing
    @@ object-file.c: int hash_object_file_literally(const void *buf, unsigned long len
     +			.buf = buf,
     +			.len = len,
     +		},
    ++		.size = len,
     +	};
      
      	/* type string, SP, %lu of the length plus NUL must fit this */
    @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime)
      	if (has_loose_object(oid))
      		return 0;
      	buf = read_object(the_repository, oid, &type, &len);
    ++	in_stream.size = len;
      	if (!buf)
      		return error(_("cannot read object for %s"), oid_to_hex(oid));
     +	data.buf = buf;
    @@ object-store.h: struct object_directory {
     +struct input_stream {
     +	const void *(*read)(struct input_stream *, unsigned long *len);
     +	void *data;
    ++	size_t size;
     +};
     +
      KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
2:  d4a2caf2bd = 2:  321ad90d8e object-file.c: handle undetermined oid in write_loose_object()
3:  2575900449 ! 3:  1992ac39af object-file.c: read stream in a loop in write_loose_object()
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -		ret = git_deflate(&stream, Z_FINISH);
     +		if (!stream.avail_in) {
     +			buf = in_stream->read(in_stream, &stream.avail_in);
    -+			if (buf) {
    -+				stream.next_in = (void *)buf;
    -+				in0 = (unsigned char *)buf;
    -+			} else {
    ++			stream.next_in = (void *)buf;
    ++			in0 = (unsigned char *)buf;
    ++			/* All data has been read. */
    ++			if (in_stream->size + hdrlen == stream.total_in + stream.avail_in)
     +				flush = Z_FINISH;
    -+			}
     +		}
     +		ret = git_deflate(&stream, flush);
      		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
      		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
      			die(_("unable to write loose object file"));
    + 		stream.next_out = compressed;
    + 		stream.avail_out = sizeof(compressed);
    +-	} while (ret == Z_OK);
    ++	} while (ret == Z_OK || ret == Z_BUF_ERROR);
    + 
    + 	if (ret != Z_STREAM_END)
    + 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
4:  ca93ecc780 = 4:  c41eb06533 unpack-objects.c: add dry_run mode for get_data()
5:  39a072ee2a ! 5:  9427775bdc unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ Commit message
         larger than the "big_file_threshold" in zstream. See the following
         benchmarks:
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -C dest.git unpack-objects <binary_320M.pack'
    -        Benchmark 1: git -C dest.git unpack-objects <binary_320M.pack
    -          Time (mean ± σ):     10.029 s ±  0.270 s    [User: 8.265 s, System: 1.522 s]
    -          Range (min … max):    9.786 s … 10.603 s    10 runs
    +        hyperfine \
    +          --setup \
    +          'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
    +          --prepare 'rm -rf dest.git && git init --bare dest.git' \
    +          -n 'old' 'git -C dest.git unpack-objects <small.pack' \
    +          -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
    +          -n 'new (small threshold)' \
    +          'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
    +        Benchmark 1: old
    +          Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
    +          Range (min … max):    6.018 s …  6.189 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack'
    -        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_320M.pack
    -          Time (mean ± σ):     10.859 s ±  0.774 s    [User: 8.813 s, System: 1.898 s]
    -          Range (min … max):    9.884 s … 12.192 s    10 runs
    +        Benchmark 2: new
    +          Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
    +          Range (min … max):    6.030 s …  6.142 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -C dest.git unpack-objects <binary_96M.pack'
    -        Benchmark 1: git -C dest.git unpack-objects <binary_96M.pack
    -          Time (mean ± σ):      2.678 s ±  0.037 s    [User: 2.205 s, System: 0.450 s]
    -          Range (min … max):    2.639 s …  2.743 s    10 runs
    +        Benchmark 3: new (small threshold)
    +          Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
    +          Range (min … max):    6.711 s …  6.809 s    10 runs
     
    -        $ hyperfine \
    -        --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -        'git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack'
    -        Benchmark 1: git -c core.bigFileThreshold=2m -C dest.git unpack-objects <binary_96M.pack
    -          Time (mean ± σ):      2.819 s ±  0.124 s    [User: 2.216 s, System: 0.564 s]
    -          Range (min … max):    2.679 s …  3.125 s    10 runs
    +        Summary
    +          'old' ran
    +            1.00 ± 0.01 times faster than 'new'
    +            1.11 ± 0.01 times faster than 'new (small threshold)'
     
    +    Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      
     +struct input_zstream_data {
     +	git_zstream *zstream;
    -+	unsigned char buf[4096];
    ++	unsigned char buf[8192];
     +	int status;
     +};
     +
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	struct input_stream in_stream = {
     +		.read = feed_input_zstream,
     +		.data = &data,
    ++		.size = size,
     +	};
     +	struct object_id *oid = &obj_list[nr].oid;
     +	int ret;
-- 
2.34.0


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
  2021-11-29  7:01   ` Han Xin
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface. As a first
step, we provide a simple implementation that feeds the entire buffer
through the "stream" to "write_loose_object()", as a pure refactor.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
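A note on the pattern used below (illustration only, not part of the
patch): the callers build their stream from a C99 compound literal;
such a literal has automatic storage that lives until the end of the
enclosing block, so taking its address for "in_stream.data" stays
valid for the whole write_loose_object() call:

    struct input_stream in_stream = {
            .read = feed_simple_input_stream,
            .data = (void *)&(struct simple_input_stream_data) {
                    .buf = buf,
                    .len = len,
            },
    };
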
 object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  6 ++++++
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb972cdccd..82656f7428 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct simple_input_stream_data {
+	const void *buf;
+	unsigned long len;
+};
+
+static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
+{
+	struct simple_input_stream_data *data = in_stream->data;
+
+	if (data->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = data->len;
+	data->len = 0;
+	return data->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const void *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,14 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+		.size = len,
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1997,7 @@ int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+		.size = len,
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct simple_input_stream_data data;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
+	in_stream.size = len;
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..a84d891d60 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	size_t size;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (2 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 3/5] object-file.c: read stream in a loop " Han Xin
                     ` (2 subsequent siblings)
  6 siblings, 2 replies; 211+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance, so we need to handle an undetermined oid in
"write_loose_object()" itself.

In the original implementation, we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in the
".git/objects/" directory instead.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
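A sketch of the caller's side (illustration only, not part of the
patch; the stream setup is elided): passing an all-zero oid asks
write_loose_object() to compute the hash itself and fill it in:

    struct object_id oid = { 0 };   /* oid not known up front */

    if (write_loose_object(&oid, hdr, hdrlen, &in_stream, 0, 0))
            die("failed to write object in stream");
    /* "oid" now holds the hash computed while deflating */
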
 object-file.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 82656f7428..1c41587bfb 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	const void *buf;
 	unsigned long len;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (is_null_oid(oid)) {
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_reset(&filename);
+		strbuf_addstr(&filename, the_repository->objects->odb->path);
+		strbuf_addch(&filename, '/');
+	} else {
+		loose_object_path(the_repository, &filename, oid);
+	}
 
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
@@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
+	if (is_null_oid(oid)) {
+		int dirlen;
+
+		oidcpy((struct object_id *)oid, &parano_oid);
+		loose_object_path(the_repository, &filename, oid);
+
+		/* We finally know the object path, and create the missing dir. */
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				return -1;
+			if (adjust_shared_perm(dir.buf))
+				return -1;
+			strbuf_release(&dir);
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v4 3/5] object-file.c: read stream in a loop in write_loose_object()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (3 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In order to prepare the stream version of "write_loose_object()", read
the input stream in a loop in "write_loose_object()", so that we can
feed the contents of a large blob object to "write_loose_object()" using
a small fixed buffer.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
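For reference, the same chunked-deflate pattern as a standalone sketch
(assuming plain zlib rather than git's wrappers, and an output buffer
already sized with compressBound()):

    #include <string.h>
    #include <zlib.h>

    static int deflate_chunks(const unsigned char *in, size_t len,
                              unsigned char *out, size_t outsz)
    {
            z_stream s;
            size_t off = 0;
            int ret;

            memset(&s, 0, sizeof(s));
            if (deflateInit(&s, Z_DEFAULT_COMPRESSION) != Z_OK)
                    return -1;
            s.next_out = out;
            s.avail_out = outsz;
            do {
                    if (!s.avail_in) {
                            /* hand over the input 8k at a time */
                            size_t n = len - off > 8192 ? 8192 : len - off;
                            s.next_in = (unsigned char *)in + off;
                            s.avail_in = n;
                            off += n;
                    }
                    /* request Z_FINISH only once all input is handed over */
                    ret = deflate(&s, off == len ? Z_FINISH : Z_NO_FLUSH);
            } while (ret == Z_OK || ret == Z_BUF_ERROR);
            deflateEnd(&s);
            return ret == Z_STREAM_END ? 0 : -1;
    }
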
 object-file.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index 1c41587bfb..fa54e39c2c 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1890,7 +1890,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	const void *buf;
-	unsigned long len;
+	int flush = 0;
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
@@ -1925,18 +1925,23 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	buf = in_stream->read(in_stream, &len);
-	stream.next_in = (void *)buf;
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (!stream.avail_in) {
+			buf = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)buf;
+			in0 = (unsigned char *)buf;
+			/* All data has been read. */
+			if (in_stream->size + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
 		stream.next_out = compressed;
 		stream.avail_out = sizeof(compressed);
-	} while (ret == Z_OK);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (4 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 3/5] object-file.c: read stream in a loop " Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used only to verify that the data
inflates cleanly; the returned buffer is not used at all and is freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, allocate only a
small buffer when calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
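The same idea as a standalone sketch (assuming plain zlib rather than
git's wrappers): verify that a deflate stream inflates cleanly while
reusing one small scratch buffer and discarding the output:

    #include <string.h>
    #include <zlib.h>

    static int verify_deflated(const unsigned char *in, size_t inlen)
    {
            unsigned char scratch[4096];
            z_stream s;
            int ret;

            memset(&s, 0, sizeof(s));
            if (inflateInit(&s) != Z_OK)
                    return -1;
            s.next_in = (unsigned char *)in;
            s.avail_in = inlen;
            do {
                    /* reuse the buffer; the inflated bytes are discarded */
                    s.next_out = scratch;
                    s.avail_out = sizeof(scratch);
                    ret = inflate(&s, Z_NO_FLUSH);
            } while (ret == Z_OK);
            inflateEnd(&s);
            return ret == Z_STREAM_END ? 0 : -1;
    }
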
 builtin/unpack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..8d68acd662 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,16 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run ? 4096 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +125,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +329,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +363,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +402,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
                     ` (5 preceding siblings ...)
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-03  9:35   ` Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
                       ` (2 more replies)
  6 siblings, 3 replies; 211+ messages in thread
From: Han Xin @ 2021-12-03  9:35 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of the input_stream interface, we
can use a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an
entire buffer incurs a 10% performance penalty. Therefore, only unpack
objects larger than the "big_file_threshold" in zstream. See the following
benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      -n 'old' 'git -C dest.git unpack-objects <small.pack' \
      -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
      -n 'new (small threshold)' \
      'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
    Benchmark 1: old
      Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
      Range (min … max):    6.018 s …  6.189 s    10 runs

    Benchmark 2: new
      Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
      Range (min … max):    6.030 s …  6.142 s    10 runs

    Benchmark 3: new (small threshold)
      Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
      Range (min … max):    6.711 s …  6.809 s    10 runs

    Summary
      'old' ran
        1.00 ± 0.01 times faster than 'new'
        1.11 ± 0.01 times faster than 'new (small threshold)'

Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c            | 77 ++++++++++++++++++++++++++++-
 object-file.c                       |  6 +--
 object-store.h                      |  4 ++
 t/t5590-unpack-non-delta-objects.sh | 76 ++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+), 4 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 8d68acd662..bedc494e2d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -326,11 +326,86 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	char hdr[32];
+	int hdrlen;
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+		.size = size,
+	};
+	struct object_id *oid = &obj_list[nr].oid;
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	/* Generate the header */
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
+
+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/object-file.c b/object-file.c
index fa54e39c2c..71d510614b 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1878,9 +1878,9 @@ static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
 	return data->buf;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, struct input_stream *in_stream,
-			      time_t mtime, unsigned flags)
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags)
 {
 	int fd, ret;
 	unsigned char compressed[4096];
diff --git a/object-store.h b/object-store.h
index a84d891d60..ac5b11ec16 100644
--- a/object-store.h
+++ b/object-store.h
@@ -229,6 +229,10 @@ int hash_object_file(const struct git_hash_algo *algo, const void *buf,
 		     unsigned long len, const char *type,
 		     struct object_id *oid);
 
+int write_loose_object(const struct object_id *oid, char *hdr,
+		       int hdrlen, struct input_stream *in_stream,
+		       time_t mtime, unsigned flags);
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags);
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..01d950d119
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receiving packs'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+test_expect_success "create commit with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --progress --revs test)
+'
+
+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'prepare dest repository' '
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold 2m &&
+	git -C dest.git config receive.unpacklimit 100
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	test_i18ngrep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'set a lower bigfile threshold' '
+	git -C dest.git config core.bigFileThreshold 1m
+'
+
+test_expect_success 'unpack big object in stream' '
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'setup for unpack-objects dry-run test' '
+	git init --bare unpack-test.git
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	(
+		cd unpack-test.git &&
+		git unpack-objects -n <../test-$PACK.pack
+	) &&
+	(
+		cd unpack-test.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
  2021-12-07  6:42       ` Han Xin
  2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
  2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:07 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> By implementing a zstream version of the input_stream interface, we
> can use a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpacking non-delta objects from a stream instead of from an
> entire buffer incurs a 10% performance penalty. Therefore, only unpack
> objects larger than the "big_file_threshold" in zstream. See the following
> benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git' \
>       -n 'old' 'git -C dest.git unpack-objects <small.pack' \
>       -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
>       -n 'new (small threshold)' \
>       'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
>     Benchmark 1: old
>       Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
>       Range (min … max):    6.018 s …  6.189 s    10 runs
>
>     Benchmark 2: new
>       Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
>       Range (min … max):    6.030 s …  6.142 s    10 runs
>
>     Benchmark 3: new (small threshold)
>       Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
>       Range (min … max):    6.711 s …  6.809 s    10 runs
>
>     Summary
>       'old' ran
>         1.00 ± 0.01 times faster than 'new'
>         1.11 ± 0.01 times faster than 'new (small threshold)'

So before, we used core.bigfilethreshold for two things (or more?):
whether we show a diff for it (we mark it "binary") and whether it's
split into a loose object.

Now it's three things: we've added a "this is the threshold at which
we'll stream the object" to that.

Might it make sense to squash something like this in, so we can have our
cake & eat it too?

With this I get, where HEAD~0 is this change:
    
    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0' ran
        1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.06 ± 0.14 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.20 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

I.e. it's 5% slower, not 20% (haven't looked into why), but we'll not
stream out 16k..128MB objects (maybe the repo has even bigger ones?)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a1..601b7a2418f 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will then be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold' setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index bedc494e2db..94ce275c807 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -400,7 +400,7 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 	void *buf;
 
 	/* Write large blob in stream without allocating full buffer. */
-	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
 		write_stream_blob(nr, size);
 		return;
 	}
diff --git a/cache.h b/cache.h
index eba12487b99..4037c7fd849 100644
--- a/cache.h
+++ b/cache.h
@@ -964,6 +964,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a706..7b122a142a8 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 9da7f3c1a19..4fcc3de7417 100644
--- a/environment.c
+++ b/environment.c
@@ -46,6 +46,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;

^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
  2021-12-06  2:51       ` Han Xin
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:21 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance, so we need to handle an undetermined oid in
> "write_loose_object()" itself.
>
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in the
> ".git/objects/" directory instead.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 30 ++++++++++++++++++++++++++++--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 82656f7428..1c41587bfb 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	const void *buf;
>  	unsigned long len;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);
> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');
> +	} else {
> +		loose_object_path(the_repository, &filename, oid);
> +	}
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
> @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>  
>  	close_loose_object(fd);
>  
> +	if (is_null_oid(oid)) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);
> +		loose_object_path(the_repository, &filename, oid);

Why are we breaking the promise that "oid" is constant here? I tested
locally with the below on top, and it seems to work (at least no tests
broke). Isn't it preferable to the cast & the caller having its "oid"
changed?

diff --git a/object-file.c b/object-file.c
index 71d510614b9..d014e6942ea 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1958,10 +1958,11 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 	close_loose_object(fd);
 
 	if (is_null_oid(oid)) {
+		struct object_id oid2;
 		int dirlen;
 
-		oidcpy((struct object_id *)oid, &parano_oid);
-		loose_object_path(the_repository, &filename, oid);
+		oidcpy(&oid2, &parano_oid);
+		loose_object_path(the_repository, &filename, &oid2);
 
 		/* We finally know the object path, and create the missing dir. */
 		dirlen = directory_size(filename.buf);

^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
@ 2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
  2021-12-06  2:07       ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:28 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "write_loose_object()" in a
> stream. The input stream is implemented as an interface. In the first
> step, we make a simple implementation, feeding the entire buffer in the
> "stream" to "write_loose_object()" as a refactor.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  6 ++++++
>  2 files changed, 55 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb972cdccd..82656f7428 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> +struct simple_input_stream_data {
> +	const void *buf;
> +	unsigned long len;
> +};

I see why you picked "const void *buf" here, over say const char *, it's
what "struct input_stream" uses.

But why not use size_t for the length, as input_stream does?

> +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> +{
> +	struct simple_input_stream_data *data = in_stream->data;
> +
> +	if (data->len == 0) {

nit: if (!data->len)...

> +		*len = 0;
> +		return NULL;
> +	}
> +	*len = data->len;
> +	data->len = 0;
> +	return data->buf;

But isn't the body of this function the same as:

        *len = data->len;
        if (!*len)
                return NULL;
        data->len = 0;
        return data->buf;

I.e. you don't need the condition for setting "*len": if it's 0, then
data->len is also 0. You just want to return NULL afterwards, and not
set data->len to 0 (harmless, but no need) or return data->buf.
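
I.e. the whole callback could then become (untested):

	static const void *feed_simple_input_stream(struct input_stream *in_stream,
						    unsigned long *len)
	{
		struct simple_input_stream_data *data = in_stream->data;

		/* hand out the whole buffer once, then signal EOF */
		*len = data->len;
		if (!*len)
			return NULL;
		data->len = 0;
		return data->buf;
	}
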
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +		.size = len,
> +	};

Maybe it's that I'm unused to it, but I find this a bit more readable:
	
	@@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
	 {
	 	char hdr[MAX_HEADER_LEN];
	 	int hdrlen = sizeof(hdr);
	+	struct simple_input_stream_data tmp = {
	+		.buf = buf,
	+		.len = len,
	+	};
	 	struct input_stream in_stream = {
	 		.read = feed_simple_input_stream,
	-		.data = (void *)&(struct simple_input_stream_data) {
	-			.buf = buf,
	-			.len = len,
	-		},
	+		.data = (void *)&tmp,
	 		.size = len,
	 	};
	
Yes there's a temporary variable, but no denser inline casting. Also
easier to step through in a debugger (which will have the type
information on "tmp").

>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>  	char *header;
>  	int hdrlen, status = 0;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +		.size = len,
> +	};

ditto..

>  	/* type string, SP, %lu of the length plus NUL must fit this */
>  	hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  		goto cleanup;
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		goto cleanup;
> -	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>  
>  cleanup:
>  	free(header);
> @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen;
>  	int ret;
> +	struct simple_input_stream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = &data,
> +	};
>  
>  	if (has_loose_object(oid))
>  		return 0;
>  	buf = read_object(the_repository, oid, &type, &len);
> +	in_stream.size = len;

Why are we setting this here?...

>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));

...Instead of after this point, as we may error out and never use it?

> +	data.buf = buf;
> +	data.len = len;

Probably won't matter, just a nit...
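
I.e. (untested sketch):

	buf = read_object(the_repository, oid, &type, &len);
	if (!buf)
		return error(_("cannot read object for %s"), oid_to_hex(oid));
	in_stream.size = len;
	data.buf = buf;
	data.len = len;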

> +struct input_stream {
> +	const void *(*read)(struct input_stream *, unsigned long *len);
> +	void *data;
> +	size_t size;
> +};
> +

Ah, and here's the size_t... :)

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
  2021-12-06  3:12       ` Han Xin
  1 sibling, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:41 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle undetermined oid in function
> "write_loose_object()".
>
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in ".git/objects/"
> directory instead.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 30 ++++++++++++++++++++++++++++--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 82656f7428..1c41587bfb 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	const void *buf;
>  	unsigned long len;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (is_null_oid(oid)) {
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_reset(&filename);

Why re-use this & leak memory? An existing strbuf use in this function
doesn't leak in the same way. Just release it as in the below patch on
top (the "ret" vs. "err" variable naming is a bit confused; maybe it
could do with a prep cleanup step).

> +		strbuf_addstr(&filename, the_repository->objects->odb->path);
> +		strbuf_addch(&filename, '/');

And once we do that, this could just become:

	strbuf_addf(&filename, "%s/", ...)

There are existing uses of this pattern, so maybe it's not worth it, but
it allows you to remove the braces on the if/else.

diff --git a/object-file.c b/object-file.c
index 8bd89e7b7ba..2b52f3fc1cc 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1880,7 +1880,7 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 		       int hdrlen, struct input_stream *in_stream,
 		       time_t mtime, unsigned flags)
 {
-	int fd, ret;
+	int fd, ret, err = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1892,7 +1892,6 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 
 	if (is_null_oid(oid)) {
 		/* When oid is not determined, save tmp file to odb path. */
-		strbuf_reset(&filename);
 		strbuf_addstr(&filename, the_repository->objects->odb->path);
 		strbuf_addch(&filename, '/');
 	} else {
@@ -1902,11 +1901,12 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
 		if (flags & HASH_SILENT)
-			return -1;
+			err = -1;
 		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
 		else
-			return error_errno(_("unable to create temporary file"));
+			err = error_errno(_("unable to create temporary file"));
+		goto cleanup;
 	}
 
 	/* Set it up */
@@ -1968,10 +1968,13 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 			struct strbuf dir = STRBUF_INIT;
 			strbuf_add(&dir, filename.buf, dirlen - 1);
 			if (mkdir(dir.buf, 0777) && errno != EEXIST)
-				return -1;
-			if (adjust_shared_perm(dir.buf))
-				return -1;
-			strbuf_release(&dir);
+				err = -1;
+			else if (adjust_shared_perm(dir.buf))
+				err = -1;
+			else
+				strbuf_release(&dir);
+			if (err < 0)
+				goto cleanup;
 		}
 	}
 
@@ -1984,7 +1987,10 @@ int write_loose_object(const struct object_id *oid, char *hdr,
 			warning_errno(_("failed utime() on %s"), tmp_file.buf);
 	}
 
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&filename);
+	return err;
 }
 
 static int freshen_loose_object(const struct object_id *oid)

^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
  2021-12-07  6:17       ` Han Xin
  2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:54 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> new file mode 100755
> index 0000000000..01d950d119
> --- /dev/null
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -0,0 +1,76 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects when receive pack'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +test_expect_success "create commit with big blobs (1.5 MB)" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	(
> +		cd .git &&
> +		find objects/?? -type f | sort

...are these...

> +	) >expect &&
> +	PACK=$(echo main | git pack-objects --progress --revs test)

Is --progress needed?

> +'
> +
> +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'prepare dest repository' '
> +	git init --bare dest.git &&
> +	git -C dest.git config core.bigFileThreshold 2m &&
> +	git -C dest.git config receive.unpacklimit 100

I think it would be better to just (could roll this into a function):

	test_when_finished "rm -rf dest.git" &&
	git init dest.git &&
	git -C dest.git config ...

Then you can use it with e.g. --run=3-4 and not have it error out
because of skipped setup.

A lot of our tests fail like that, but in this case fixing it seems
trivial.
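
E.g. something like this as a setup function (untested; "prepare_dest"
is just a made-up name):

	prepare_dest () {
		test_when_finished "rm -rf dest.git" &&
		git init --bare dest.git &&
		git -C dest.git config core.bigFileThreshold 2m &&
		git -C dest.git config receive.unpacklimit 100
	}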



> +'
> +
> +test_expect_success 'fail to unpack-objects: cannot allocate' '
> +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> +	test_i18ngrep "fatal: attempting to allocate" err &&

nit: just "grep", not "test_i18ngrep"

> +	(
> +		cd dest.git &&
> +		find objects/?? -type f | sort

..."find" needed over just globbing?:

    obj=$(echo objects/*/*)

?

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
  2021-12-06  3:20       ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 13:59 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> +	unsigned long bufsize = dry_run ? 4096 : size;
> +	void *buf = xmallocz(bufsize);

It's probably nothing, but in your CL you note that you changed another
hardcoding from 4k to 8k, should this one still be 4k?

It's probably fine, just wondering...

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
  2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
@ 2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
  2021-12-07  6:48       ` Han Xin
  2 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-03 14:05 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [..]
> +static void write_stream_blob(unsigned nr, unsigned long size)
> +{
> +	char hdr[32];
> +	int hdrlen;
> +	git_zstream zstream;
> +	struct input_zstream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_input_zstream,
> +		.data = &data,
> +		.size = size,
> +	};
> +	struct object_id *oid = &obj_list[nr].oid;
> +	int ret;
> +
> +	memset(&zstream, 0, sizeof(zstream));
> +	memset(&data, 0, sizeof(data));
> +	data.zstream = &zstream;
> +	git_inflate_init(&zstream);
> +
> +	/* Generate the header */
> +	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> +
> +	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
> +		die(_("failed to write object in stream %d"), ret);
> +
> +	if (zstream.total_out != size || data.status != Z_STREAM_END)
> +		die(_("inflate returned %d"), data.status);
> +	git_inflate_end(&zstream);
> +
> +	if (strict && !dry_run) {
> +		struct blob *blob = lookup_blob(the_repository, oid);
> +		if (blob)
> +			blob->object.flags |= FLAG_WRITTEN;
> +		else
> +			die("invalid blob object from stream");
> +	}
> +	obj_list[nr].obj = NULL;
> +}

Just a side-note, I think (but am not 100% sure) that these existing
occurrences aren't needed due to our use of CALLOC_ARRAY():
    
    diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
    index 4a9466295ba..00b349412c5 100644
    --- a/builtin/unpack-objects.c
    +++ b/builtin/unpack-objects.c
    @@ -248,7 +248,6 @@ static void write_object(unsigned nr, enum object_type type,
                            die("failed to write object");
                    added_object(nr, type, buf, size);
                    free(buf);
    -               obj_list[nr].obj = NULL;
            } else if (type == OBJ_BLOB) {
                    struct blob *blob;
                    if (write_object_file(buf, size, type_name(type),
    @@ -262,7 +261,6 @@ static void write_object(unsigned nr, enum object_type type,
                            blob->object.flags |= FLAG_WRITTEN;
                    else
                            die("invalid blob object");
    -               obj_list[nr].obj = NULL;
            } else {
                    struct object *obj;
                    int eaten;

The reason I'm noting it is that the same seems to be true of your new
addition here. I.e. are these assignments to NULL needed?

Anyway, the reason I started poking at this is that this
write_stream_blob() seems to duplicate much of write_object(). AFAICT
only the writing part is really different; the part where we
lookup_blob() afterwards, set FLAG_WRITTEN etc. is all the same.

Why can't we call write_object() here?

The obvious answer seems to be that the call to write_object_file()
isn't prepared to do the sort of streaming that you want, so instead
you're bypassing it and calling write_loose_object() directly.

I haven't tried this myself, but isn't a better and cleaner approach
here to not add another meaning to is_null_oid(), but to just add a
HASH_STREAM flag that'll get passed down as "unsigned flags" to
write_loose_object()? See FLAG_BITS in object.h.

Then the "obj_list[nr].obj" here could also become
"obj_list[nr].obj.flags |= (1u<<12)" or whatever (but that wouldn't
strictly be needed I think.

But by adding the "HASH_STREAM" flag you could I think stop duplicating
the "Generate the header" etc. here and call write_object_file_flags().

I don't so much care about how it's done within unpack-objects.c, but
not having another meaning to is_null_oid() in play would be really
nice, and in this case it seems entirely avoidable.
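
Sketched out, that's something like this (entirely untested; "16"
assumes that's the next free HASH_* bit in cache.h):

	/* cache.h */
	#define HASH_STREAM 16

	/* object-file.c, in write_object_file_flags() */
	if (flags & HASH_STREAM)
		/* cannot hash the buffer up-front; only generate the header */
		hdrlen = xsnprintf(hdr, hdrlen, "%s %"PRIuMAX, type,
				   (uintmax_t)len) + 1;
	else
		write_object_file_prepare(...);

with write_loose_object() filling in the oid from the running hash at
the end, instead of keying that behaviour off is_null_oid().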

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream
  2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  2:07       ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-06  2:07 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:41 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > This can be improved by feeding data to "write_loose_object()" in a
> > stream. The input stream is implemented as an interface. In the first
> > step, we make a simple implementation, feeding the entire buffer in the
> > "stream" to "write_loose_object()" as a refactor.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
> >  object-store.h |  6 ++++++
> >  2 files changed, 55 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index eb972cdccd..82656f7428 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +struct simple_input_stream_data {
> > +     const void *buf;
> > +     unsigned long len;
> > +};
>
> I see why you picked "const void *buf" here, over say const char *, it's
> what "struct input_stream" uses.
>
> But why not use size_t for the length, as input_stream does?
>

Yes, "size_t" will be better here.

> > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> > +{
> > +     struct simple_input_stream_data *data = in_stream->data;
> > +
> > +     if (data->len == 0) {
>
> nit: if (!data->len)...
>

Will apply.

> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +     *len = data->len;
> > +     data->len = 0;
> > +     return data->buf;
>
> But isn't the body of this function the same as:
>
>         *len = data->len;
>         if (!*len)
>                 return NULL;
>         data->len = 0;
>         return data->buf;
>
> I.e. you don't need the condition for setting "*len": if it's 0, then
> data->len is also 0. You just want to return NULL afterwards, and not
> set data->len to 0 (harmless, but no need) or return data->buf.

Will apply.

> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +             .size = len,
> > +     };
>
> Maybe it's that I'm unused to it, but I find this a bit more readable:
>
>         @@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>          {
>                 char hdr[MAX_HEADER_LEN];
>                 int hdrlen = sizeof(hdr);
>         +       struct simple_input_stream_data tmp = {
>         +               .buf = buf,
>         +               .len = len,
>         +       };
>                 struct input_stream in_stream = {
>                         .read = feed_simple_input_stream,
>         -               .data = (void *)&(struct simple_input_stream_data) {
>         -                       .buf = buf,
>         -                       .len = len,
>         -               },
>         +               .data = (void *)&tmp,
>                         .size = len,
>                 };
>
> Yes there's a temporary variable, but no denser inline casting. Also
> easier to step through in a debugger (which will have the type
> information on "tmp").
>

Will apply.

> >  int hash_object_file_literally(const void *buf, unsigned long len,
> > @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >  {
> >       char *header;
> >       int hdrlen, status = 0;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +             .size = len,
> > +     };
>
> ditto..
>
> >       /* type string, SP, %lu of the length plus NUL must fit this */
> >       hdrlen = strlen(type) + MAX_HEADER_LEN;
> > @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >               goto cleanup;
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               goto cleanup;
> > -     status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> > +     status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> >
> >  cleanup:
> >       free(header);
> > @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >       int ret;
> > +     struct simple_input_stream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = &data,
> > +     };
> >
> >       if (has_loose_object(oid))
> >               return 0;
> >       buf = read_object(the_repository, oid, &type, &len);
> > +     in_stream.size = len;
>
> Why are we setting this here?...
>

Yes, putting "in_stream.size=len;" here was a stupid decision.

> >       if (!buf)
> >               return error(_("cannot read object for %s"), oid_to_hex(oid));
>
> ...Instead of after this point, as we may error out and never use it?
>
> > +     data.buf = buf;
> > +     data.len = len;
>
> Probably won't matter, just a nit...
>
> > +struct input_stream {
> > +     const void *(*read)(struct input_stream *, unsigned long *len);
> > +     void *data;
> > +     size_t size;
> > +};
> > +
>
> Ah, and here's the size_t... :)

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  2:51       ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-06  2:51 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:27 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When streaming a large blob object to "write_loose_object()", we have no
> > chance to run "write_object_file_prepare()" to calculate the oid in
> > advance. So we need to handle undetermined oid in function
> > "write_loose_object()".
> >
> > In the original implementation, we know the oid and we can write the
> > temporary file in the same directory as the final object, but for an
> > object with an undetermined oid, we don't know the exact directory for
> > the object, so we have to save the temporary file in ".git/objects/"
> > directory instead.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 30 ++++++++++++++++++++++++++++--
> >  1 file changed, 28 insertions(+), 2 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 82656f7428..1c41587bfb 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       const void *buf;
> >       unsigned long len;
> >
> > -     loose_object_path(the_repository, &filename, oid);
> > +     if (is_null_oid(oid)) {
> > +             /* When oid is not determined, save tmp file to odb path. */
> > +             strbuf_reset(&filename);
> > +             strbuf_addstr(&filename, the_repository->objects->odb->path);
> > +             strbuf_addch(&filename, '/');
> > +     } else {
> > +             loose_object_path(the_repository, &filename, oid);
> > +     }
> >
> >       fd = create_tmpfile(&tmp_file, filename.buf);
> >       if (fd < 0) {
> > @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >               die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> >                   ret);
> >       the_hash_algo->final_oid_fn(&parano_oid, &c);
> > -     if (!oideq(oid, &parano_oid))
> > +     if (!is_null_oid(oid) && !oideq(oid, &parano_oid))
> >               die(_("confused by unstable object source data for %s"),
> >                   oid_to_hex(oid));
> >
> >       close_loose_object(fd);
> >
> > +     if (is_null_oid(oid)) {
> > +             int dirlen;
> > +
> > +             oidcpy((struct object_id *)oid, &parano_oid);
> > +             loose_object_path(the_repository, &filename, oid);
>
> Why are we breaking the promise that "oid" is constant here? I tested
> locally with the below on top, and it seems to work (at least no tests
> broke). Isn't it preferable to the cast & the caller having its "oid"
> changed?
>
> diff --git a/object-file.c b/object-file.c
> index 71d510614b9..d014e6942ea 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1958,10 +1958,11 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>         close_loose_object(fd);
>
>         if (is_null_oid(oid)) {
> +               struct object_id oid2;
>                 int dirlen;
>
> -               oidcpy((struct object_id *)oid, &parano_oid);
> -               loose_object_path(the_repository, &filename, oid);
> +               oidcpy(&oid2, &parano_oid);
> +               loose_object_path(the_repository, &filename, &oid2);
>
>                 /* We finally know the object path, and create the missing dir. */
>                 dirlen = directory_size(filename.buf);

Maybe I should change the promise that "oid" is constant in
"write_loose_object()".

The original write_object_file_flags() defines a variable "oid",
completes the calculation of the "oid" in
"write_object_file_prepare()", and then passes it to
"write_loose_object()".

If the oid remains null after calling "write_loose_object()",
"--strict" will become meaningless, even though it does not break the
existing test cases.
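
That is, perhaps I should simply drop the "const" from the prototype
(sketch):

	int write_loose_object(struct object_id *oid, char *hdr,
			       int hdrlen, struct input_stream *in_stream,
			       time_t mtime, unsigned flags);

so that the streaming case can do a plain "oidcpy(oid, &parano_oid)"
without the cast, and the caller gets the computed oid back.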

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  3:12       ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-06  3:12 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:54 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When streaming a large blob object to "write_loose_object()", we have no
> > chance to run "write_object_file_prepare()" to calculate the oid in
> > advance. So we need to handle undetermined oid in function
> > "write_loose_object()".
> >
> > In the original implementation, we know the oid and we can write the
> > temporary file in the same directory as the final object, but for an
> > object with an undetermined oid, we don't know the exact directory for
> > the object, so we have to save the temporary file in ".git/objects/"
> > directory instead.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 30 ++++++++++++++++++++++++++++--
> >  1 file changed, 28 insertions(+), 2 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 82656f7428..1c41587bfb 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >       const void *buf;
> >       unsigned long len;
> >
> > -     loose_object_path(the_repository, &filename, oid);
> > +     if (is_null_oid(oid)) {
> > +             /* When oid is not determined, save tmp file to odb path. */
> > +             strbuf_reset(&filename);
>
> Why re-use this & leak memory? An existing strbuf use in this function
> doesn't leak in the same way. Just release it as in the below patch on
> top (the "ret" vs. "err" variable naming is a bit confused; maybe it
> could do with a prep cleanup step).
>
> > +             strbuf_addstr(&filename, the_repository->objects->odb->path);
> > +             strbuf_addch(&filename, '/');
>
> And once we do that, this could just become:
>
>         strbuf_addf(&filename, "%s/", ...)
>
> There are existing uses of this pattern, so maybe it's not worth it,
> but it allows you to remove the braces on the if/else.
>
> diff --git a/object-file.c b/object-file.c
> index 8bd89e7b7ba..2b52f3fc1cc 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1880,7 +1880,7 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                        int hdrlen, struct input_stream *in_stream,
>                        time_t mtime, unsigned flags)
>  {
> -       int fd, ret;
> +       int fd, ret, err = 0;
>         unsigned char compressed[4096];
>         git_zstream stream;
>         git_hash_ctx c;
> @@ -1892,7 +1892,6 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>
>         if (is_null_oid(oid)) {
>                 /* When oid is not determined, save tmp file to odb path. */
> -               strbuf_reset(&filename);
>                 strbuf_addstr(&filename, the_repository->objects->odb->path);
>                 strbuf_addch(&filename, '/');
>         } else {
> @@ -1902,11 +1901,12 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>         fd = create_tmpfile(&tmp_file, filename.buf);
>         if (fd < 0) {
>                 if (flags & HASH_SILENT)
> -                       return -1;
> +                       err = -1;
>                 else if (errno == EACCES)
> -                       return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +                       err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
>                 else
> -                       return error_errno(_("unable to create temporary file"));
> +                       err = error_errno(_("unable to create temporary file"));
> +               goto cleanup;
>         }
>
>         /* Set it up */
> @@ -1968,10 +1968,13 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                         struct strbuf dir = STRBUF_INIT;
>                         strbuf_add(&dir, filename.buf, dirlen - 1);
>                         if (mkdir(dir.buf, 0777) && errno != EEXIST)
> -                               return -1;
> -                       if (adjust_shared_perm(dir.buf))
> -                               return -1;
> -                       strbuf_release(&dir);
> +                               err = -1;
> +                       else if (adjust_shared_perm(dir.buf))
> +                               err = -1;
> +                       else
> +                               strbuf_release(&dir);
> +                       if (err < 0)
> +                               goto cleanup;
>                 }
>         }
>
> @@ -1984,7 +1987,10 @@ int write_loose_object(const struct object_id *oid, char *hdr,
>                         warning_errno(_("failed utime() on %s"), tmp_file.buf);
>         }
>
> -       return finalize_object_file(tmp_file.buf, filename.buf);
> +       err = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +       strbuf_release(&filename);
> +       return err;
>  }
>
>  static int freshen_loose_object(const struct object_id *oid)

Yes, this will be much better. Will apply.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
@ 2021-12-06  3:20       ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-06  3:20 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 10:00 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > +     unsigned long bufsize = dry_run ? 4096 : size;
> > +     void *buf = xmallocz(bufsize);
>
> It's probably nothing, but in your CL you note that you changed another
> hardcoding from 4k to 8k, should this one still be 4k?
>
> It's probably fine, just wondering...

Yes, I think this is an oversight in my change.
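
I will change it to match, i.e.:

	unsigned long bufsize = dry_run ? 8192 : size;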

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
@ 2021-12-07  6:17       ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-07  6:17 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:59 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> > new file mode 100755
> > index 0000000000..01d950d119
> > --- /dev/null
> > +++ b/t/t5590-unpack-non-delta-objects.sh
> > @@ -0,0 +1,76 @@
> > +#!/bin/sh
> > +#
> > +# Copyright (c) 2021 Han Xin
> > +#
> > +
> > +test_description='Test unpack-objects when receive pack'
> > +
> > +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> > +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> > +
> > +. ./test-lib.sh
> > +
> > +test_expect_success "create commit with big blobs (1.5 MB)" '
> > +     test-tool genrandom foo 1500000 >big-blob &&
> > +     test_commit --append foo big-blob &&
> > +     test-tool genrandom bar 1500000 >big-blob &&
> > +     test_commit --append bar big-blob &&
> > +     (
> > +             cd .git &&
> > +             find objects/?? -type f | sort
>
> ...are these...
>
> > +     ) >expect &&
> > +     PACK=$(echo main | git pack-objects --progress --revs test)
>
> Is --progress needed?
>

"--progress" is not necessary.

> > +'
> > +
> > +test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
> > +     GIT_ALLOC_LIMIT=1m &&
> > +     export GIT_ALLOC_LIMIT
> > +'
> > +
> > +test_expect_success 'prepare dest repository' '
> > +     git init --bare dest.git &&
> > +     git -C dest.git config core.bigFileThreshold 2m &&
> > +     git -C dest.git config receive.unpacklimit 100
>
> I think it would be better to just (could roll this into a function):
>
>         test_when_finished "rm -rf dest.git" &&
>         git init dest.git &&
>         git -C dest.git config ...
>
> Then you can use it with e.g. --run=3-4 and not have it error out
> because of skipped setup.
>
> A lot of our tests fail like that, but in this case fixing it seems
> trivial.
>
>

OK, I will take it.

>
> > +'
> > +
> > +test_expect_success 'fail to unpack-objects: cannot allocate' '
> > +     test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> > +     test_i18ngrep "fatal: attempting to allocate" err &&
>
> nit: just "grep", not "test_i18ngrep"
>
> > +     (
> > +             cd dest.git &&
> > +             find objects/?? -type f | sort
>
> ..."find" needed over just globbing?:
>
>     obj=$(echo objects/*/*)
>
> ?

I tried to use "echo" instead of "find". It works well on my personal
computer, but fails in CI on GitHub because of the generated
"info/commit-graph" file.
So it seems that restricting the match to ".git/objects/??" is more
rigorous?
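
If a glob is still preferred, limiting it to the two-hex fan-out
directories should skip "info/" and "pack/" as well (untested):

	obj=$(echo objects/??/*)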

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
@ 2021-12-07  6:42       ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-07  6:42 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 9:19 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > By implementing a zstream version of input_stream interface, we can use
> > a small fixed buffer for "unpack_non_delta_entry()".
> >
> > However, unpacking non-delta objects from a stream instead of from an entire
> > buffer will have a 10% performance penalty. Therefore, only unpack objects
> > larger than the "big_file_threshold" in zstream. See the following
> > benchmarks:
> >
> >     hyperfine \
> >       --setup \
> >       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
> >       --prepare 'rm -rf dest.git && git init --bare dest.git' \
> >       -n 'old' 'git -C dest.git unpack-objects <small.pack' \
> >       -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
> >       -n 'new (small threshold)' \
> >       'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
> >     Benchmark 1: old
> >       Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
> >       Range (min … max):    6.018 s …  6.189 s    10 runs
> >
> >     Benchmark 2: new
> >       Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
> >       Range (min … max):    6.030 s …  6.142 s    10 runs
> >
> >     Benchmark 3: new (small threshold)
> >       Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
> >       Range (min … max):    6.711 s …  6.809 s    10 runs
> >
> >     Summary
> >       'old' ran
> >         1.00 ± 0.01 times faster than 'new'
> >         1.11 ± 0.01 times faster than 'new (small threshold)'
>
> So before, we used core.bigfilethreshold for two things (or more?):
> whether we show a diff for it (we mark it "binary") and whether it's
> split into a loose object.
>
> Now it's three things: we've added a "this is the threshold at which
> we'll stream the object" to that.
>
> Might it make sense to squash something like this in, so we can have our
> cake & eat it too?
>
> With this I get, where HEAD~0 is this change:
>
>     Summary
>       './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0' ran
>         1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
>         1.00 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
>         1.01 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
>         1.06 ± 0.14 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
>         1.20 ± 0.01 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
>
> I.e. it's 5% slower, not 20% (haven't looked into why), but we'll not
> stream out 16k..128MB objects (maybe the repo has even bigger ones?)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index c04f62a54a1..601b7a2418f 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
>  +
>  Common unit suffixes of 'k', 'm', or 'g' are supported.
>
> +core.bigFileStreamingThreshold::
> +       Files larger than this will be streamed out to a temporary
> +       object file while being hashed, which will when be renamed
> +       in-place to a loose object, particularly if the
> +       `core.bigFileThreshold' setting dictates that they're always
> +       written out as loose objects.
> ++
> +Default is 128 MiB on all platforms.
> ++
> +Common unit suffixes of 'k', 'm', or 'g' are supported.
> +
>  core.excludesFile::
>         Specifies the pathname to the file that contains patterns to
>         describe paths that are not meant to be tracked, in addition
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index bedc494e2db..94ce275c807 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -400,7 +400,7 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>         void *buf;
>
>         /* Write large blob in stream without allocating full buffer. */
> -       if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
> +       if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
>                 write_stream_blob(nr, size);
>                 return;
>         }
> diff --git a/cache.h b/cache.h
> index eba12487b99..4037c7fd849 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -964,6 +964,7 @@ extern size_t packed_git_window_size;
>  extern size_t packed_git_limit;
>  extern size_t delta_base_cache_limit;
>  extern unsigned long big_file_threshold;
> +extern unsigned long big_file_streaming_threshold;
>  extern unsigned long pack_size_limit_cfg;
>
>  /*
> diff --git a/config.c b/config.c
> index c5873f3a706..7b122a142a8 100644
> --- a/config.c
> +++ b/config.c
> @@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>                 return 0;
>         }
>
> +       if (!strcmp(var, "core.bigfilestreamingthreshold")) {
> +               big_file_streaming_threshold = git_config_ulong(var, value);
> +               return 0;
> +       }
> +
>         if (!strcmp(var, "core.packedgitlimit")) {
>                 packed_git_limit = git_config_ulong(var, value);
>                 return 0;
> diff --git a/environment.c b/environment.c
> index 9da7f3c1a19..4fcc3de7417 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -46,6 +46,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
>  size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
>  size_t delta_base_cache_limit = 96 * 1024 * 1024;
>  unsigned long big_file_threshold = 512 * 1024 * 1024;
> +unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
>  int pager_use_color = 1;
>  const char *editor_program;
>  const char *askpass_program;

I'm not sure if we need an additional "core.bigFileStreamingThreshold"
here, because "core.bigFileThreshold" has been widely used in
"index-pack", "read_object" and so on.

In the test case which uses "core.bigFileStreamingThreshold" instead of
"core.bigFileThreshold", I found that the test failed because "fsck"
tried to allocate 15MB of memory.
During "fsck_loose()", "read_loose_object()" is called, which contains
the following:

  if (*oi->typep == OBJ_BLOB && *size > big_file_threshold) {
    if (check_stream_oid(&stream, hdr, *size, path, expected_oid) < 0)
      goto out;
  } else {
    /* this will allocate 15MB of memory */
    *contents = unpack_loose_rest(&stream, hdr, *size, expected_oid);
    ...
  }

The same case can be found in "unpack_entry_data()":

  static char fixed_buf[8192];
  ...
  if (type == OBJ_BLOB && size > big_file_threshold)
    buf = fixed_buf;
  else
    buf = xmallocz(size);
  ...

Although I know that setting "core.bigfilethreshold" smaller than the
default value on the server side does not help me prevent users from
creating large delta objects on the client side, it can still
effectively help me reduce the memory allocation in "receive-pack".

If this is not the correct way to use "core.bigfilethreshold", maybe
you can share a better solution with me, if you want.

Thanks.
-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
@ 2021-12-07  6:48       ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-07  6:48 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

On Fri, Dec 3, 2021 at 10:29 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> > [..]
> > +static void write_stream_blob(unsigned nr, unsigned long size)
> > +{
> > +     char hdr[32];
> > +     int hdrlen;
> > +     git_zstream zstream;
> > +     struct input_zstream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_input_zstream,
> > +             .data = &data,
> > +             .size = size,
> > +     };
> > +     struct object_id *oid = &obj_list[nr].oid;
> > +     int ret;
> > +
> > +     memset(&zstream, 0, sizeof(zstream));
> > +     memset(&data, 0, sizeof(data));
> > +     data.zstream = &zstream;
> > +     git_inflate_init(&zstream);
> > +
> > +     /* Generate the header */
> > +     hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
> > +
> > +     if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
> > +             die(_("failed to write object in stream %d"), ret);
> > +
> > +     if (zstream.total_out != size || data.status != Z_STREAM_END)
> > +             die(_("inflate returned %d"), data.status);
> > +     git_inflate_end(&zstream);
> > +
> > +     if (strict && !dry_run) {
> > +             struct blob *blob = lookup_blob(the_repository, oid);
> > +             if (blob)
> > +                     blob->object.flags |= FLAG_WRITTEN;
> > +             else
> > +                     die("invalid blob object from stream");
> > +     }
> > +     obj_list[nr].obj = NULL;
> > +}
>
> Just a side-note, I think (but am not 100% sure) that these existing
> occurrences aren't needed due to our use of CALLOC_ARRAY():
>
>     diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>     index 4a9466295ba..00b349412c5 100644
>     --- a/builtin/unpack-objects.c
>     +++ b/builtin/unpack-objects.c
>     @@ -248,7 +248,6 @@ static void write_object(unsigned nr, enum object_type type,
>                             die("failed to write object");
>                     added_object(nr, type, buf, size);
>                     free(buf);
>     -               obj_list[nr].obj = NULL;
>             } else if (type == OBJ_BLOB) {
>                     struct blob *blob;
>                     if (write_object_file(buf, size, type_name(type),
>     @@ -262,7 +261,6 @@ static void write_object(unsigned nr, enum object_type type,
>                             blob->object.flags |= FLAG_WRITTEN;
>                     else
>                             die("invalid blob object");
>     -               obj_list[nr].obj = NULL;
>             } else {
>                     struct object *obj;
>                     int eaten;
>
> The reason I'm noting it is that the same seems to be true of your new
> addition here. I.e. are these assignments to NULL needed?
>
> Anyway, the reason I started poking at this is that this
> write_stream_blob() seems to duplicate much of write_object(). AFAICT
> only the writing part is really different; the part where we
> lookup_blob() afterwards, set FLAG_WRITTEN etc. is all the same.
>
> Why can't we call write_object() here?
>
> The obvious answer seems to be that the call to write_object_file()
> isn't prepared to do the sort of streaming that you want, so instead
> you're bypassing it and calling write_loose_object() directly.
>
> I haven't tried this myself, but isn't a better and cleaner approach
> here to not add another meaning to is_null_oid(), but to just add a
> HASH_STREAM flag that'll get passed down as "unsigned flags" to
> write_loose_object()? See FLAG_BITS in object.h.
>
> Then the "obj_list[nr].obj" here could also become
> "obj_list[nr].obj.flags |= (1u<<12)" or whatever (but that wouldn't
> strictly be needed I think.
>
> But by adding the "HASH_STREAM" flag you could I think stop duplicating
> the "Generate the header" etc. here and call write_object_file_flags().
>
> I don't so much care about how it's done within unpack-objects.c, but
> not having another meaning to is_null_oid() in play would be really
> nice, and in this case it seems entirely avoidable.

I refactored it according to your suggestions for my next patch
version. Using a HASH_STREAM flag is indeed a better way to deal with
it, and it also reduces the changes my refactoring makes to the
original code.

Thanks.
-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v4 0/5] unpack large objects in stream
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
@ 2021-12-07 16:18     ` Derrick Stolee
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                       ` (6 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Derrick Stolee @ 2021-12-07 16:18 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason
  Cc: Han Xin

On 12/3/2021 4:35 AM, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> Changes since v3:
> * Add "size" to "struct input_stream" which used by following commits.
> 
> * Increase the buffer size of "struct input_zstream_data" from 4096 to
>   8192, which is consistent with the "fixed_buf" in the "index-pack.c".
> 
> * Refactor "read stream in a loop in write_loose_object()" which
>   introduced a performance problem reported by Derrick Stolee[1].

Thank you for finding the issue. It seems simple enough to add that size
information and bring the performance back to nearly no overhead. Your
hyperfine statistics are within noise, which is great. Thanks!

-Stolee

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v5 0/6] unpack large blobs in stream
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
  2021-12-07 16:18     ` Derrick Stolee
@ 2021-12-10 10:34     ` Han Xin
  2021-12-17 11:26       ` Han Xin
                         ` (6 more replies)
  2021-12-10 10:34     ` [PATCH v5 1/6] object-file: refactor write_loose_object() to support read from stream Han Xin
                       ` (5 subsequent siblings)
  7 siblings, 7 replies; 211+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v4:
* Refactor to "struct input_stream" implementations so that we can
  reduce the changes to "write_loose_object()" sugguest by
  Ævar Arnfjörð Bjarmason.

* Add a new flag called "HASH_STREAM" to support this feature.

* Add a new config "core.bigFileStreamingThreshold" instead of reusing
  "core.bigFileThreshold", as suggested by Ævar Arnfjörð Bjarmason[1].

* Roll destination repository preparation into a function in
  "t5590-unpack-non-delta-objects.sh", so that we can run test cases
  with --run=setup,3,4.

1. https://lore.kernel.org/git/211203.86zgphsu5a.gmgdl@evledraar.gmail.com/

Han Xin (6):
  object-file: refactor write_loose_object() to support read from stream
  object-file.c: handle undetermined oid in write_loose_object()
  object-file.c: read stream in a loop in write_loose_object()
  unpack-objects.c: add dry_run mode for get_data()
  object-file.c: make "write_object_file_flags()" to support "HASH_STREAM"
  unpack-objects: unpack_non_delta_entry() read data in a stream

 Documentation/config/core.txt       | 11 ++++
 builtin/unpack-objects.c            | 86 +++++++++++++++++++++++++++--
 cache.h                             |  2 +
 config.c                            |  5 ++
 environment.c                       |  1 +
 object-file.c                       | 73 +++++++++++++++++++-----
 object-store.h                      |  5 ++
 t/t5590-unpack-non-delta-objects.sh | 70 +++++++++++++++++++++++
 8 files changed, 234 insertions(+), 19 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v4:
1:  af707ef304 < -:  ---------- object-file: refactor write_loose_object() to read buffer from stream
2:  321ad90d8e < -:  ---------- object-file.c: handle undetermined oid in write_loose_object()
3:  1992ac39af < -:  ---------- object-file.c: read stream in a loop in write_loose_object()
-:  ---------- > 1:  f3595e68cc object-file: refactor write_loose_object() to support read from stream
-:  ---------- > 2:  c25fdd1fe5 object-file.c: handle undetermined oid in write_loose_object()
-:  ---------- > 3:  ed226f2f9f object-file.c: read stream in a loop in write_loose_object()
4:  c41eb06533 ! 4:  2f91e540f6 unpack-objects.c: add dry_run mode for get_data()
    @@ builtin/unpack-objects.c: static void use(int bytes)
      {
      	git_zstream stream;
     -	void *buf = xmallocz(size);
    -+	unsigned long bufsize = dry_run ? 4096 : size;
    ++	unsigned long bufsize = dry_run ? 8192 : size;
     +	void *buf = xmallocz(bufsize);
      
      	memset(&stream, 0, sizeof(stream));
-:  ---------- > 5:  7698938eac object-file.c: make "write_object_file_flags()" to support "HASH_STREAM"
5:  9427775bdc ! 6:  103bb1db06 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ Commit message
     
          However, unpack non-delta objects from a stream instead of from an entire
          buffer will have a 10% performance penalty. Therefore, only unpack objects
     -    larger than the "big_file_threshold" in zstream. See the following
     +    larger than the "core.bigFileStreamingThreshold" in a zstream. See the following
         benchmarks:
     
             hyperfine \
               --setup \
               'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
    -          --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -          -n 'old' 'git -C dest.git unpack-objects <small.pack' \
    -          -n 'new' 'new/git -C dest.git unpack-objects <small.pack' \
    -          -n 'new (small threshold)' \
    -          'new/git -c core.bigfilethreshold=16k -C dest.git unpack-objects <small.pack'
    -        Benchmark 1: old
    -          Time (mean ± σ):      6.075 s ±  0.069 s    [User: 5.047 s, System: 0.991 s]
    -          Range (min … max):    6.018 s …  6.189 s    10 runs
    -
    -        Benchmark 2: new
    -          Time (mean ± σ):      6.090 s ±  0.033 s    [User: 5.075 s, System: 0.976 s]
    -          Range (min … max):    6.030 s …  6.142 s    10 runs
    -
    -        Benchmark 3: new (small threshold)
    -          Time (mean ± σ):      6.755 s ±  0.029 s    [User: 5.150 s, System: 1.560 s]
    -          Range (min … max):    6.711 s …  6.809 s    10 runs
    +          --prepare 'rm -rf dest.git && git init --bare dest.git'
     
             Summary
    -          'old' ran
    -            1.00 ± 0.01 times faster than 'new'
    -            1.11 ± 0.01 times faster than 'new (small threshold)'
    +          './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
    +            1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
    +            1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
    +            1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
    +            1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
    +            1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
     
    +    Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    + ## Documentation/config/core.txt ##
    +@@ Documentation/config/core.txt: be delta compressed, but larger binary media files won't be.
    + +
    + Common unit suffixes of 'k', 'm', or 'g' are supported.
    + 
    ++core.bigFileStreamingThreshold::
    ++	Files larger than this will be streamed out to a temporary
     ++	object file while being hashed, which will then be renamed
    ++	in-place to a loose object, particularly if the
     ++	`core.bigFileThreshold` setting dictates that they're always
    ++	written out as loose objects.
    +++
    ++Default is 128 MiB on all platforms.
    +++
    ++Common unit suffixes of 'k', 'm', or 'g' are supported.
    ++
    + core.excludesFile::
    + 	Specifies the pathname to the file that contains patterns to
    + 	describe paths that are not meant to be tracked, in addition
    +
      ## builtin/unpack-objects.c ##
     @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type type,
      	}
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +
     +static void write_stream_blob(unsigned nr, unsigned long size)
     +{
    -+	char hdr[32];
    -+	int hdrlen;
     +	git_zstream zstream;
     +	struct input_zstream_data data;
     +	struct input_stream in_stream = {
     +		.read = feed_input_zstream,
     +		.data = &data,
    -+		.size = size,
     +	};
    -+	struct object_id *oid = &obj_list[nr].oid;
     +	int ret;
     +
     +	memset(&zstream, 0, sizeof(zstream));
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	data.zstream = &zstream;
     +	git_inflate_init(&zstream);
     +
    -+	/* Generate the header */
    -+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), (uintmax_t)size) + 1;
    -+
    -+	if ((ret = write_loose_object(oid, hdr, hdrlen, &in_stream, 0, 0)))
    ++	if ((ret = write_object_file_flags(&in_stream, size, type_name(OBJ_BLOB) ,&obj_list[nr].oid, HASH_STREAM)))
     +		die(_("failed to write object in stream %d"), ret);
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	git_inflate_end(&zstream);
     +
     +	if (strict && !dry_run) {
    -+		struct blob *blob = lookup_blob(the_repository, oid);
    ++		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
     +		if (blob)
     +			blob->object.flags |= FLAG_WRITTEN;
     +		else
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	void *buf;
     +
     +	/* Write large blob in stream without allocating full buffer. */
    -+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
    ++	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
     +		write_stream_blob(nr, size);
     +		return;
     +	}
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      		write_object(nr, type, buf, size);
      	else
     
    - ## object-file.c ##
    -@@ object-file.c: static const void *feed_simple_input_stream(struct input_stream *in_stream, unsi
    - 	return data->buf;
    - }
    + ## cache.h ##
    +@@ cache.h: extern size_t packed_git_window_size;
    + extern size_t packed_git_limit;
    + extern size_t delta_base_cache_limit;
    + extern unsigned long big_file_threshold;
    ++extern unsigned long big_file_streaming_threshold;
    + extern unsigned long pack_size_limit_cfg;
      
    --static int write_loose_object(const struct object_id *oid, char *hdr,
    --			      int hdrlen, struct input_stream *in_stream,
    --			      time_t mtime, unsigned flags)
    -+int write_loose_object(const struct object_id *oid, char *hdr,
    -+		       int hdrlen, struct input_stream *in_stream,
    -+		       time_t mtime, unsigned flags)
    - {
    - 	int fd, ret;
    - 	unsigned char compressed[4096];
    + /*
     
    - ## object-store.h ##
    -@@ object-store.h: int hash_object_file(const struct git_hash_algo *algo, const void *buf,
    - 		     unsigned long len, const char *type,
    - 		     struct object_id *oid);
    + ## config.c ##
    +@@ config.c: static int git_default_core_config(const char *var, const char *value, void *cb)
    + 		return 0;
    + 	}
      
    -+int write_loose_object(const struct object_id *oid, char *hdr,
    -+		       int hdrlen, struct input_stream *in_stream,
    -+		       time_t mtime, unsigned flags);
    ++	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
    ++		big_file_streaming_threshold = git_config_ulong(var, value);
    ++		return 0;
    ++	}
     +
    - int write_object_file_flags(const void *buf, unsigned long len,
    - 			    const char *type, struct object_id *oid,
    - 			    unsigned flags);
    + 	if (!strcmp(var, "core.packedgitlimit")) {
    + 		packed_git_limit = git_config_ulong(var, value);
    + 		return 0;
    +
    + ## environment.c ##
    +@@ environment.c: size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
    + size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
    + size_t delta_base_cache_limit = 96 * 1024 * 1024;
    + unsigned long big_file_threshold = 512 * 1024 * 1024;
    ++unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
    + int pager_use_color = 1;
    + const char *editor_program;
    + const char *askpass_program;
     
      ## t/t5590-unpack-non-delta-objects.sh (new) ##
     @@
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +
     +. ./test-lib.sh
     +
    -+test_expect_success "create commit with big blobs (1.5 MB)" '
    ++prepare_dest () {
    ++	test_when_finished "rm -rf dest.git" &&
    ++	git init --bare dest.git &&
    ++	git -C dest.git config core.bigFileStreamingThreshold $1
    ++	git -C dest.git config core.bigFileThreshold $1
    ++}
    ++
    ++test_expect_success "setup repo with big blobs (1.5 MB)" '
     +	test-tool genrandom foo 1500000 >big-blob &&
     +	test_commit --append foo big-blob &&
     +	test-tool genrandom bar 1500000 >big-blob &&
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +		cd .git &&
     +		find objects/?? -type f | sort
     +	) >expect &&
    -+	PACK=$(echo main | git pack-objects --progress --revs test)
    ++	PACK=$(echo main | git pack-objects --revs test)
     +'
     +
    -+test_expect_success 'setup GIT_ALLOC_LIMIT to 1MB' '
    ++test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
     +	GIT_ALLOC_LIMIT=1m &&
     +	export GIT_ALLOC_LIMIT
     +'
     +
    -+test_expect_success 'prepare dest repository' '
    -+	git init --bare dest.git &&
    -+	git -C dest.git config core.bigFileThreshold 2m &&
    -+	git -C dest.git config receive.unpacklimit 100
    -+'
    -+
     +test_expect_success 'fail to unpack-objects: cannot allocate' '
    ++	prepare_dest 2m &&
     +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    -+	test_i18ngrep "fatal: attempting to allocate" err &&
    ++	grep "fatal: attempting to allocate" err &&
     +	(
     +		cd dest.git &&
     +		find objects/?? -type f | sort
     +	) >actual &&
    ++	test_file_not_empty actual &&
     +	! test_cmp expect actual
     +'
     +
    -+test_expect_success 'set a lower bigfile threshold' '
    -+	git -C dest.git config core.bigFileThreshold 1m
    -+'
    -+
     +test_expect_success 'unpack big object in stream' '
    ++	prepare_dest 1m &&
     +	git -C dest.git unpack-objects <test-$PACK.pack &&
     +	git -C dest.git fsck &&
     +	(
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +	test_cmp expect actual
     +'
     +
    -+test_expect_success 'setup for unpack-objects dry-run test' '
    -+	git init --bare unpack-test.git
    -+'
    -+
     +test_expect_success 'unpack-objects dry-run' '
    ++	prepare_dest 1m &&
    ++	git -C dest.git unpack-objects -n <test-$PACK.pack &&
     +	(
    -+		cd unpack-test.git &&
    -+		git unpack-objects -n <../test-$PACK.pack
    -+	) &&
    -+	(
    -+		cd unpack-test.git &&
    ++		cd dest.git &&
     +		find objects/ -type f
     +	) >actual &&
     +	test_must_be_empty actual
-- 
2.34.0


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v5 1/6] object-file: refactor write_loose_object() to support read from stream
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
  2021-12-07 16:18     ` Derrick Stolee
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-10 10:34     ` [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object() Han Xin
                       ` (4 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface.

In the first step, we add a new flag called "HASH_STREAM" and make a
simple implementation that feeds the entire buffer through the stream to
"write_loose_object()", as a refactoring step.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 cache.h        | 1 +
 object-file.c  | 7 ++++++-
 object-store.h | 5 +++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/cache.h b/cache.h
index eba12487b9..51bd435dea 100644
--- a/cache.h
+++ b/cache.h
@@ -888,6 +888,7 @@ int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
 #define HASH_FORMAT_CHECK 2
 #define HASH_RENORMALIZE  4
 #define HASH_SILENT 8
+#define HASH_STREAM 16
 int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
 int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
 
diff --git a/object-file.c b/object-file.c
index eb972cdccd..06375a90d6 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1898,7 +1898,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	stream.next_in = (void *)buf;
+	if (flags & HASH_STREAM) {
+		struct input_stream *in_stream = (struct input_stream *)buf;
+		stream.next_in = (void *)in_stream->read(in_stream, &len);
+	} else {
+		stream.next_in = (void *)buf;
+	}
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..ccc1fc9c1a 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread
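
To make the interface above concrete, here is a minimal sketch (not code
from the series; "simple_stream_data" and "simple_read" are illustrative
names) of an input_stream implementation that hands out a single
in-memory buffer and then signals end-of-stream with a zero length:

	struct simple_stream_data {
		const void *buf;
		unsigned long len;
		int consumed;
	};

	static const void *simple_read(struct input_stream *in_stream,
				       unsigned long *len)
	{
		struct simple_stream_data *data = in_stream->data;

		if (data->consumed) {
			*len = 0;	/* zero length signals EOF */
			return NULL;
		}
		data->consumed = 1;
		*len = data->len;
		return data->buf;
	}

A caller would wire it up as "struct input_stream in = { .read =
simple_read, .data = &data }" and pass "&in" down the HASH_STREAM code
path.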

* [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (2 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 1/6] object-file: refactor write_loose_object() to support read from stream Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-13  7:32       ` Ævar Arnfjörð Bjarmason
  2021-12-10 10:34     ` [PATCH v5 3/6] object-file.c: read stream in a loop " Han Xin
                       ` (3 subsequent siblings)
  7 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance, so we need to handle an undetermined oid in
"write_loose_object()".

In the original implementation, we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in the ".git/objects/"
directory instead.

The promise that "oid" is constant in "write_loose_object()" has been
removed, because it will only be filled in after all the stream data
has been read.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 48 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/object-file.c b/object-file.c
index 06375a90d6..41099b137f 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,11 +1860,11 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
-static int write_loose_object(const struct object_id *oid, char *hdr,
+static int write_loose_object(struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
 {
-	int fd, ret;
+	int fd, ret, err = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1872,16 +1872,21 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 
-	loose_object_path(the_repository, &filename, oid);
+	if (flags & HASH_STREAM)
+		/* When oid is not determined, save tmp file to odb path. */
+		strbuf_addf(&filename, "%s/", get_object_directory());
+	else
+		loose_object_path(the_repository, &filename, oid);
 
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
 		if (flags & HASH_SILENT)
-			return -1;
+			err = -1;
 		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
 		else
-			return error_errno(_("unable to create temporary file"));
+			err = error_errno(_("unable to create temporary file"));
+		goto cleanup;
 	}
 
 	/* Set it up */
@@ -1923,12 +1928,34 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
-	if (!oideq(oid, &parano_oid))
+	if (!(flags & HASH_STREAM) && !oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
+	if (flags & HASH_STREAM) {
+		int dirlen;
+
+		oidcpy((struct object_id *)oid, &parano_oid);
+		loose_object_path(the_repository, &filename, oid);
+
+		/* We finally know the object path, and create the missing dir. */
+		dirlen = directory_size(filename.buf);
+		if (dirlen) {
+			struct strbuf dir = STRBUF_INIT;
+			strbuf_add(&dir, filename.buf, dirlen - 1);
+			if (mkdir(dir.buf, 0777) && errno != EEXIST)
+				err = -1;
+			else if (adjust_shared_perm(dir.buf))
+				err = -1;
+			else
+				strbuf_release(&dir);
+			if (err < 0)
+				goto cleanup;
+		}
+	}
+
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
@@ -1938,7 +1965,10 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 			warning_errno(_("failed utime() on %s"), tmp_file.buf);
 	}
 
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&filename);
+	return err;
 }
 
 static int freshen_loose_object(const struct object_id *oid)
@@ -2015,7 +2045,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object((struct object_id*) oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
 	return ret;
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread
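
To make the tmpfile dance above concrete: the order of operations the
patch depends on is hash first, then derive the final path, create the
missing fan-out directory, and rename the tmpfile into place. Here is a
self-contained POSIX sketch of that sequence (plain snprintf(), mkdir()
and rename(); git's real helpers are loose_object_path() and
finalize_object_file(), and "finalize_streamed_object" is a
hypothetical name):

	#include <stdio.h>
	#include <errno.h>
	#include <sys/stat.h>

	static int finalize_streamed_object(const char *tmp_path, const char *hex)
	{
		char dir[64], path[128];

		/* fan-out directory, e.g. ".git/objects/ab" for "ab..." */
		snprintf(dir, sizeof(dir), ".git/objects/%.2s", hex);
		if (mkdir(dir, 0777) < 0 && errno != EEXIST)
			return -1;
		snprintf(path, sizeof(path), "%s/%s", dir, hex + 2);
		/* only now does the tmpfile get its oid-derived name */
		return rename(tmp_path, path);
	}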

* [PATCH v5 3/6] object-file.c: read stream in a loop in write_loose_object()
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (3 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-10 10:34     ` [PATCH v5 4/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
                       ` (2 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In order to prepare the stream version of "write_loose_object()", read
the input stream in a loop in "write_loose_object()", so that we can
feed the contents of a large blob object to "write_loose_object()" using
a small fixed buffer.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/object-file.c b/object-file.c
index 41099b137f..455ab3c06e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1864,7 +1864,7 @@ static int write_loose_object(struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
 {
-	int fd, ret, err = 0;
+	int fd, ret, err = 0, flush = 0;
 	unsigned char compressed[4096];
 	git_zstream stream;
 	git_hash_ctx c;
@@ -1903,22 +1903,29 @@ static int write_loose_object(struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
-	if (flags & HASH_STREAM) {
-		struct input_stream *in_stream = (struct input_stream *)buf;
-		stream.next_in = (void *)in_stream->read(in_stream, &len);
-	} else {
+	if (!(flags & HASH_STREAM)) {
 		stream.next_in = (void *)buf;
+		stream.avail_in = len;
+		flush = Z_FINISH;
 	}
-	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
+		if (flags & HASH_STREAM && !stream.avail_in) {
+			struct input_stream *in_stream = (struct input_stream *)buf;
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (len + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
 		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
 		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
 			die(_("unable to write loose object file"));
 		stream.next_out = compressed;
 		stream.avail_out = sizeof(compressed);
-	} while (ret == Z_OK);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread
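
The loop above is subtle: zlib's Z_BUF_ERROR is not fatal here, it only
means "no progress was possible, feed more input or drain output", which
is exactly what the refill step then does. The following self-contained
sketch shows the same shape against plain zlib (not git's git_deflate()
wrapper; "pull_fn" is an illustrative stand-in for input_stream->read):

	#include <stdio.h>
	#include <zlib.h>

	typedef size_t (*pull_fn)(void *ctx, unsigned char *buf, size_t len);

	static int deflate_in_chunks(pull_fn pull, void *ctx, FILE *out)
	{
		z_stream s = { 0 };
		unsigned char in[8192], obuf[8192];
		int flush = Z_NO_FLUSH, ret;

		if (deflateInit(&s, Z_DEFAULT_COMPRESSION) != Z_OK)
			return -1;
		do {
			/* refill the fixed input window when it runs dry */
			if (!s.avail_in && flush != Z_FINISH) {
				size_t n = pull(ctx, in, sizeof(in));
				s.next_in = in;
				s.avail_in = (uInt)n;
				if (!n)
					flush = Z_FINISH; /* EOF: wrap up */
			}
			s.next_out = obuf;
			s.avail_out = sizeof(obuf);
			ret = deflate(&s, flush);
			fwrite(obuf, 1, sizeof(obuf) - s.avail_out, out);
		} while (ret == Z_OK || ret == Z_BUF_ERROR);
		deflateEnd(&s);
		return ret == Z_STREAM_END ? 0 : -1;
	}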

* [PATCH v5 4/6] unpack-objects.c: add dry_run mode for get_data()
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (4 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 3/6] object-file.c: read stream in a loop " Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-10 10:34     ` [PATCH v5 5/6] object-file.c: make "write_object_file_flags()" to support "HASH_STREAM" Han Xin
  2021-12-10 10:34     ` [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used only to verify that the data
inflates cleanly; the returned buffer is not used at all and is freed
immediately. Even so, it is dangerous to allocate a full-size buffer
for a large blob object. Therefore, allocate only a small buffer when
calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..d878e2f8b4 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,16 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +125,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +329,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +363,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +402,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread
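
The buffer-reuse trick in the patch generalizes: to merely verify that a
zlib stream inflates cleanly, rewind next_out/avail_out to the same
small scratch window after every inflate() call and keep nothing. A
self-contained sketch of that idea over an in-memory buffer
("inflates_cleanly" is an illustrative name, not from the series):

	#include <zlib.h>

	static int inflates_cleanly(const unsigned char *zbuf, unsigned long zlen)
	{
		z_stream s = { 0 };
		unsigned char scratch[8192];
		int ret;

		if (inflateInit(&s) != Z_OK)
			return 0;
		s.next_in = (unsigned char *)zbuf;
		s.avail_in = (uInt)zlen;
		do {
			/* dry run: always overwrite the same window */
			s.next_out = scratch;
			s.avail_out = sizeof(scratch);
			ret = inflate(&s, Z_NO_FLUSH);
		} while (ret == Z_OK);
		inflateEnd(&s);
		return ret == Z_STREAM_END;
	}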

* [PATCH v5 5/6] object-file.c: make "write_object_file_flags()" to support "HASH_STREAM"
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (5 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 4/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-10 10:34     ` [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We will use "write_object_file_flags()" in "unpack_non_delta_entry()" to
read the entire data contents in a stream. When reading in a stream, we
need not prepare the "oid" before "write_loose_object()"; we only need
to generate the header.

Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/object-file.c b/object-file.c
index 455ab3c06e..906590dae5 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2002,6 +2002,11 @@ int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	if (flags & HASH_STREAM) {
+		/* Generate the header */
+		hdrlen = xsnprintf(hdr, hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+		return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	}
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread
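
For concreteness, the header generated above is the standard
loose-object header: the type name, a space, the decimal size, and a
trailing NUL, and that NUL byte is included in what gets hashed. A
stand-alone equivalent of that one-liner (a sketch; git's version uses
xsnprintf(), which dies on truncation):

	#include <stdio.h>

	static int fill_loose_header(char *hdr, size_t bufsz,
				     const char *type, unsigned long size)
	{
		/* type="blob", size=1234 yields the 10 bytes "blob 1234\0" */
		return snprintf(hdr, bufsz, "%s %lu", type, size) + 1;
	}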

* [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-03  9:35   ` [PATCH v4 " Han Xin
                       ` (6 preceding siblings ...)
  2021-12-10 10:34     ` [PATCH v5 5/6] object-file.c: make "write_object_file_flags()" to support "HASH_STREAM" Han Xin
@ 2021-12-10 10:34     ` Han Xin
  2021-12-13  8:05       ` Ævar Arnfjörð Bjarmason
  7 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-10 10:34 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of the input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()".

However, unpack non-delta objects from a stream instead of from an entire
buffer will have a 10% performance penalty. Therefore, only unpack objects
larger than the "core.bigFileStreamingThreshold" in a zstream. See the following
benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git'

    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 Documentation/config/core.txt       | 11 +++++
 builtin/unpack-objects.c            | 70 ++++++++++++++++++++++++++++-
 cache.h                             |  1 +
 config.c                            |  5 +++
 environment.c                       |  1 +
 t/t5590-unpack-non-delta-objects.sh | 70 +++++++++++++++++++++++++++++
 6 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a..601b7a2418 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will then be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold` setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index d878e2f8b4..0df115ab0d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -326,11 +326,79 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	int ret;
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if ((ret = write_object_file_flags(&in_stream, size, type_name(OBJ_BLOB) ,&obj_list[nr].oid, HASH_STREAM)))
+		die(_("failed to write object in stream %d"), ret);
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict && !dry_run) {
+		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die("invalid blob object from stream");
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/cache.h b/cache.h
index 51bd435dea..78548cd67a 100644
--- a/cache.h
+++ b/cache.h
@@ -965,6 +965,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a70..7b122a142a 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 9da7f3c1a1..4fcc3de741 100644
--- a/environment.c
+++ b/environment.c
@@ -46,6 +46,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..ff4c78900b
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,70 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receiving a pack'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileStreamingThreshold $1
+	git -C dest.git config core.bigFileThreshold $1
+}
+
+test_expect_success "setup repo with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --revs test)
+'
+
+test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	prepare_dest 2m &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_file_not_empty actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 211+ messages in thread
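
The documentation above says common unit suffixes of 'k', 'm', or 'g'
are supported; git parses these via git_config_ulong(). Purely to make
the suffix behavior concrete, a hedged stand-alone sketch of such
parsing (not git's actual parser):

	#include <stdlib.h>

	static unsigned long parse_threshold(const char *s)
	{
		char *end;
		unsigned long v = strtoul(s, &end, 10);

		switch (*end) {
		case 'k': case 'K': return v << 10;	/* KiB */
		case 'm': case 'M': return v << 20;	/* MiB */
		case 'g': case 'G': return v << 30;	/* GiB */
		default: return v;
		}
	}

With this, "16k" yields 16384, and the 128 MiB default corresponds to
"128m".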

* Re: [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object()
  2021-12-10 10:34     ` [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object() Han Xin
@ 2021-12-13  7:32       ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-13  7:32 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 10 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance, so we need to handle an undetermined oid in
> "write_loose_object()".
>
> In the original implementation, we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in the ".git/objects/"
> directory instead.
>
> The promise that "oid" is constant in "write_loose_object()" has been
> removed, because it will only be filled in after all the stream data
> has been read.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 48 +++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 39 insertions(+), 9 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 06375a90d6..41099b137f 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,11 +1860,11 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> -static int write_loose_object(const struct object_id *oid, char *hdr,
> +static int write_loose_object(struct object_id *oid, char *hdr,
>  			      int hdrlen, const void *buf, unsigned long len,
>  			      time_t mtime, unsigned flags)
>  {
> -	int fd, ret;
> +	int fd, ret, err = 0;
>  	unsigned char compressed[4096];
>  	git_zstream stream;
>  	git_hash_ctx c;
> @@ -1872,16 +1872,21 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  	static struct strbuf tmp_file = STRBUF_INIT;
>  	static struct strbuf filename = STRBUF_INIT;
>  
> -	loose_object_path(the_repository, &filename, oid);
> +	if (flags & HASH_STREAM)
> +		/* When oid is not determined, save tmp file to odb path. */
> +		strbuf_addf(&filename, "%s/", get_object_directory());
> +	else
> +		loose_object_path(the_repository, &filename, oid);
>  
>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
>  		if (flags & HASH_SILENT)
> -			return -1;
> +			err = -1;
>  		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +			err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
>  		else
> -			return error_errno(_("unable to create temporary file"));
> +			err = error_errno(_("unable to create temporary file"));
> +		goto cleanup;
>  	}
>  
>  	/* Set it up */
> @@ -1923,12 +1928,34 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -	if (!oideq(oid, &parano_oid))
> +	if (!(flags & HASH_STREAM) && !oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));

Here we don't have a meaningful "const" OID anymore, but if we die we
still use the "oid".

>  	close_loose_object(fd);
>  
> +	if (flags & HASH_STREAM) {
> +		int dirlen;
> +
> +		oidcpy((struct object_id *)oid, &parano_oid);

This cast isn't needed anymore now that you stripped the "const" off,
but more on that later...

> +		loose_object_path(the_repository, &filename, oid);
> +
> +		/* We finally know the object path, and create the missing dir. */
> +		dirlen = directory_size(filename.buf);
> +		if (dirlen) {
> +			struct strbuf dir = STRBUF_INIT;
> +			strbuf_add(&dir, filename.buf, dirlen - 1);
> +			if (mkdir(dir.buf, 0777) && errno != EEXIST)
> +				err = -1;
> +			else if (adjust_shared_perm(dir.buf))
> +				err = -1;
> +			else
> +				strbuf_release(&dir);
> +			if (err < 0)
> +				goto cleanup;

Can't we use one of the existing utility functions for this? Testing
locally I could replace this with:
	
	diff --git a/object-file.c b/object-file.c
	index 7c93db11b2d..05e1fae893d 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1952,14 +1952,11 @@ static int write_loose_object(struct object_id *oid, char *hdr,
	 		if (dirlen) {
	 			struct strbuf dir = STRBUF_INIT;
	 			strbuf_add(&dir, filename.buf, dirlen - 1);
	-			if (mkdir(dir.buf, 0777) && errno != EEXIST)
	+			
	+			if (mkdir_in_gitdir(dir.buf) < 0) {
	 				err = -1;
	-			else if (adjust_shared_perm(dir.buf))
	-				err = -1;
	-			else
	-				strbuf_release(&dir);
	-			if (err < 0)
	 				goto cleanup;
	+			}
	 		}
	 	}

And your tests still pass. Maybe they have a blind spot, or maybe we can
just use the existing function.
	 
> +		}
> +	}
> +
>  	if (mtime) {
>  		struct utimbuf utb;
>  		utb.actime = mtime;
> @@ -1938,7 +1965,10 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  			warning_errno(_("failed utime() on %s"), tmp_file.buf);
>  	}
>  
> -	return finalize_object_file(tmp_file.buf, filename.buf);
> +	err = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +	strbuf_release(&filename);
> +	return err;
>  }

Reading this series is an odd mixture of things that would really be
much easier to understand if they were combined, e.g. 1/6 adds APIs
that aren't used by anything, and then adds one codepath (also
unused) that we only use later. We could just add those at the same
time as their use, and the patch would be easier to read....

...and then this, which *is* something that could be split up into an
earlier cleanup step, i.e. the strbuf leak here exists before this
series, fixing it is good, but splitting that up into its own patch
would make this diff smaller & the actual behavior changes easier to
reason about.

>  static int freshen_loose_object(const struct object_id *oid)
> @@ -2015,7 +2045,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
>  	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> -	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
> +	ret = write_loose_object((struct object_id*) oid, hdr, hdrlen, buf, len, mtime, 0);
>  	free(buf);
>  
>  	return ret;

 ...on the "more on that later", here we're casting the "oid" from const
 for a function that's never going to be involved in the streaming
 codepath.

I know I suggested the HASH_STREAM flag, but what I was really going for
was "let's share more of the code?", looking at this v5 (which is
already much better than v4) I think a better approach is to split up
write_loose_object().

I.e. it already calls close_loose_object() and finalize_object_file() to
do some of its work, but around that we have:

 1. Figuring out a path for the (temp) object file
 2. Creating the tempfile
 3. Setting up zlib
 4. Once zlib is set up inspect its state, die with a message
    about oid_to_hex(oid) if we failed
 5. Optionally, do HASH_STREAM stuff
    Maybe force a loose object if "mtime".

I think if that's split up so that each of those is its own little
function what's now write_loose_object() can call those in sequence, and
a new stream_loose_object() can just do #1 differently, followed by the
same #2 and #3, but do #4 differently, etc.

You'll still be able to re-use the write_object_file_prepare()
etc. logic.

As an example, your 5/6 copy/pastes the xsnprintf() formatting of the
object header. It's just one line, but it's also code that's very
central to git, so instead of copy/pasting it I think a preparatory
step that factors it out would make sense; that cleanup would help
later readability. E.g.:
	
	diff --git a/object-file.c b/object-file.c
	index eac67f6f5f9..a7dcbd929e9 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1009,6 +1009,13 @@ void *xmmap(void *start, size_t length,
	 	return ret;
	 }
	 
	+static int generate_object_header(char *buf, int bufsz, const char *type_name,
	+				  unsigned long size)
	+{
	+	return xsnprintf(buf, bufsz, "%s %"PRIuMAX , type_name,
	+			 (uintmax_t)size) + 1;
	+}
	+
	 /*
	  * With an in-core object data in "map", rehash it to make sure the
	  * object name actually matches "oid" to detect object corruption.
	@@ -1037,7 +1044,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
	 		return -1;
	 
	 	/* Generate the header */
	-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
	+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(obj_type), size);
	 
	 	/* Sha1.. */
	 	r->hash_algo->init_fn(&c);
	@@ -1737,7 +1744,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
	 	git_hash_ctx c;
	 
	 	/* Generate the header */
	-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
	+	*hdrlen = generate_object_header(hdr, *hdrlen, type, len);
	 
	 	/* Sha1.. */
	 	algo->init_fn(&c);
	@@ -2009,7 +2016,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
	 	buf = read_object(the_repository, oid, &type, &len);
	 	if (!buf)
	 		return error(_("cannot read object for %s"), oid_to_hex(oid));
	-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
	+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(type), len);
	 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
	 	free(buf);

Then in your change on top you just call that generate_object_header(),
or better yet your amended write_object_file_flags() can just call a
similarly amended write_object_file_prepare() directly.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-10 10:34     ` [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-13  8:05       ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-13  8:05 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin


On Fri, Dec 10 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [...]
> +	if ((ret = write_object_file_flags(&in_stream, size, type_name(OBJ_BLOB) ,&obj_list[nr].oid, HASH_STREAM)))

There's some odd code formatting here, i.e. ") ,&" not "), &". Could
also use line-wrapping at 79 characters.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v5 0/6] unpack large blobs in stream
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-21 11:51         ` [PATCH v7 0/5] " Han Xin
                           ` (12 more replies)
  2021-12-17 11:26       ` [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object() Han Xin
                         ` (5 subsequent siblings)
  6 siblings, 13 replies; 211+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v5:
* Refactor write_loose_object() so that it can be reused by the stream
  version, as suggested by Ævar Arnfjörð Bjarmason [1].

* Add a new testcase into t5590-unpack-non-delta-objects to cover the case of
  unpacking existing objects.

* Fix code formatting in unpack-objects.c, as suggested by
  Ævar Arnfjörð Bjarmason [2].

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
2. https://lore.kernel.org/git/211213.867dc8ansq.gmgdl@evledraar.gmail.com/

Han Xin (6):
  object-file.c: release strbuf in write_loose_object()
  object-file.c: refactor object header generation into a function
  object-file.c: refactor write_loose_object() to reuse in stream
    version
  object-file.c: make "write_object_file_flags()" to support read in
    stream
  unpack-objects.c: add dry_run mode for get_data()
  unpack-objects: unpack_non_delta_entry() read data in a stream

 Documentation/config/core.txt       |  11 ++
 builtin/unpack-objects.c            |  94 ++++++++++++-
 cache.h                             |   2 +
 config.c                            |   5 +
 environment.c                       |   1 +
 object-file.c                       | 207 +++++++++++++++++++++++-----
 object-store.h                      |   5 +
 t/t5590-unpack-non-delta-objects.sh |  87 ++++++++++++
 8 files changed, 370 insertions(+), 42 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v5:
1:  f3595e68cc < -:  ---------- object-file: refactor write_loose_object() to support read from stream
2:  c25fdd1fe5 < -:  ---------- object-file.c: handle undetermined oid in write_loose_object()
3:  ed226f2f9f < -:  ---------- object-file.c: read stream in a loop in write_loose_object()
-:  ---------- > 1:  59d35dac5f object-file.c: release strbuf in write_loose_object()
-:  ---------- > 2:  2174a6cbad object-file.c: refactor object header generation into a function
-:  ---------- > 3:  8a704ecc59 object-file.c: refactor write_loose_object() to reuse in stream version
-:  ---------- > 4:  96f05632a2 object-file.c: make "write_object_file_flags()" to support read in stream
4:  2f91e540f6 ! 5:  1acbb6e849 unpack-objects.c: add dry_run mode for get_data()
    @@ builtin/unpack-objects.c: static void use(int bytes)
      {
      	git_zstream stream;
     -	void *buf = xmallocz(size);
    -+	unsigned long bufsize = dry_run ? 8192 : size;
    -+	void *buf = xmallocz(bufsize);
    ++	unsigned long bufsize;
    ++	void *buf;
      
      	memset(&stream, 0, sizeof(stream));
    ++	if (dry_run && size > 8192)
    ++		bufsize = 8192;
    ++	else
    ++		bufsize = size;
    ++	buf = xmallocz(bufsize);
      
      	stream.next_out = buf;
     -	stream.avail_out = size;
5:  7698938eac < -:  ---------- object-file.c: make "write_object_file_flags()" to support "HASH_STREAM"
6:  92d69cb84a ! 6:  476aaba527 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	int status;
     +};
     +
    -+static const void *feed_input_zstream(struct input_stream *in_stream, unsigned long *readlen)
    ++static const void *feed_input_zstream(const struct input_stream *in_stream,
    ++				      unsigned long *readlen)
     +{
     +	struct input_zstream_data *data = in_stream->data;
     +	git_zstream *zstream = data->zstream;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +		.read = feed_input_zstream,
     +		.data = &data,
     +	};
    -+	int ret;
     +
     +	memset(&zstream, 0, sizeof(zstream));
     +	memset(&data, 0, sizeof(data));
     +	data.zstream = &zstream;
     +	git_inflate_init(&zstream);
     +
    -+	if ((ret = write_object_file_flags(&in_stream, size, type_name(OBJ_BLOB) ,&obj_list[nr].oid, HASH_STREAM)))
    -+		die(_("failed to write object in stream %d"), ret);
    ++	if (write_object_file_flags(&in_stream, size,
    ++				    type_name(OBJ_BLOB),
    ++				    &obj_list[nr].oid,
    ++				    HASH_STREAM))
    ++		die(_("failed to write object in stream"));
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
     +		die(_("inflate returned %d"), data.status);
     +	git_inflate_end(&zstream);
     +
    -+	if (strict && !dry_run) {
    ++	if (strict) {
     +		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
     +		if (blob)
     +			blob->object.flags |= FLAG_WRITTEN;
     +		else
    -+			die("invalid blob object from stream");
    ++			die(_("invalid blob object from stream"));
     +	}
     +	obj_list[nr].obj = NULL;
     +}
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +prepare_dest () {
     +	test_when_finished "rm -rf dest.git" &&
     +	git init --bare dest.git &&
    -+	git -C dest.git config core.bigFileStreamingThreshold $1
    ++	git -C dest.git config core.bigFileStreamingThreshold $1 &&
     +	git -C dest.git config core.bigFileThreshold $1
     +}
     +
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +	test_cmp expect actual
     +'
     +
    ++test_expect_success 'unpack big object in stream with existing oids' '
    ++	prepare_dest 1m &&
    ++	git -C dest.git index-pack --stdin <test-$PACK.pack &&
    ++	(
    ++		cd dest.git &&
    ++		find objects/?? -type f | sort
    ++	) >actual &&
    ++	test_must_be_empty actual &&
    ++	git -C dest.git unpack-objects <test-$PACK.pack &&
    ++	git -C dest.git fsck &&
    ++	(
    ++		cd dest.git &&
    ++		find objects/?? -type f | sort
    ++	) >actual &&
    ++	test_must_be_empty actual
    ++'
    ++
     +test_expect_success 'unpack-objects dry-run' '
     +	prepare_dest 1m &&
     +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object()
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
  2021-12-17 11:26       ` Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-17 19:28         ` René Scharfe
  2021-12-17 11:26       ` [PATCH v6 2/6] object-file.c: refactor object header generation into a function Han Xin
                         ` (4 subsequent siblings)
  6 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Fix a strbuf leak in "write_loose_object()", as suggested by
Ævar Arnfjörð Bjarmason.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb1426f98c..32acf1dad6 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1874,11 +1874,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	fd = create_tmpfile(&tmp_file, filename.buf);
 	if (fd < 0) {
 		if (flags & HASH_SILENT)
-			return -1;
+			ret = -1;
 		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
+			ret = error(_("insufficient permission for adding an "
+				      "object to repository database %s"),
+				    get_object_directory());
 		else
-			return error_errno(_("unable to create temporary file"));
+			ret = error_errno(_("unable to create temporary file"));
+		goto cleanup;
 	}
 
 	/* Set it up */
@@ -1930,7 +1933,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 			warning_errno(_("failed utime() on %s"), tmp_file.buf);
 	}
 
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	ret = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&filename);
+	strbuf_release(&tmp_file);
+	return ret;
 }
 
 static int freshen_loose_object(const struct object_id *oid)
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v6 2/6] object-file.c: refactor object header generation into a function
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
  2021-12-17 11:26       ` Han Xin
  2021-12-17 11:26       ` [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object() Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
  2021-12-17 11:26       ` [PATCH v6 3/6] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
                         ` (3 subsequent siblings)
  6 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

There are 3 places where "xsnprintf" is used to generate the object
header, and I originally planned to add a fourth in a later patch.

According to Ævar Arnfjörð Bjarmason’s suggestion, although it's just
one line, it's also code that's very central to git, so refactor them
into a function, which will help readability later.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/object-file.c b/object-file.c
index 32acf1dad6..95fcd5435d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+static inline int generate_object_header(char *buf, int bufsz,
+					 const char *type_name,
+					 unsigned long size)
+{
+	return xsnprintf(buf, bufsz, "%s %"PRIuMAX, type_name,
+			 (uintmax_t)size) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(obj_type), size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = generate_object_header(hdr, *hdrlen, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2013,7 +2021,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = generate_object_header(hdr, sizeof(hdr), type_name(type), len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v6 3/6] object-file.c: refactor write_loose_object() to reuse in stream version
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                         ` (2 preceding siblings ...)
  2021-12-17 11:26       ` [PATCH v6 2/6] object-file.c: refactor object header generation into a function Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-17 11:26       ` [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream Han Xin
                         ` (2 subsequent siblings)
  6 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "stream_loose_object()" in a
stream instead of reading it all into one buffer.

As this new method "stream_loose_object()" has many similarities with
"write_loose_object()", we split "write_loose_object()" into several
steps (sketched after this list):
 1. Figuring out a path for the (temp) object file.
 2. Creating the tempfile.
 3. Setting up zlib and writing the header.
 4. Writing the object data and handling errors.
 5. Optionally, doing something after the write, such as setting the
    "mtime" when forcing a loose object.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 98 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/object-file.c b/object-file.c
index 95fcd5435d..dd29e5372e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1751,6 +1751,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	algo->final_oid_fn(oid, &c);
 }
 
+/*
+ * Move the just written object with proper mtime into its final resting place.
+ */
+static int finalize_object_file_with_mtime(const char *tmpfile,
+					   const char *filename,
+					   time_t mtime,
+					   unsigned flags)
+{
+	struct utimbuf utb;
+
+	if (mtime) {
+		utb.actime = mtime;
+		utb.modtime = mtime;
+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
+			warning_errno(_("failed utime() on %s"), tmpfile);
+	}
+	return finalize_object_file(tmpfile, filename);
+}
+
 /*
  * Move the just written object into its final resting place.
  */
@@ -1836,7 +1855,8 @@ static inline int directory_size(const char *filename)
  * We want to avoid cross-directory filename renames, because those
  * can have problems on various filesystems (FAT, NFS, Coda).
  */
-static int create_tmpfile(struct strbuf *tmp, const char *filename)
+static int create_tmpfile(struct strbuf *tmp, const char *filename,
+			  unsigned flags)
 {
 	int fd, dirlen = directory_size(filename);
 
@@ -1844,7 +1864,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	strbuf_add(tmp, filename, dirlen);
 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
-	if (fd < 0 && dirlen && errno == ENOENT) {
+	do {
+		if (fd >= 0 || !dirlen || errno != ENOENT)
+			break;
 		/*
 		 * Make sure the directory exists; note that the contents
 		 * of the buffer are undefined after mkstemp returns an
@@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 		strbuf_reset(tmp);
 		strbuf_add(tmp, filename, dirlen - 1);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
-			return -1;
+			break;
 		if (adjust_shared_perm(tmp->buf))
-			return -1;
+			break;
 
 		/* Try again */
 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 		fd = git_mkstemp_mode(tmp->buf, 0444);
+	} while (0);
+
+	if (fd < 0 && !(flags & HASH_SILENT)) {
+		if (errno == EACCES)
+			return error(_("insufficient permission for adding an "
+				       "object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(_("unable to create temporary file"));
 	}
+
 	return fd;
 }
 
+static void setup_stream_and_header(git_zstream *stream,
+				    unsigned char *compressed,
+				    unsigned long compressed_size,
+				    git_hash_ctx *c,
+				    char *hdr,
+				    int hdrlen)
+{
+	/* Set it up */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = compressed;
+	stream->avail_out = compressed_size;
+	the_hash_algo->init_fn(c);
+
+	/* First header.. */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1879,31 +1932,15 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
 	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			ret = -1;
-		else if (errno == EACCES)
-			ret = error(_("insufficient permission for adding an "
-				      "object to repository database %s"),
-				    get_object_directory());
-		else
-			ret = error_errno(_("unable to create temporary file"));
+		ret = -1;
 		goto cleanup;
 	}
 
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1932,16 +1969,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	close_loose_object(fd);
 
-	if (mtime) {
-		struct utimbuf utb;
-		utb.actime = mtime;
-		utb.modtime = mtime;
-		if (utime(tmp_file.buf, &utb) < 0 &&
-		    !(flags & HASH_SILENT))
-			warning_errno(_("failed utime() on %s"), tmp_file.buf);
-	}
-
-	ret = finalize_object_file(tmp_file.buf, filename.buf);
+	ret = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
 cleanup:
 	strbuf_release(&filename);
 	strbuf_release(&tmp_file);
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                         ` (3 preceding siblings ...)
  2021-12-17 11:26       ` [PATCH v6 3/6] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-17 22:52         ` René Scharfe
  2021-12-17 11:26       ` [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-12-17 11:26       ` [PATCH v6 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "stream_loose_object()" in a
stream. The input stream is implemented as an interface.

When streaming a large blob object to "write_loose_object()", we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance. So we need to handle an undetermined oid in a new function
called "stream_loose_object()".

In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in the ".git/objects/"
directory instead.

We will reuse "write_object_file_flags()" in "unpack_non_delta_entry()"
to read the entire data contents in a stream, so a new flag
"HASH_STREAM" is added. When reading in a stream, we needn't prepare
the "oid" before "write_loose_object()"; we only generate the header.
"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after the "oid" has been obtained.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 cache.h        |  1 +
 object-file.c  | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |  5 +++
 3 files changed, 98 insertions(+)

diff --git a/cache.h b/cache.h
index cfba463aa9..6d68fd10a3 100644
--- a/cache.h
+++ b/cache.h
@@ -898,6 +898,7 @@ int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
 #define HASH_FORMAT_CHECK 2
 #define HASH_RENORMALIZE  4
 #define HASH_SILENT 8
+#define HASH_STREAM 16
 int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
 int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
 
diff --git a/object-file.c b/object-file.c
index dd29e5372e..2ef1d4fb00 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1994,6 +1994,88 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
+			       const struct input_stream *in_stream,
+			       unsigned long len, time_t mtime, unsigned flags)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct object_id parano_oid;
+	static struct strbuf tmp_file = STRBUF_INIT;
+	static struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+
+	/* When oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (len + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (ret != Z_STREAM_END)
+		die(_("unable to deflate new object streamingly (%d)"), ret);
+	ret = git_deflate_end_gently(&stream);
+	if (ret != Z_OK)
+		die(_("deflateEnd on object streamingly failed (%d)"), ret);
+	the_hash_algo->final_oid_fn(&parano_oid, &c);
+
+	close_loose_object(fd);
+
+	oidcpy(oid, &parano_oid);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen - 1);
+
+		if (mkdir_in_gitdir(dir.buf) < 0) {
+			err = -1;
+			goto cleanup;
+		}
+	}
+
+	err = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags)
@@ -2001,6 +2083,16 @@ int write_object_file_flags(const void *buf, unsigned long len,
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
 
+	/* When streaming a large blob object (marked as HASH_STREAM),
+	 * we have no chance to run "write_object_file_prepare()" to
+	 * calculate the "oid" in advance.  Call "stream_loose_object()"
+	 * to write loose object in stream.
+	 */
+	if (flags & HASH_STREAM) {
+		hdrlen = generate_object_header(hdr, hdrlen, type, len);
+		return stream_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	}
+
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
 	 */
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..4040e2c40a 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(const struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data()
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                         ` (4 preceding siblings ...)
  2021-12-17 11:26       ` [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream Han Xin
@ 2021-12-17 11:26       ` Han Xin
  2021-12-17 21:22         ` René Scharfe
  2021-12-17 11:26       ` [PATCH v6 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  6 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used to verify the inflation of data;
the returned buffer will not be used at all and will be freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, allocate only a
small buffer when calling "get_data()" in dry_run mode.
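
The trick is to inflate into one fixed scratch window that is rewound
on every pass. A self-contained sketch of the idea in plain zlib (the
patch itself uses git's git_inflate wrappers):

	#include <zlib.h>

	/* Returns 0 if "in" is a complete, valid deflate stream. */
	static int verify_inflate(const unsigned char *in, unsigned long inlen)
	{
		unsigned char scratch[8192];
		z_stream z = { 0 };
		int ret;

		if (inflateInit(&z) != Z_OK)
			return -1;
		z.next_in = (unsigned char *)in;
		z.avail_in = inlen;
		do {
			/* Reuse the same output window on every pass. */
			z.next_out = scratch;
			z.avail_out = sizeof(scratch);
			ret = inflate(&z, Z_NO_FLUSH);
		} while (ret == Z_OK);
		inflateEnd(&z);
		return ret == Z_STREAM_END ? 0 : -1;
	}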

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..c4a17bdb44 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,21 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(unsigned long size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +130,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +334,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +368,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +407,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v6 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
                         ` (5 preceding siblings ...)
  2021-12-17 11:26       ` [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-17 11:26       ` Han Xin
  6 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-17 11:26 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of the input_stream interface, we can
use a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an
entire buffer comes with a roughly 10% performance penalty. Therefore,
only unpack objects larger than "core.bigFileStreamingThreshold" in a
zstream. See the following benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git'

    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 Documentation/config/core.txt       | 11 ++++
 builtin/unpack-objects.c            | 73 +++++++++++++++++++++++-
 cache.h                             |  1 +
 config.c                            |  5 ++
 environment.c                       |  1 +
 t/t5590-unpack-non-delta-objects.sh | 87 +++++++++++++++++++++++++++++
 6 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a..601b7a2418 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will then be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold` setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index c4a17bdb44..42e1033d85 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -331,11 +331,82 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(const struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, unsigned long size)
+{
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (write_object_file_flags(&in_stream, size,
+				    type_name(OBJ_BLOB),
+				    &obj_list[nr].oid,
+				    HASH_STREAM))
+		die(_("failed to write object in stream"));
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/cache.h b/cache.h
index 6d68fd10a3..976f9cf656 100644
--- a/cache.h
+++ b/cache.h
@@ -975,6 +975,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a70..7b122a142a 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 0d06a31024..04bba593de 100644
--- a/environment.c
+++ b/environment.c
@@ -47,6 +47,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..11c70e192c
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,87 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects when receiving packs'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileStreamingThreshold $1 &&
+	git -C dest.git config core.bigFileThreshold $1
+}
+
+test_expect_success "setup repo with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --revs test)
+'
+
+test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	prepare_dest 2m &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_file_not_empty actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream with existing oids' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.1.52.gfcc2252aea.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object()
  2021-12-17 11:26       ` [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object() Han Xin
@ 2021-12-17 19:28         ` René Scharfe
  2021-12-18  0:09           ` Junio C Hamano
  0 siblings, 1 reply; 211+ messages in thread
From: René Scharfe @ 2021-12-17 19:28 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 17.12.21 um 12:26 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Fix a strbuf leak in "write_loose_object()", as suggested by
> Ævar Arnfjörð Bjarmason.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb1426f98c..32acf1dad6 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1874,11 +1874,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr,


Relevant context lines:

	static struct strbuf tmp_file = STRBUF_INIT;
	static struct strbuf filename = STRBUF_INIT;

	loose_object_path(the_repository, &filename, oid);

>  	fd = create_tmpfile(&tmp_file, filename.buf);
>  	if (fd < 0) {
>  		if (flags & HASH_SILENT)
> -			return -1;
> +			ret = -1;
>  		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> +			ret = error(_("insufficient permission for adding an "
> +				      "object to repository database %s"),
> +				    get_object_directory());
>  		else
> -			return error_errno(_("unable to create temporary file"));
> +			ret = error_errno(_("unable to create temporary file"));
> +		goto cleanup;
>  	}
>
>  	/* Set it up */
> @@ -1930,7 +1933,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  			warning_errno(_("failed utime() on %s"), tmp_file.buf);
>  	}
>
> -	return finalize_object_file(tmp_file.buf, filename.buf);
> +	ret = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +	strbuf_release(&filename);
> +	strbuf_release(&tmp_file);

There was no leak before.  Both strbufs are static and both functions
they are passed to (loose_object_path() and create_tmpfile()) reset
them first.  So while the allocated memory was not released before,
it was reused.

Not sure if making write_loose_object() allocate and release these
buffers on every call has much of a performance impact.  The only
reason I can think of for wanting such a change is to get rid of the
static buffers, to allow the function to be used by concurrent
threads.

So I think either keeping the code as-is or also making the strbufs
non-static would be better (but then discussing a possible
performance impact in the commit message would be nice).
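
To spell out the two lifetimes being weighed here, a minimal sketch
using the strbuf API (both helpers are illustrative only):

	/* Reused across calls: no per-call allocation, not thread-safe. */
	static void with_static_buf(const char *s)
	{
		static struct strbuf buf = STRBUF_INIT;
		strbuf_reset(&buf);	/* length back to 0, allocation kept */
		strbuf_addstr(&buf, s);
	}

	/* Allocated and released per call: thread-friendly, reallocates. */
	static void with_local_buf(const char *s)
	{
		struct strbuf buf = STRBUF_INIT;
		strbuf_addstr(&buf, s);
		strbuf_release(&buf);	/* nothing leaks, nothing is reused */
	}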

> +	return ret;
>  }
>
>  static int freshen_loose_object(const struct object_id *oid)


^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data()
  2021-12-17 11:26       ` [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-17 21:22         ` René Scharfe
  0 siblings, 0 replies; 211+ messages in thread
From: René Scharfe @ 2021-12-17 21:22 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 17.12.21 um 12:26 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> In dry_run mode, "get_data()" is used to verify the inflation of data;
> the returned buffer will not be used at all and will be freed
> immediately. Even in dry_run mode, it is dangerous to allocate a
> full-size buffer for a large blob object. Therefore, allocate only a
> small buffer when calling "get_data()" in dry_run mode.

Clever.  Looks good to me.

For some reason I was expecting this patch to have some connection to
one of the earlier ones (perhaps because get_data() was mentioned),
but it is technically independent.

>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c | 23 +++++++++++++++++------
>  1 file changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..c4a17bdb44 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,21 @@ static void use(int bytes)
>  	display_throughput(progress, consumed_bytes);
>  }
>
> -static void *get_data(unsigned long size)
> +static void *get_data(unsigned long size, int dry_run)
>  {
>  	git_zstream stream;
> -	void *buf = xmallocz(size);
> +	unsigned long bufsize;
> +	void *buf;
>
>  	memset(&stream, 0, sizeof(stream));
> +	if (dry_run && size > 8192)
> +		bufsize = 8192;
> +	else
> +		bufsize = size;
> +	buf = xmallocz(bufsize);
>
>  	stream.next_out = buf;
> -	stream.avail_out = size;
> +	stream.avail_out = bufsize;
>  	stream.next_in = fill(1);
>  	stream.avail_in = len;
>  	git_inflate_init(&stream);
> @@ -124,6 +130,11 @@ static void *get_data(unsigned long size)
>  		}
>  		stream.next_in = fill(1);
>  		stream.avail_in = len;
> +		if (dry_run) {
> +			/* reuse the buffer in dry_run mode */
> +			stream.next_out = buf;
> +			stream.avail_out = bufsize;
> +		}
>  	}
>  	git_inflate_end(&stream);
>  	return buf;
> @@ -323,7 +334,7 @@ static void added_object(unsigned nr, enum object_type type,
>  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>  				   unsigned nr)
>  {
> -	void *buf = get_data(size);
> +	void *buf = get_data(size, dry_run);
>
>  	if (!dry_run && buf)
>  		write_object(nr, type, buf, size);
> @@ -357,7 +368,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  	if (type == OBJ_REF_DELTA) {
>  		oidread(&base_oid, fill(the_hash_algo->rawsz));
>  		use(the_hash_algo->rawsz);
> -		delta_data = get_data(delta_size);
> +		delta_data = get_data(delta_size, dry_run);
>  		if (dry_run || !delta_data) {
>  			free(delta_data);
>  			return;
> @@ -396,7 +407,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
>  			die("offset value out of bound for delta base object");
>
> -		delta_data = get_data(delta_size);
> +		delta_data = get_data(delta_size, dry_run);
>  		if (dry_run || !delta_data) {
>  			free(delta_data);
>  			return;


^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream
  2021-12-17 11:26       ` [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream Han Xin
@ 2021-12-17 22:52         ` René Scharfe
  0 siblings, 0 replies; 211+ messages in thread
From: René Scharfe @ 2021-12-17 22:52 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 17.12.21 um 12:26 schrieb Han Xin:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "stream_loose_object()" in a
> stream. The input stream is implemented as an interface.
>
> When streaming a large blob object to "write_loose_object()", we have no
> chance to run "write_object_file_prepare()" to calculate the oid in
> advance. So we need to handle an undetermined oid in a new function
> called "stream_loose_object()".
>
> In "write_loose_object()", we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object, so we have to save the temporary file in the ".git/objects/"
> directory instead.
>
> We will reuse "write_object_file_flags()" in "unpack_non_delta_entry()"
> to read the entire data contents in a stream, so a new flag
> "HASH_STREAM" is added. When reading in a stream, we needn't prepare
> the "oid" before "write_loose_object()"; we only generate the header.
> "freshen_packed_object()" or "freshen_loose_object()" will be called
> inside "stream_loose_object()" after the "oid" has been obtained.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  cache.h        |  1 +
>  object-file.c  | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  object-store.h |  5 +++
>  3 files changed, 98 insertions(+)
>
> diff --git a/cache.h b/cache.h
> index cfba463aa9..6d68fd10a3 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -898,6 +898,7 @@ int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
>  #define HASH_FORMAT_CHECK 2
>  #define HASH_RENORMALIZE  4
>  #define HASH_SILENT 8
> +#define HASH_STREAM 16
>  int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
>  int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
>
> diff --git a/object-file.c b/object-file.c
> index dd29e5372e..2ef1d4fb00 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1994,6 +1994,88 @@ static int freshen_packed_object(const struct object_id *oid)
>  	return 1;
>  }
>
> +static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
> +			       const struct input_stream *in_stream,
> +			       unsigned long len, time_t mtime, unsigned flags)
> +{
> +	int fd, ret, err = 0, flush = 0;
> +	unsigned char compressed[4096];
> +	git_zstream stream;
> +	git_hash_ctx c;
> +	struct object_id parano_oid;
> +	static struct strbuf tmp_file = STRBUF_INIT;
> +	static struct strbuf filename = STRBUF_INIT;

Note these static strbufs.

> +	int dirlen;
> +
> +	/* When oid is not determined, save tmp file to odb path. */
> +	strbuf_addf(&filename, "%s/", get_object_directory());
> +
> +	fd = create_tmpfile(&tmp_file, filename.buf, flags);
> +	if (fd < 0) {
> +		err = -1;
> +		goto cleanup;
> +	}
> +
> +	/* Set it up and write header */
> +	setup_stream_and_header(&stream, compressed, sizeof(compressed),
> +				&c, hdr, hdrlen);
> +
> +	/* Then the data itself.. */
> +	do {
> +		unsigned char *in0 = stream.next_in;
> +		if (!stream.avail_in) {
> +			const void *in = in_stream->read(in_stream, &stream.avail_in);
> +			stream.next_in = (void *)in;
> +			in0 = (unsigned char *)in;
> +			/* All data has been read. */
> +			if (len + hdrlen == stream.total_in + stream.avail_in)
> +				flush = Z_FINISH;
> +		}
> +		ret = git_deflate(&stream, flush);
> +		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> +		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> +			die(_("unable to write loose object file"));
> +		stream.next_out = compressed;
> +		stream.avail_out = sizeof(compressed);
> +	} while (ret == Z_OK || ret == Z_BUF_ERROR);
> +
> +	if (ret != Z_STREAM_END)
> +		die(_("unable to deflate new object streamingly (%d)"), ret);
> +	ret = git_deflate_end_gently(&stream);
> +	if (ret != Z_OK)
> +		die(_("deflateEnd on object streamingly failed (%d)"), ret);
> +	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +
> +	close_loose_object(fd);
> +
> +	oidcpy(oid, &parano_oid);
> +
> +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
> +		unlink_or_warn(tmp_file.buf);
> +		goto cleanup;
> +	}
> +
> +	loose_object_path(the_repository, &filename, oid);
> +
> +	/* We finally know the object path, and create the missing dir. */
> +	dirlen = directory_size(filename.buf);
> +	if (dirlen) {
> +		struct strbuf dir = STRBUF_INIT;
> +		strbuf_add(&dir, filename.buf, dirlen - 1);
> +
> +		if (mkdir_in_gitdir(dir.buf) < 0) {
> +			err = -1;
> +			goto cleanup;
> +		}
> +	}
> +
> +	err = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
> +cleanup:
> +	strbuf_release(&tmp_file);
> +	strbuf_release(&filename);

The static strbufs are released here.  That combination is strange --
why keep the variable values between calls by making them static, but
throw away the allocated buffers instead of reusing them?

Given that this function is only used for huge objects I think making
the strbufs non-static and releasing them is the best choice here.

> +	return err;
> +}
> +
>  int write_object_file_flags(const void *buf, unsigned long len,
>  			    const char *type, struct object_id *oid,
>  			    unsigned flags)
> @@ -2001,6 +2083,16 @@ int write_object_file_flags(const void *buf, unsigned long len,
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen = sizeof(hdr);
>
> +	/* When streaming a large blob object (marked as HASH_STREAM),
> +	 * we have no chance to run "write_object_file_prepare()" to
> +	 * calculate the "oid" in advance.  Call "stream_loose_object()"
> +	 * to write loose object in stream.
> +	 */
> +	if (flags & HASH_STREAM) {
> +		hdrlen = generate_object_header(hdr, hdrlen, type, len);
> +		return stream_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
> +	}

So stream_loose_object() is called by passing the flag HASH_STREAM to
write_object_file_flags() and passing a struct input_stream via its
buf pointer.  That's ... unconventional.  Certainly scary.  Why not
export stream_loose_object() and call it directly?  Demo patch below.

> +
>  	/* Normally if we have it in the pack then we do not bother writing
>  	 * it out into .git/objects/??/?{38} file.
>  	 */
> diff --git a/object-store.h b/object-store.h
> index 952efb6a4b..4040e2c40a 100644
> --- a/object-store.h
> +++ b/object-store.h
> @@ -34,6 +34,11 @@ struct object_directory {
>  	char *path;
>  };
>
> +struct input_stream {
> +	const void *(*read)(const struct input_stream *, unsigned long *len);
> +	void *data;
> +};
> +
>  KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
>  	struct object_directory *, 1, fspathhash, fspatheq)
>


diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 42e1033d85..07d186bd20 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -375,10 +375,8 @@ static void write_stream_blob(unsigned nr, unsigned long size)
 	data.zstream = &zstream;
 	git_inflate_init(&zstream);

-	if (write_object_file_flags(&in_stream, size,
-				    type_name(OBJ_BLOB),
-				    &obj_list[nr].oid,
-				    HASH_STREAM))
+	if (stream_loose_object(&in_stream, size, type_name(OBJ_BLOB), 0, 0,
+				&obj_list[nr].oid))
 		die(_("failed to write object in stream"));

 	if (zstream.total_out != size || data.status != Z_STREAM_END)
diff --git a/object-file.c b/object-file.c
index 2ef1d4fb00..0a6b65ab26 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1994,9 +1994,9 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }

-static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
-			       const struct input_stream *in_stream,
-			       unsigned long len, time_t mtime, unsigned flags)
+int stream_loose_object(struct input_stream *in_stream, unsigned long len,
+			const char *type, time_t mtime, unsigned flags,
+			struct object_id *oid)
 {
 	int fd, ret, err = 0, flush = 0;
 	unsigned char compressed[4096];
@@ -2006,6 +2006,10 @@ static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
 	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen = sizeof(hdr);
+
+	hdrlen = generate_object_header(hdr, hdrlen, type, len);

 	/* When oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
@@ -2083,16 +2087,6 @@ int write_object_file_flags(const void *buf, unsigned long len,
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);

-	/* When streaming a large blob object (marked as HASH_STREAM),
-	 * we have no chance to run "write_object_file_prepare()" to
-	 * calculate the "oid" in advance.  Call "stream_loose_object()"
-	 * to write loose object in stream.
-	 */
-	if (flags & HASH_STREAM) {
-		hdrlen = generate_object_header(hdr, hdrlen, type, len);
-		return stream_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
-	}
-
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
 	 */
diff --git a/object-store.h b/object-store.h
index 4040e2c40a..786b6435b1 100644
--- a/object-store.h
+++ b/object-store.h
@@ -237,6 +237,10 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }

+int stream_loose_object(struct input_stream *in_stream, unsigned long len,
+			const char *type, time_t mtime, unsigned flags,
+			struct object_id *oid);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object()
  2021-12-17 19:28         ` René Scharfe
@ 2021-12-18  0:09           ` Junio C Hamano
  0 siblings, 0 replies; 211+ messages in thread
From: Junio C Hamano @ 2021-12-18  0:09 UTC (permalink / raw)
  To: René Scharfe
  Cc: Han Xin, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee, Han Xin

René Scharfe <l.s.r@web.de> writes:

> There was no leak before.  Both strbufs are static and both functions
> they are passed to (loose_object_path() and create_tmpfile()) reset
> them first.  So while the allocated memory was not released before,
> it was reused.
>
> Not sure if making write_loose_object() allocate and release these
> buffers on every call has much of a performance impact.  The only
> reason I can think of for wanting such a change is to get rid of the
> static buffers, to allow the function to be used by concurrent
> threads.
>
> So I think either keeping the code as-is or also making the strbufs
> non-static would be better (but then discussing a possible
> performance impact in the commit message would be nice).

Makes sense.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-17 11:26       ` [PATCH v6 2/6] object-file.c: refactor object header generation into a function Han Xin
@ 2021-12-20 12:10         ` Ævar Arnfjörð Bjarmason
  2021-12-20 12:48           ` Philip Oakley
                             ` (2 more replies)
  0 siblings, 3 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-20 12:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jeff King, Philip Oakley, Derrick Stolee,
	Ævar Arnfjörð Bjarmason

Add a convenience function to wrap the xsnprintf() call that
generates loose object headers. This code was copy/pasted in various
parts of the codebase; let's define it in one place and re-use it from
there.

All but one of its callers have a valid "enum object_type" at hand;
only write_object_file_prepare() might need to deal with
"git hash-object --literally" and a potential garbage type. Let's have
the primary API use an "enum object_type", and define an *_extended()
function that can take an arbitrary "const char *" for the type.
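
As a usage sketch, assuming the API added below:

	char hdr[32];
	int hdrlen = format_loose_header(hdr, sizeof(hdr), OBJ_BLOB, 1234);
	/* hdr now holds "blob 1234" plus its trailing NUL; hdrlen == 10 */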

See [1] for the discussion that prompted this patch, i.e. new code in
object-file.c that wanted to copy/paste the xsnprintf() invocation.

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---

On Fri, Dec 17 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> There are 3 places where "xsnprintf" is used to generate the object
> header, and I originally planned to add a fourth in a later patch.
>
> According to Ævar Arnfjörð Bjarmason’s suggestion, although it's just
> one line, it's also code that's very central to git, so refactor them
> into a function, which will help readability later.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>

I came up with this after my comment on the earlier round suggesting
to factor out that header formatting. I don't know if this more
thorough approach is worth it or if you'd like to replace your change
with this one, but I'm just posting it here as an RFC.

 builtin/index-pack.c |  3 +--
 bulk-checkin.c       |  4 ++--
 cache.h              | 21 +++++++++++++++++++++
 http-push.c          |  2 +-
 object-file.c        | 14 +++++++++++---
 5 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c23d01de7dc..900c6539f68 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
 	int hdrlen;
 
 	if (!is_delta_type(type)) {
-		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
-				   type_name(type),(uintmax_t)size) + 1;
+		hdrlen = format_loose_header(hdr, sizeof(hdr), type, (uintmax_t)size);
 		the_hash_algo->init_fn(&c);
 		the_hash_algo->update_fn(&c, hdr, hdrlen);
 	} else
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 8785b2ac806..446dea7c516 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
-			       type_name(type), (uintmax_t)size) + 1;
+	header_len = format_loose_header((char *)obuf, sizeof(obuf),
+					 type, (uintmax_t)size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 
diff --git a/cache.h b/cache.h
index d5cafba17d4..ccece21a4a2 100644
--- a/cache.h
+++ b/cache.h
@@ -1309,6 +1309,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 						    unsigned long bufsiz,
 						    struct strbuf *hdrbuf);
 
+/**
+ * format_loose_header() is a thin wrapper around xsnprintf() that
+ * writes the initial "<type> <obj-len>" part of the loose object
+ * header. It returns the size that xsnprintf() returns + 1.
+ *
+ * The format_loose_header_extended() function allows for writing a
+ * type_name that's not one of the "enum object_type" types. This is
+ * used for "git hash-object --literally". Pass in OBJ_NONE as the
+ * type, and a non-NULL "type_str" to do that.
+ *
+ * format_loose_header() is a convenience wrapper for
+ * format_loose_header_extended().
+ */
+int format_loose_header_extended(char *str, size_t size, enum object_type type,
+				 const char *type_str, size_t objsize);
+static inline int format_loose_header(char *str, size_t size,
+				      enum object_type type, size_t objsize)
+{
+	return format_loose_header_extended(str, size, type, NULL, objsize);
+}
+
 /**
  * parse_loose_header() parses the starting "<type> <len>\0" of an
  * object. If it doesn't follow that format -1 is returned. To check
diff --git a/http-push.c b/http-push.c
index 3309aaf004a..d1a8619e0af 100644
--- a/http-push.c
+++ b/http-push.c
@@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
 	git_zstream stream;
 
 	unpacked = read_object_file(&request->obj->oid, &type, &len);
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_loose_header(hdr, sizeof(hdr), type, (uintmax_t)len);
 
 	/* Set it up */
 	git_deflate_init(&stream, zlib_compression_level);
diff --git a/object-file.c b/object-file.c
index eac67f6f5f9..d94609ee48d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1009,6 +1009,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+int format_loose_header_extended(char *str, size_t size, enum object_type type,
+				 const char *typestr, size_t objsize)
+{
+	const char *s = type == OBJ_NONE ? typestr : type_name(type);
+
+	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1037,7 +1045,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = format_loose_header(hdr, sizeof(hdr), obj_type, size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1737,7 +1745,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = format_loose_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2009,7 +2017,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_loose_header(hdr, sizeof(hdr), type, len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.1119.g606023410ba


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
@ 2021-12-20 12:48           ` Philip Oakley
  2021-12-20 22:25           ` Junio C Hamano
  2021-12-21 11:43           ` Han Xin
  2 siblings, 0 replies; 211+ messages in thread
From: Philip Oakley @ 2021-12-20 12:48 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason, git
  Cc: Junio C Hamano, Han Xin, Jeff King, Derrick Stolee,
	Johannes Schindelin

Hi Ævar,
(catching up after a week away, and noticed your patch today..)

On 20/12/2021 12:10, Ævar Arnfjörð Bjarmason wrote:
> Add a convenience function to wrap the xsnprintf() call that
> generates loose object headers. This code was copy/pasted in various
> parts of the codebase; let's define it in one place and re-use it from
> there.
>
> All but one caller of it has a valid "enum object_type" for us;
> it's only write_object_file_prepare() which might need to deal with
> "git hash-object --literally" and a potential garbage type. Let's have
> the primary API use an "enum object_type", and define an *_extended()
> function that can take an arbitrary "const char *" for the type.

I recently completed a PR in the Git for Windows build that is focused on
"git hash-object --literally" as a starter for LLP64 large file (>4GB)
compatibility.
(https://github.com/git-for-windows/git/pull/3533), which Dscho has
merged (cc'd).

I'm not sure that the `extended` version will work as expected across
the test suite, as multiple fake object types are tried, though I only
skimmed the patch.

I'd support the general thrust, but just wanted to synchronise any changes.

Philip
>
> See [1] for the discussion that prompted this patch, i.e. new code in
> object-file.c that wanted to copy/paste the xsnprintf() invocation.
>
> 1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>
> On Fri, Dec 17 2021, Han Xin wrote:
>
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>
>> There are 3 places where "xsnprintf" is used to generate the object
>> header, and I originally planned to add a fourth in a later patch.
>>
>> According to Ævar Arnfjörð Bjarmason’s suggestion, although it's just
>> one line, it's also code that's very central to git, so refactor them
>> into a function, which will help readability later.
>>
>> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> I came up with this after my comment on the earlier round suggesting
> to factor out that header formatting. I don't know if this more
> thorough approach is worth it or if you'd like to replace your change
> with this one, but just posting it here as an RFC.
>
>  builtin/index-pack.c |  3 +--
>  bulk-checkin.c       |  4 ++--
>  cache.h              | 21 +++++++++++++++++++++
>  http-push.c          |  2 +-
>  object-file.c        | 14 +++++++++++---
>  5 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index c23d01de7dc..900c6539f68 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
>  	int hdrlen;
>  
>  	if (!is_delta_type(type)) {
> -		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
> -				   type_name(type),(uintmax_t)size) + 1;
> +		hdrlen = format_loose_header(hdr, sizeof(hdr), type, (uintmax_t)size);
>  		the_hash_algo->init_fn(&c);
>  		the_hash_algo->update_fn(&c, hdr, hdrlen);
>  	} else
> diff --git a/bulk-checkin.c b/bulk-checkin.c
> index 8785b2ac806..446dea7c516 100644
> --- a/bulk-checkin.c
> +++ b/bulk-checkin.c
> @@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
>  	if (seekback == (off_t) -1)
>  		return error("cannot find the current offset");
>  
> -	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
> -			       type_name(type), (uintmax_t)size) + 1;
> +	header_len = format_loose_header((char *)obuf, sizeof(obuf),
> +					 type, (uintmax_t)size);
>  	the_hash_algo->init_fn(&ctx);
>  	the_hash_algo->update_fn(&ctx, obuf, header_len);
>  
> diff --git a/cache.h b/cache.h
> index d5cafba17d4..ccece21a4a2 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -1309,6 +1309,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
>  						    unsigned long bufsiz,
>  						    struct strbuf *hdrbuf);
>  
> +/**
> + * format_loose_header() is a thin wrapper around xsnprintf() that
> + * writes the initial "<type> <obj-len>" part of the loose object
> + * header. It returns what xsnprintf() returns, plus 1 for the NUL.
> + *
> + * The format_loose_header_extended() function allows for writing a
> + * type_name that's not one of the "enum object_type" types. This is
> + * used for "git hash-object --literally". Pass in OBJ_NONE as the
> + * type, and a non-NULL "type_str" to do that.
> + *
> + * format_loose_header() is a convenience wrapper for
> + * format_loose_header_extended().
> + */
> +int format_loose_header_extended(char *str, size_t size, enum object_type type,
> +				 const char *type_str, size_t objsize);
> +static inline int format_loose_header(char *str, size_t size,
> +				      enum object_type type, size_t objsize)
> +{
> +	return format_loose_header_extended(str, size, type, NULL, objsize);
> +}
> +
>  /**
>   * parse_loose_header() parses the starting "<type> <len>\0" of an
>   * object. If it doesn't follow that format -1 is returned. To check
> diff --git a/http-push.c b/http-push.c
> index 3309aaf004a..d1a8619e0af 100644
> --- a/http-push.c
> +++ b/http-push.c
> @@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
>  	git_zstream stream;
>  
>  	unpacked = read_object_file(&request->obj->oid, &type, &len);
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> +	hdrlen = format_loose_header(hdr, sizeof(hdr), type, (uintmax_t)len);
>  
>  	/* Set it up */
>  	git_deflate_init(&stream, zlib_compression_level);
> diff --git a/object-file.c b/object-file.c
> index eac67f6f5f9..d94609ee48d 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1009,6 +1009,14 @@ void *xmmap(void *start, size_t length,
>  	return ret;
>  }
>  
> +int format_loose_header_extended(char *str, size_t size, enum object_type type,
> +				 const char *typestr, size_t objsize)
> +{
> +	const char *s = type == OBJ_NONE ? typestr : type_name(type);
> +
> +	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
> +}
> +
>  /*
>   * With an in-core object data in "map", rehash it to make sure the
>   * object name actually matches "oid" to detect object corruption.
> @@ -1037,7 +1045,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
>  		return -1;
>  
>  	/* Generate the header */
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
> +	hdrlen = format_loose_header(hdr, sizeof(hdr), obj_type, size);
>  
>  	/* Sha1.. */
>  	r->hash_algo->init_fn(&c);
> @@ -1737,7 +1745,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
>  	git_hash_ctx c;
>  
>  	/* Generate the header */
> -	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
> +	*hdrlen = format_loose_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
>  
>  	/* Sha1.. */
>  	algo->init_fn(&c);
> @@ -2009,7 +2017,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	buf = read_object(the_repository, oid, &type, &len);
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> +	hdrlen = format_loose_header(hdr, sizeof(hdr), type, len);
>  	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
>  	free(buf);
>  


^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
  2021-12-20 12:48           ` Philip Oakley
@ 2021-12-20 22:25           ` Junio C Hamano
  2021-12-21  1:42             ` Ævar Arnfjörð Bjarmason
  2021-12-21 11:43           ` Han Xin
  2 siblings, 1 reply; 211+ messages in thread
From: Junio C Hamano @ 2021-12-20 22:25 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Han Xin, Jeff King, Philip Oakley, Derrick Stolee

Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:

> Add a convenience function to wrap the xsnprintf() call that
> generates loose object headers. This code was copy/pasted in various
> parts of the codebase; let's define it in one place and re-use it from
> there.
> ...
> +/**
> + * format_loose_header() is a thin wrapper around xsnprintf() that

The name should have "object" somewhere in it.  Not all readers can
be expected to know that you meant "loose" to be an acceptable
shorthand for "loose object".

That nit aside, I think it is a good idea to give people a common
helper function to call.  I am undecided if it is a good idea to
make it take enum or "const char *"; most everybody should be able
to say

	format_object_header(type_name(OBJ_COMMIT), ...)

just fine, so two variants might be overkill, just to allow 

	format_object_header(OBJ_COMMIT, ...)

and to forbid

	format_object_header("connit", ...)

I dunno.
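
For what it's worth, the enum variant does turn that last mistake into
a compile-time diagnostic instead of a runtime die(); a minimal
sketch, assuming the renamed function keeps the RFC's signature:

	char hdr[32];

	format_object_header(hdr, sizeof(hdr), OBJ_COMMIT, 123); /* fine */
	format_object_header(hdr, sizeof(hdr), "connit", 123);
	/* error: passing "const char *" where "enum object_type" is expected */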

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-20 22:25           ` Junio C Hamano
@ 2021-12-21  1:42             ` Ævar Arnfjörð Bjarmason
  2021-12-21  2:11               ` Junio C Hamano
  0 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21  1:42 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Han Xin, Jeff King, Philip Oakley, Derrick Stolee


On Mon, Dec 20 2021, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:
>
>> Add a convenience function to wrap the xsnprintf() call that
>> generates loose object headers. This code was copy/pasted in various
>> parts of the codebase; let's define it in one place and re-use it from
>> there.
>> ...
>> +/**
>> + * format_loose_header() is a thin wrapper around xsnprintf() that
>
> The name should have "object" somewhere in it.  Not all readers can
> be expected to know that you meant "loose" to be an acceptable short
> hand for "loose object".

*nod*

> That nit aside, I think it is a good idea to give people a common
> helper function to call.  I am undecided if it is a good idea to
> make it take enum or "const char *"; most everybody should be able
> to say
>
> 	format_object_header(type_name(OBJ_COMMIT), ...)
>
> just fine, so two variants might be overkill, just to allow 
>
> 	format_object_header(OBJ_COMMIT, ...)
>
> and to forbid
>
> 	format_object_header("connit", ...)
>
> I dunno.

Ultimately only a single API caller in hash-object.c really cares about
something other than the enum.

I've got some patches locally to convert e.g. write_object_file() to use
the enum, and it removes the need for some callers to convert enum to
char *, only to have other things convert it back.

So I think for any new APIs it makes sense to work towards sidelining
the hash-object.c --literally caller.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-21  1:42             ` Ævar Arnfjörð Bjarmason
@ 2021-12-21  2:11               ` Junio C Hamano
  2021-12-21  2:27                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 211+ messages in thread
From: Junio C Hamano @ 2021-12-21  2:11 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Han Xin, Jeff King, Philip Oakley, Derrick Stolee

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> I've got some patches locally to convert e.g. write_object_file() to use
> the enum, and it removes the need for some callers to convert enum to
> char *, only to have other things convert it back.
>
> So I think for any new APIs it makes sense to work towards sidelining
> the hash-object.c --literally caller.

Isn't your logic backwards, arguing "because I did something this way,
it makes sense to do it this way"?

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-21  2:11               ` Junio C Hamano
@ 2021-12-21  2:27                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21  2:27 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Han Xin, Jeff King, Philip Oakley, Derrick Stolee


On Mon, Dec 20 2021, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>> I've got some patches locally to convert e.g. write_object_file() to use
>> the enum, and it removes the need for some callers to convert enum to
>> char *, only to have other things convert it back.
>>
>> So I think for any new APIs it makes sense to work towards sidelining
>> the hash-object.c --literally caller.
>
> Your logic is backwards to argue "because I did something this way,
> it makes sense to do it this way"?

No, it's that if you look at the write_object_file() and
hash_object_file() callers in-tree now, many, including in object-file.c
itself, are taking an "enum object_type" only to convert it to a string,
and then we'll in turn sometimes convert that string back to the
"enum object_type" again at some lower level.

That API inconsistency dates back to at least Linus's a733cb606fe
(Change pack file format. Hopefully for the last time., 2005-06-28).

I'm just pointing out that I have local patches that prove that a lot of
back & forth is done for no good reason, and that this is one of the
codepaths that's tangentially involved. So it makes sense in this case to
make any new API take "enum object_type" as the primary interface.
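
The round-trip in question looks roughly like this; a minimal sketch
against the current signatures (store_blob() and lower_layer() are
made-up names):

	#include "cache.h"
	#include "object-store.h"

	/* the caller flattens the enum to a string ... */
	static int store_blob(const void *buf, unsigned long len,
			      struct object_id *oid)
	{
		return write_object_file(buf, len, type_name(OBJ_BLOB), oid);
	}

	/* ... and a lower layer parses the string right back */
	static enum object_type lower_layer(const char *type)
	{
		return type_from_string(type); /* OBJ_BLOB again */
	}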

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file API: add a format_loose_header() function
  2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
  2021-12-20 12:48           ` Philip Oakley
  2021-12-20 22:25           ` Junio C Hamano
@ 2021-12-21 11:43           ` Han Xin
  2 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-21 11:43 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Git List, Junio C Hamano, Jeff King, Philip Oakley,
	Derrick Stolee

On Mon, Dec 20, 2021 at 8:10 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
> I came up with this after my comment on the earlier round suggesting
> to factor out that header formatting. I don't know if this more
> thorough approach is worth it or if you'd like to replace your change
> with this one, but just posting it here as an RFC.
>

I will take this patch and rename the function from
"format_loose_header()" to "format_object_header()".

Thanks
-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v7 0/5] unpack large blobs in stream
  2021-12-17 11:26       ` Han Xin
@ 2021-12-21 11:51         ` Han Xin
  2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
                           ` (11 subsequent siblings)
  12 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2021-12-21 11:51 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v6:
* Remove "object-file.c: release strbuf in write_loose_object()" which is not
  needed anymore. Thanks to René Scharfe[1] for reporting this.

* Reorder the patch series and put "unpack-objects.c: add dry_run mode for get_data()"
  and its testcases to the front.

* Replace "refactor object header generation into a function" with
  "object-file API: add a format_object_header() function" sugguested by
  Ævar Arnfjörð Bjarmason[2].

* Export "write_stream_object_file()" instead of "reusing write_object_file_flags()"
  sugguested by René Scharfe[3]. The new flag "HASH_STREAM" has been removed.

* Fix the directory creation error and the "strbuf dir" leak in
  "write_stream_object_file()".

* Change "unsigned long size" to "size_t size" in "write_stream_blob()" and
  "get_data()" in "unpack-objects.c".

1. https://lore.kernel.org/git/c860c56f-ce25-4391-7f65-50c9d5d80c2c@web.de/
2. https://lore.kernel.org/git/RFC-patch-1.1-bda62567f6b-20211220T120740Z-avarab@gmail.com/
3. https://lore.kernel.org/git/e959e4f1-7500-5f6b-5bd2-2f060287eeff@web.de/

Han Xin (4):
  unpack-objects.c: add dry_run mode for get_data()
  object-file.c: refactor write_loose_object() to reuse in stream
    version
  object-file.c: add "write_stream_object_file()" to support read in
    stream
  unpack-objects: unpack_non_delta_entry() read data in a stream

Ævar Arnfjörð Bjarmason (1):
  object-file API: add a format_object_header() function

 Documentation/config/core.txt       |  11 ++
 builtin/index-pack.c                |   3 +-
 builtin/unpack-objects.c            |  94 ++++++++++++-
 bulk-checkin.c                      |   4 +-
 cache.h                             |  22 +++
 config.c                            |   5 +
 environment.c                       |   1 +
 http-push.c                         |   2 +-
 object-file.c                       | 199 ++++++++++++++++++++++------
 object-store.h                      |   9 ++
 t/t5590-unpack-non-delta-objects.sh |  91 +++++++++++++
 11 files changed, 392 insertions(+), 49 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

Range-diff against v6:
1:  59d35dac5f < -:  ---------- object-file.c: release strbuf in write_loose_object()
2:  2174a6cbad < -:  ---------- object-file.c: refactor object header generation into a function
5:  1acbb6e849 ! 1:  a8f232f553 unpack-objects.c: add dry_run mode for get_data()
    @@ builtin/unpack-objects.c: static void use(int bytes)
      }
      
     -static void *get_data(unsigned long size)
    -+static void *get_data(unsigned long size, int dry_run)
    ++static void *get_data(size_t size, int dry_run)
      {
      	git_zstream stream;
     -	void *buf = xmallocz(size);
    -+	unsigned long bufsize;
    ++	size_t bufsize;
     +	void *buf;
      
      	memset(&stream, 0, sizeof(stream));
    @@ builtin/unpack-objects.c: static void unpack_delta_entry(enum object_type type,
      		if (dry_run || !delta_data) {
      			free(delta_data);
      			return;
    +
    + ## t/t5590-unpack-non-delta-objects.sh (new) ##
    +@@
    ++#!/bin/sh
    ++#
    ++# Copyright (c) 2021 Han Xin
    ++#
    ++
    ++test_description='Test unpack-objects with non-delta objects'
    ++
    ++GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
    ++export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
    ++
    ++. ./test-lib.sh
    ++
    ++prepare_dest () {
    ++	test_when_finished "rm -rf dest.git" &&
    ++	git init --bare dest.git
    ++}
    ++
    ++test_expect_success "setup repo with big blobs (1.5 MB)" '
    ++	test-tool genrandom foo 1500000 >big-blob &&
    ++	test_commit --append foo big-blob &&
    ++	test-tool genrandom bar 1500000 >big-blob &&
    ++	test_commit --append bar big-blob &&
    ++	(
    ++		cd .git &&
    ++		find objects/?? -type f | sort
    ++	) >expect &&
    ++	PACK=$(echo main | git pack-objects --revs test)
    ++'
    ++
    ++test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
    ++	GIT_ALLOC_LIMIT=1m &&
    ++	export GIT_ALLOC_LIMIT
    ++'
    ++
    ++test_expect_success 'fail to unpack-objects: cannot allocate' '
    ++	prepare_dest &&
    ++	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    ++	grep "fatal: attempting to allocate" err &&
    ++	(
    ++		cd dest.git &&
    ++		find objects/?? -type f | sort
    ++	) >actual &&
    ++	test_file_not_empty actual &&
    ++	! test_cmp expect actual
    ++'
    ++
    ++test_expect_success 'unpack-objects dry-run' '
    ++	prepare_dest &&
    ++	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    ++	(
    ++		cd dest.git &&
    ++		find objects/ -type f
    ++	) >actual &&
    ++	test_must_be_empty actual
    ++'
    ++
    ++test_done
-:  ---------- > 2:  0d2e0f3a00 object-file API: add a format_object_header() function
3:  8a704ecc59 ! 3:  a571b8f16c object-file.c: refactor write_loose_object() to reuse in stream version
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	loose_object_path(the_repository, &filename, oid);
      
     -	fd = create_tmpfile(&tmp_file, filename.buf);
    -+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
    - 	if (fd < 0) {
    +-	if (fd < 0) {
     -		if (flags & HASH_SILENT)
    --			ret = -1;
    +-			return -1;
     -		else if (errno == EACCES)
    --			ret = error(_("insufficient permission for adding an "
    --				      "object to repository database %s"),
    --				    get_object_directory());
    +-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
     -		else
    --			ret = error_errno(_("unable to create temporary file"));
    -+		ret = -1;
    - 		goto cleanup;
    - 	}
    - 
    +-			return error_errno(_("unable to create temporary file"));
    +-	}
    +-
     -	/* Set it up */
     -	git_deflate_init(&stream, zlib_compression_level);
     -	stream.next_out = compressed;
     -	stream.avail_out = sizeof(compressed);
     -	the_hash_algo->init_fn(&c);
    --
    ++	fd = create_tmpfile(&tmp_file, filename.buf, flags);
    ++	if (fd < 0)
    ++		return -1;
    + 
     -	/* First header.. */
     -	stream.next_in = (unsigned char *)hdr;
     -	stream.avail_in = hdrlen;
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -			warning_errno(_("failed utime() on %s"), tmp_file.buf);
     -	}
     -
    --	ret = finalize_object_file(tmp_file.buf, filename.buf);
    -+	ret = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
    - cleanup:
    - 	strbuf_release(&filename);
    - 	strbuf_release(&tmp_file);
    +-	return finalize_object_file(tmp_file.buf, filename.buf);
    ++	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
    ++					       mtime, flags);
    + }
    + 
    + static int freshen_loose_object(const struct object_id *oid)
4:  96f05632a2 ! 4:  1de06a8f5c object-file.c: make "write_object_file_flags()" to support read in stream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: make "write_object_file_flags()" to support read in stream
    +    object-file.c: add "write_stream_object_file()" to support read in stream
     
         We used to call "get_data()" in "unpack_non_delta_entry()" to read the
         entire contents of a blob object, no matter how big it is. This
         implementation may consume all the memory and cause OOM.
     
    -    This can be improved by feeding data to "stream_loose_object()" in a
    -    stream. The input stream is implemented as an interface.
    -
    -    When streaming a large blob object to "write_loose_object()", we have no
    -    chance to run "write_object_file_prepare()" to calculate the oid in
    -    advance. So we need to handle undetermined oid in a new function called
    -    "stream_loose_object()".
    +    This can be improved by feeding data to "write_stream_object_file()"
    +    in a stream. The input stream is implemented as an interface.
     
    +    The difference with "write_loose_object()" is that we have no chance
    +    to run "write_object_file_prepare()" to calculate the oid in advance.
         In "write_loose_object()", we know the oid and we can write the
         temporary file in the same directory as the final object, but for an
         object with an undetermined oid, we don't know the exact directory for
         the object, so we have to save the temporary file in ".git/objects/"
         directory instead.
     
    -    We will reuse "write_object_file_flags()" in "unpack_non_delta_entry()" to
    -    read the entire data contents in stream, so a new flag "HASH_STREAM" is
    -    added. When read in stream, we needn't prepare the "oid" before
    -    "write_loose_object()", only generate the header.
         "freshen_packed_object()" or "freshen_loose_object()" will be called
    -    inside "stream_loose_object()" after obtaining the "oid".
    +    inside "write_stream_object_file()" after obtaining the "oid".
     
    +    Helped-by: René Scharfe <l.s.r@web.de>
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    - ## cache.h ##
    -@@ cache.h: int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
    - #define HASH_FORMAT_CHECK 2
    - #define HASH_RENORMALIZE  4
    - #define HASH_SILENT 8
    -+#define HASH_STREAM 16
    - int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
    - int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
    - 
    -
      ## object-file.c ##
     @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
      	return 1;
      }
      
    -+static int stream_loose_object(struct object_id *oid, char *hdr, int hdrlen,
    -+			       const struct input_stream *in_stream,
    -+			       unsigned long len, time_t mtime, unsigned flags)
    ++int write_stream_object_file(struct input_stream *in_stream, size_t len,
    ++			     enum object_type type, time_t mtime,
    ++			     unsigned flags, struct object_id *oid)
     +{
    -+	int fd, ret, err = 0, flush = 0;
    ++	int fd, ret, flush = 0;
     +	unsigned char compressed[4096];
     +	git_zstream stream;
     +	git_hash_ctx c;
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	static struct strbuf tmp_file = STRBUF_INIT;
     +	static struct strbuf filename = STRBUF_INIT;
     +	int dirlen;
    ++	char hdr[MAX_HEADER_LEN];
    ++	int hdrlen = sizeof(hdr);
     +
    ++	/* Since "filename" is defined as static, it will be reused. So reset it
    ++	 * first before using it. */
    ++	strbuf_reset(&filename);
     +	/* When oid is not determined, save tmp file to odb path. */
     +	strbuf_addf(&filename, "%s/", get_object_directory());
     +
     +	fd = create_tmpfile(&tmp_file, filename.buf, flags);
    -+	if (fd < 0) {
    -+		err = -1;
    -+		goto cleanup;
    -+	}
    ++	if (fd < 0)
    ++		return -1;
    ++
    ++	hdrlen = format_object_header(hdr, hdrlen, type, len);
     +
     +	/* Set it up and write header */
     +	setup_stream_and_header(&stream, compressed, sizeof(compressed),
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +
     +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
     +		unlink_or_warn(tmp_file.buf);
    -+		goto cleanup;
    ++		return 0;
     +	}
     +
     +	loose_object_path(the_repository, &filename, oid);
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +		struct strbuf dir = STRBUF_INIT;
     +		strbuf_add(&dir, filename.buf, dirlen - 1);
     +
    -+		if (mkdir_in_gitdir(dir.buf) < 0) {
    -+			err = -1;
    -+			goto cleanup;
    ++		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
    ++			ret = error_errno(_("unable to create directory %s"), dir.buf);
    ++			strbuf_release(&dir);
    ++			return ret;
     +		}
    ++		strbuf_release(&dir);
     +	}
     +
    -+	err = finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
    -+cleanup:
    -+	strbuf_release(&tmp_file);
    -+	strbuf_release(&filename);
    -+	return err;
    ++	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
     +}
     +
      int write_object_file_flags(const void *buf, unsigned long len,
      			    const char *type, struct object_id *oid,
      			    unsigned flags)
    -@@ object-file.c: int write_object_file_flags(const void *buf, unsigned long len,
    - 	char hdr[MAX_HEADER_LEN];
    - 	int hdrlen = sizeof(hdr);
    - 
    -+	/* When streaming a large blob object (marked as HASH_STREAM),
    -+	 * we have no chance to run "write_object_file_prepare()" to
    -+	 * calculate the "oid" in advance.  Call "stream_loose_object()"
    -+	 * to write loose object in stream.
    -+	 */
    -+	if (flags & HASH_STREAM) {
    -+		hdrlen = generate_object_header(hdr, hdrlen, type, len);
    -+		return stream_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
    -+	}
    -+
    - 	/* Normally if we have it in the pack then we do not bother writing
    - 	 * it out into .git/objects/??/?{38} file.
    - 	 */
     
      ## object-store.h ##
     @@ object-store.h: struct object_directory {
    @@ object-store.h: struct object_directory {
      };
      
     +struct input_stream {
    -+	const void *(*read)(const struct input_stream *, unsigned long *len);
    ++	const void *(*read)(struct input_stream *, unsigned long *len);
     +	void *data;
     +};
     +
      KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
      	struct object_directory *, 1, fspathhash, fspatheq)
      
    +@@ object-store.h: static inline int write_object_file(const void *buf, unsigned long len,
    + 	return write_object_file_flags(buf, len, type, oid, 0);
    + }
    + 
    ++int write_stream_object_file(struct input_stream *in_stream, size_t len,
    ++			     enum object_type type, time_t mtime,
    ++			     unsigned flags, struct object_id *oid);
    ++
    + int hash_object_file_literally(const void *buf, unsigned long len,
    + 			       const char *type, struct object_id *oid,
    + 			       unsigned flags);
6:  476aaba527 ! 5:  e7b4e426ef unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	int status;
     +};
     +
    -+static const void *feed_input_zstream(const struct input_stream *in_stream,
    ++static const void *feed_input_zstream(struct input_stream *in_stream,
     +				      unsigned long *readlen)
     +{
     +	struct input_zstream_data *data = in_stream->data;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	return data->buf;
     +}
     +
    -+static void write_stream_blob(unsigned nr, unsigned long size)
    ++static void write_stream_blob(unsigned nr, size_t size)
     +{
     +	git_zstream zstream;
     +	struct input_zstream_data data;
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	data.zstream = &zstream;
     +	git_inflate_init(&zstream);
     +
    -+	if (write_object_file_flags(&in_stream, size,
    -+				    type_name(OBJ_BLOB),
    -+				    &obj_list[nr].oid,
    -+				    HASH_STREAM))
    ++	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
    ++				     &obj_list[nr].oid))
     +		die(_("failed to write object in stream"));
     +
     +	if (zstream.total_out != size || data.status != Z_STREAM_END)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	git_inflate_end(&zstream);
     +
     +	if (strict) {
    -+		struct blob *blob = lookup_blob(the_repository, &obj_list[nr].oid);
    ++		struct blob *blob =
    ++			lookup_blob(the_repository, &obj_list[nr].oid);
     +		if (blob)
     +			blob->object.flags |= FLAG_WRITTEN;
     +		else
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	void *buf;
     +
     +	/* Write large blob in stream without allocating full buffer. */
    -+	if (!dry_run && type == OBJ_BLOB && size > big_file_streaming_threshold) {
    ++	if (!dry_run && type == OBJ_BLOB &&
    ++	    size > big_file_streaming_threshold) {
     +		write_stream_blob(nr, size);
     +		return;
     +	}
    @@ environment.c: size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
      const char *editor_program;
      const char *askpass_program;
     
    - ## t/t5590-unpack-non-delta-objects.sh (new) ##
    -@@
    -+#!/bin/sh
    -+#
    -+# Copyright (c) 2021 Han Xin
    -+#
    -+
    -+test_description='Test unpack-objects when receive pack'
    -+
    -+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
    -+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
    -+
    -+. ./test-lib.sh
    -+
    -+prepare_dest () {
    -+	test_when_finished "rm -rf dest.git" &&
    -+	git init --bare dest.git &&
    -+	git -C dest.git config core.bigFileStreamingThreshold $1 &&
    -+	git -C dest.git config core.bigFileThreshold $1
    -+}
    -+
    -+test_expect_success "setup repo with big blobs (1.5 MB)" '
    -+	test-tool genrandom foo 1500000 >big-blob &&
    -+	test_commit --append foo big-blob &&
    -+	test-tool genrandom bar 1500000 >big-blob &&
    -+	test_commit --append bar big-blob &&
    -+	(
    -+		cd .git &&
    -+		find objects/?? -type f | sort
    -+	) >expect &&
    -+	PACK=$(echo main | git pack-objects --revs test)
    -+'
    -+
    -+test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
    -+	GIT_ALLOC_LIMIT=1m &&
    -+	export GIT_ALLOC_LIMIT
    -+'
    -+
    -+test_expect_success 'fail to unpack-objects: cannot allocate' '
    + ## t/t5590-unpack-non-delta-objects.sh ##
    +@@ t/t5590-unpack-non-delta-objects.sh: export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
    + prepare_dest () {
    + 	test_when_finished "rm -rf dest.git" &&
    + 	git init --bare dest.git
    ++	if test -n "$1"
    ++	then
    ++		git -C dest.git config core.bigFileStreamingThreshold $1
    ++		git -C dest.git config core.bigFileThreshold $1
    ++	fi
    + }
    + 
    + test_expect_success "setup repo with big blobs (1.5 MB)" '
    +@@ t/t5590-unpack-non-delta-objects.sh: test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
    + '
    + 
    + test_expect_success 'fail to unpack-objects: cannot allocate' '
    +-	prepare_dest &&
     +	prepare_dest 2m &&
    -+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    -+	grep "fatal: attempting to allocate" err &&
    -+	(
    -+		cd dest.git &&
    -+		find objects/?? -type f | sort
    -+	) >actual &&
    -+	test_file_not_empty actual &&
    -+	! test_cmp expect actual
    -+'
    -+
    + 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    + 	grep "fatal: attempting to allocate" err &&
    + 	(
    +@@ t/t5590-unpack-non-delta-objects.sh: test_expect_success 'fail to unpack-objects: cannot allocate' '
    + 	! test_cmp expect actual
    + '
    + 
     +test_expect_success 'unpack big object in stream' '
     +	prepare_dest 1m &&
    ++	mkdir -p dest.git/objects/05 &&
     +	git -C dest.git unpack-objects <test-$PACK.pack &&
     +	git -C dest.git fsck &&
     +	(
    @@ t/t5590-unpack-non-delta-objects.sh (new)
     +	test_must_be_empty actual
     +'
     +
    -+test_expect_success 'unpack-objects dry-run' '
    -+	prepare_dest 1m &&
    -+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    -+	(
    -+		cd dest.git &&
    -+		find objects/ -type f
    -+	) >actual &&
    -+	test_must_be_empty actual
    -+'
    -+
    -+test_done
    + test_expect_success 'unpack-objects dry-run' '
    + 	prepare_dest &&
    + 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-17 11:26       ` Han Xin
  2021-12-21 11:51         ` [PATCH v7 0/5] " Han Xin
@ 2021-12-21 11:51         ` Han Xin
  2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
  2021-12-31  3:06           ` Jiang Xin
  2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
                           ` (10 subsequent siblings)
  12 siblings, 2 replies; 211+ messages in thread
From: Han Xin @ 2021-12-21 11:51 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

In dry_run mode, "get_data()" is used to verify the inflation of data,
and the returned buffer will not be used at all and will be freed
immediately. Even in dry_run mode, it is dangerous to allocate a
full-size buffer for a large blob object. Therefore, only allocate a
low memory footprint when calling "get_data()" in dry_run mode.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c            | 23 +++++++++---
 t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 6 deletions(-)
 create mode 100755 t/t5590-unpack-non-delta-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..9104eb48da 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,21 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
-static void *get_data(unsigned long size)
+static void *get_data(size_t size, int dry_run)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	size_t bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,6 +130,11 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
 	return buf;
@@ -323,7 +334,7 @@ static void added_object(unsigned nr, enum object_type type,
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf = get_data(size, dry_run);
 
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
@@ -357,7 +368,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	if (type == OBJ_REF_DELTA) {
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
@@ -396,7 +407,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 			die("offset value out of bound for delta base object");
 
-		delta_data = get_data(delta_size);
+		delta_data = get_data(delta_size, dry_run);
 		if (dry_run || !delta_data) {
 			free(delta_data);
 			return;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
new file mode 100755
index 0000000000..48c4fb1ba3
--- /dev/null
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='Test unpack-objects with non-delta objects'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "setup repo with big blobs (1.5 MB)" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	(
+		cd .git &&
+		find objects/?? -type f | sort
+	) >expect &&
+	PACK=$(echo main | git pack-objects --revs test)
+'
+
+test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'fail to unpack-objects: cannot allocate' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_file_not_empty actual &&
+	! test_cmp expect actual
+'
+
+test_expect_success 'unpack-objects dry-run' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/ -type f
+	) >actual &&
+	test_must_be_empty actual
+'
+
+test_done
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v7 2/5] object-file API: add a format_object_header() function
  2021-12-17 11:26       ` Han Xin
  2021-12-21 11:51         ` [PATCH v7 0/5] " Han Xin
  2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-21 11:51         ` Han Xin
  2021-12-21 14:30           ` René Scharfe
  2021-12-31  3:12           ` [PATCH v7 2/5] object-file API: add a format_object_header() function Jiang Xin
  2021-12-21 11:51         ` [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
                           ` (9 subsequent siblings)
  12 siblings, 2 replies; 211+ messages in thread
From: Han Xin @ 2021-12-21 11:51 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Add a convenience function to wrap the xsnprintf() call that
generates loose object headers. This code was copy/pasted in various
parts of the codebase; let's define it in one place and re-use it from
there.

All but one caller of it has a valid "enum object_type" for us;
it's only write_object_file_prepare() which might need to deal with
"git hash-object --literally" and a potential garbage type. Let's have
the primary API use an "enum object_type", and define an *_extended()
function that can take an arbitrary "const char *" for the type.

See [1] for the discussion that prompted this patch, i.e. new code in
object-file.c that wanted to copy/paste the xsnprintf() invocation.

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
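
In use the pair looks like this (illustrative values; the buffer is
sized like the existing callers):

	char hdr[32]; /* MAX_HEADER_LEN */
	int hdrlen;

	/* the common case: a known type */
	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, 123);
	/* hdr now holds "blob 123\0" and hdrlen == 9 */

	/* hash-object --literally: an arbitrary type string */
	hdrlen = format_object_header_extended(hdr, sizeof(hdr), OBJ_NONE,
					       "garbage", 123);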

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/index-pack.c |  3 +--
 bulk-checkin.c       |  4 ++--
 cache.h              | 21 +++++++++++++++++++++
 http-push.c          |  2 +-
 object-file.c        | 14 +++++++++++---
 5 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c23d01de7d..4a765ddae6 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
 	int hdrlen;
 
 	if (!is_delta_type(type)) {
-		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
-				   type_name(type),(uintmax_t)size) + 1;
+		hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)size);
 		the_hash_algo->init_fn(&c);
 		the_hash_algo->update_fn(&c, hdr, hdrlen);
 	} else
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 8785b2ac80..1733a1de4f 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
-			       type_name(type), (uintmax_t)size) + 1;
+	header_len = format_object_header((char *)obuf, sizeof(obuf),
+					 type, (uintmax_t)size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 
diff --git a/cache.h b/cache.h
index cfba463aa9..64071a8d80 100644
--- a/cache.h
+++ b/cache.h
@@ -1310,6 +1310,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 						    unsigned long bufsiz,
 						    struct strbuf *hdrbuf);
 
+/**
+ * format_object_header() is a thin wrapper around xsnprintf() that
+ * writes the initial "<type> <obj-len>" part of the loose object
+ * header. It returns what xsnprintf() returns, plus 1 for the NUL.
+ *
+ * The format_object_header_extended() function allows for writing a
+ * type_name that's not one of the "enum object_type" types. This is
+ * used for "git hash-object --literally". Pass in OBJ_NONE as the
+ * type, and a non-NULL "type_str" to do that.
+ *
+ * format_object_header() is a convenience wrapper for
+ * format_object_header_extended().
+ */
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *type_str, size_t objsize);
+static inline int format_object_header(char *str, size_t size,
+				      enum object_type type, size_t objsize)
+{
+	return format_object_header_extended(str, size, type, NULL, objsize);
+}
+
 /**
  * parse_loose_header() parses the starting "<type> <len>\0" of an
  * object. If it doesn't follow that format -1 is returned. To check
diff --git a/http-push.c b/http-push.c
index 3309aaf004..f55e316ff4 100644
--- a/http-push.c
+++ b/http-push.c
@@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
 	git_zstream stream;
 
 	unpacked = read_object_file(&request->obj->oid, &type, &len);
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)len);
 
 	/* Set it up */
 	git_deflate_init(&stream, zlib_compression_level);
diff --git a/object-file.c b/object-file.c
index eb1426f98c..6bba4766f9 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *typestr, size_t objsize)
+{
+	const char *s = type == OBJ_NONE ? typestr : type_name(type);
+
+	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = format_object_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2006,7 +2014,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version
  2021-12-17 11:26       ` Han Xin
                           ` (2 preceding siblings ...)
  2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
@ 2021-12-21 11:51         ` Han Xin
  2021-12-21 14:16           ` Ævar Arnfjörð Bjarmason
  2021-12-21 11:52         ` [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream Han Xin
                           ` (8 subsequent siblings)
  12 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-21 11:51 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "stream_loose_object()" in
stream instead of read into the whole buf.

As this new method "stream_loose_object()" has many similarities with
"write_loose_object()", we split up "write_loose_object()" into some
steps:
 1. Figuring out a path for the (temp) object file.
 2. Creating the tempfile.
 3. Setting up zlib and write header.
 4. Write object data and handle errors.
 5. Optionally, do someting after write, maybe force a loose object if
"mtime".

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 100 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 35 deletions(-)

diff --git a/object-file.c b/object-file.c
index 6bba4766f9..e048f3d39e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1751,6 +1751,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	algo->final_oid_fn(oid, &c);
 }
 
+/*
+ * Move the just written object with proper mtime into its final resting place.
+ */
+static int finalize_object_file_with_mtime(const char *tmpfile,
+					   const char *filename,
+					   time_t mtime,
+					   unsigned flags)
+{
+	struct utimbuf utb;
+
+	if (mtime) {
+		utb.actime = mtime;
+		utb.modtime = mtime;
+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
+			warning_errno(_("failed utime() on %s"), tmpfile);
+	}
+	return finalize_object_file(tmpfile, filename);
+}
+
 /*
  * Move the just written object into its final resting place.
  */
@@ -1836,7 +1855,8 @@ static inline int directory_size(const char *filename)
  * We want to avoid cross-directory filename renames, because those
  * can have problems on various filesystems (FAT, NFS, Coda).
  */
-static int create_tmpfile(struct strbuf *tmp, const char *filename)
+static int create_tmpfile(struct strbuf *tmp, const char *filename,
+			  unsigned flags)
 {
 	int fd, dirlen = directory_size(filename);
 
@@ -1844,7 +1864,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	strbuf_add(tmp, filename, dirlen);
 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
-	if (fd < 0 && dirlen && errno == ENOENT) {
+	do {
+		if (fd >= 0 || !dirlen || errno != ENOENT)
+			break;
 		/*
 		 * Make sure the directory exists; note that the contents
 		 * of the buffer are undefined after mkstemp returns an
@@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 		strbuf_reset(tmp);
 		strbuf_add(tmp, filename, dirlen - 1);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
-			return -1;
+			break;
 		if (adjust_shared_perm(tmp->buf))
-			return -1;
+			break;
 
 		/* Try again */
 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 		fd = git_mkstemp_mode(tmp->buf, 0444);
+	} while (0);
+
+	if (fd < 0 && !(flags & HASH_SILENT)) {
+		if (errno == EACCES)
+			return error(_("insufficient permission for adding an "
+				       "object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(_("unable to create temporary file"));
 	}
+
 	return fd;
 }
 
+static void setup_stream_and_header(git_zstream *stream,
+				    unsigned char *compressed,
+				    unsigned long compressed_size,
+				    git_hash_ctx *c,
+				    char *hdr,
+				    int hdrlen)
+{
+	/* Set it up */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = compressed;
+	stream->avail_out = compressed_size;
+	the_hash_algo->init_fn(c);
+
+	/* First header.. */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1879,28 +1932,13 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	if (fd < 0)
+		return -1;
 
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1929,16 +1967,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	close_loose_object(fd);
 
-	if (mtime) {
-		struct utimbuf utb;
-		utb.actime = mtime;
-		utb.modtime = mtime;
-		if (utime(tmp_file.buf, &utb) < 0 &&
-		    !(flags & HASH_SILENT))
-			warning_errno(_("failed utime() on %s"), tmp_file.buf);
-	}
-
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
+					       mtime, flags);
 }
 
 static int freshen_loose_object(const struct object_id *oid)
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream
  2021-12-17 11:26       ` Han Xin
                           ` (3 preceding siblings ...)
  2021-12-21 11:51         ` [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
@ 2021-12-21 11:52         ` Han Xin
  2021-12-21 14:20           ` Ævar Arnfjörð Bjarmason
  2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
                           ` (7 subsequent siblings)
  12 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2021-12-21 11:52 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_stream_object_file()"
in a stream. The input stream is implemented as an interface.

The difference with "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in ".git/objects/"
directory instead.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "write_stream_object_file()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |  9 ++++++
 2 files changed, 94 insertions(+)

diff --git a/object-file.c b/object-file.c
index e048f3d39e..d0573e2a61 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1989,6 +1989,91 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int write_stream_object_file(struct input_stream *in_stream, size_t len,
+			     enum object_type type, time_t mtime,
+			     unsigned flags, struct object_id *oid)
+{
+	int fd, ret, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct object_id parano_oid;
+	static struct strbuf tmp_file = STRBUF_INIT;
+	static struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen = sizeof(hdr);
+
+	/* Since "filename" is defined as static, it will be reused. So reset it
+	 * first before using it. */
+	strbuf_reset(&filename);
+	/* When oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	if (fd < 0)
+		return -1;
+
+	hdrlen = format_object_header(hdr, hdrlen, type, len);
+
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (len + hdrlen == stream.total_in + stream.avail_in)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (ret != Z_STREAM_END)
+		die(_("unable to deflate new object streamingly (%d)"), ret);
+	ret = git_deflate_end_gently(&stream);
+	if (ret != Z_OK)
+		die(_("deflateEnd on object streamingly failed (%d)"), ret);
+	the_hash_algo->final_oid_fn(&parano_oid, &c);
+
+	close_loose_object(fd);
+
+	oidcpy(oid, &parano_oid);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		return 0;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen - 1);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			ret = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			return ret;
+		}
+		strbuf_release(&dir);
+	}
+
+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..061b0cb2ba 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,11 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -232,6 +237,10 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }
 
+int write_stream_object_file(struct input_stream *in_stream, size_t len,
+			     enum object_type type, time_t mtime,
+			     unsigned flags, struct object_id *oid);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-17 11:26       ` Han Xin
                           ` (4 preceding siblings ...)
  2021-12-21 11:52         ` [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream Han Xin
@ 2021-12-21 11:52         ` Han Xin
  2021-12-21 15:06           ` Ævar Arnfjörð Bjarmason
  2021-12-31  3:19           ` Jiang Xin
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
                           ` (6 subsequent siblings)
  12 siblings, 2 replies; 211+ messages in thread
From: Han Xin @ 2021-12-21 11:52 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()".

However, unpacking non-delta objects from a stream instead of from an
entire buffer incurs a ~10% performance penalty. Therefore, only unpack
objects larger than "core.bigFileStreamingThreshold" via zstream. See
the following benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git'

    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
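
For example, to force most blobs through the new streaming code path
when experimenting, one can lower the threshold (a hypothetical
invocation; the knob itself is added by this patch):

    git -C dest.git -c core.bigFileStreamingThreshold=16k \
        unpack-objects <small.pack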

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 Documentation/config/core.txt       | 11 +++++
 builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
 cache.h                             |  1 +
 config.c                            |  5 ++
 environment.c                       |  1 +
 t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
 6 files changed, 125 insertions(+), 2 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a..601b7a2418 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will then be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold` setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 9104eb48da..72d8616e00 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -331,11 +331,82 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
+				     &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB &&
+	    size > big_file_streaming_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/cache.h b/cache.h
index 64071a8d80..8c9123cb5d 100644
--- a/cache.h
+++ b/cache.h
@@ -974,6 +974,7 @@ extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a70..7b122a142a 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 0d06a31024..04bba593de 100644
--- a/environment.c
+++ b/environment.c
@@ -47,6 +47,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
index 48c4fb1ba3..8436cbf8db 100755
--- a/t/t5590-unpack-non-delta-objects.sh
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -13,6 +13,11 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
 	git init --bare dest.git
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileStreamingThreshold $1
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 test_expect_success "setup repo with big blobs (1.5 MB)" '
@@ -33,7 +38,7 @@ test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
 '
 
 test_expect_success 'fail to unpack-objects: cannot allocate' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err &&
 	(
@@ -44,6 +49,35 @@ test_expect_success 'fail to unpack-objects: cannot allocate' '
 	! test_cmp expect actual
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	mkdir -p dest.git/objects/05 &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream with existing oids' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual
+'
+
 test_expect_success 'unpack-objects dry-run' '
 	prepare_dest &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
-- 
2.34.1.52.g80008efde6.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
@ 2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
  2021-12-21 14:43             ` René Scharfe
  2021-12-22 11:29             ` Jiang Xin
  2021-12-31  3:06           ` Jiang Xin
  1 sibling, 2 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 14:09 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> In dry_run mode, "get_data()" is used to verify the inflation of data,
> and the returned buffer will not be used at all and will be freed
> immediately. Even in dry_run mode, it is dangerous to allocate a
> full-size buffer for a large blob object. Therefore, only allocate a
> low memory footprint when calling "get_data()" in dry_run mode.
>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c            | 23 +++++++++---
>  t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+), 6 deletions(-)
>  create mode 100755 t/t5590-unpack-non-delta-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..9104eb48da 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,21 @@ static void use(int bytes)
>  	display_throughput(progress, consumed_bytes);
>  }
>  
> -static void *get_data(unsigned long size)
> +static void *get_data(size_t size, int dry_run)
>  {
>  	git_zstream stream;
> -	void *buf = xmallocz(size);
> +	size_t bufsize;
> +	void *buf;
>  
>  	memset(&stream, 0, sizeof(stream));
> +	if (dry_run && size > 8192)
> +		bufsize = 8192;
> +	else
> +		bufsize = size;
> +	buf = xmallocz(bufsize);

Maybe I'm misunderstanding this, but the commit message says it would be
dangerous to allocate a very large buffer, but here we only limit the
size under "dry_run".

Removing that "&& size > 8192" makes all the tests pass still, so there
seems to be some missing coverage here in any case.

> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> new file mode 100755
> index 0000000000..48c4fb1ba3
> --- /dev/null
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -0,0 +1,57 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='Test unpack-objects with non-delta objects'
> +
> +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> +
> +. ./test-lib.sh
> +
> +prepare_dest () {
> +	test_when_finished "rm -rf dest.git" &&
> +	git init --bare dest.git
> +}
> +
> +test_expect_success "setup repo with big blobs (1.5 MB)" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	(
> +		cd .git &&
> +		find objects/?? -type f | sort
> +	) >expect &&
> +	PACK=$(echo main | git pack-objects --revs test)
> +'
> +
> +test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'fail to unpack-objects: cannot allocate' '
> +	prepare_dest &&
> +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> +	grep "fatal: attempting to allocate" err &&
> +	(
> +		cd dest.git &&
> +		find objects/?? -type f | sort
> +	) >actual &&
> +	test_file_not_empty actual &&
> +	! test_cmp expect actual
> +'
> +
> +test_expect_success 'unpack-objects dry-run' '
> +	prepare_dest &&
> +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
> +	(
> +		cd dest.git &&
> +		find objects/ -type f
> +	) >actual &&
> +	test_must_be_empty actual
> +'
> +
> +test_done

I commented on this "find" usage in an earlier round, I think there's a
much easier way to do this. You're really just going back and forth
between checking whether or not all the objects are loose.

I think that the below fix-up on top of this series is a better way to
do that, and more accurate. I.e. in your test here you check "!
test_cmp", which means that we could have some packed and some loose,
but really what you're meaning to check is a flip-flop between "all
loose?" and "no loose?.

In addition to that, there was no reason to hardcode "main"; we can just
use HEAD. All in all I think the below fix-up makes sense:

diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
index 8436cbf8db6..d78bb89225d 100755
--- a/t/t5590-unpack-non-delta-objects.sh
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -5,9 +5,6 @@
 
 test_description='Test unpack-objects with non-delta objects'
 
-GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
-export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
-
 . ./test-lib.sh
 
 prepare_dest () {
@@ -20,16 +17,22 @@ prepare_dest () {
 	fi
 }
 
+assert_no_loose () {
+	glob=dest.git/objects/?? &&
+	echo "$glob" >expect &&
+	echo $glob >actual &&
+	test_cmp expect actual
+}
+
 test_expect_success "setup repo with big blobs (1.5 MB)" '
 	test-tool genrandom foo 1500000 >big-blob &&
 	test_commit --append foo big-blob &&
 	test-tool genrandom bar 1500000 >big-blob &&
 	test_commit --append bar big-blob &&
-	(
-		cd .git &&
-		find objects/?? -type f | sort
-	) >expect &&
-	PACK=$(echo main | git pack-objects --revs test)
+
+	# Everything is loose
+	rmdir .git/objects/pack &&
+	PACK=$(echo HEAD | git pack-objects --revs test)
 '
 
 test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
@@ -41,51 +44,27 @@ test_expect_success 'fail to unpack-objects: cannot allocate' '
 	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err &&
-	(
-		cd dest.git &&
-		find objects/?? -type f | sort
-	) >actual &&
-	test_file_not_empty actual &&
-	! test_cmp expect actual
+	rmdir dest.git/objects/pack
 '
 
 test_expect_success 'unpack big object in stream' '
 	prepare_dest 1m &&
 	mkdir -p dest.git/objects/05 &&
 	git -C dest.git unpack-objects <test-$PACK.pack &&
-	git -C dest.git fsck &&
-	(
-		cd dest.git &&
-		find objects/?? -type f | sort
-	) >actual &&
-	test_cmp expect actual
+	rmdir dest.git/objects/pack
 '
 
 test_expect_success 'unpack big object in stream with existing oids' '
 	prepare_dest 1m &&
 	git -C dest.git index-pack --stdin <test-$PACK.pack &&
-	(
-		cd dest.git &&
-		find objects/?? -type f | sort
-	) >actual &&
-	test_must_be_empty actual &&
 	git -C dest.git unpack-objects <test-$PACK.pack &&
-	git -C dest.git fsck &&
-	(
-		cd dest.git &&
-		find objects/?? -type f | sort
-	) >actual &&
-	test_must_be_empty actual
+	assert_no_loose
 '
 
 test_expect_success 'unpack-objects dry-run' '
 	prepare_dest &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
-	(
-		cd dest.git &&
-		find objects/ -type f
-	) >actual &&
-	test_must_be_empty actual
+	assert_no_loose
 '
 
 test_done

^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version
  2021-12-21 11:51         ` [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
@ 2021-12-21 14:16           ` Ævar Arnfjörð Bjarmason
  2021-12-22 12:02             ` Jiang Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 14:16 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [...]
> @@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  		strbuf_reset(tmp);
>  		strbuf_add(tmp, filename, dirlen - 1);
>  		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> -			return -1;
> +			break;
>  		if (adjust_shared_perm(tmp->buf))
> -			return -1;
> +			break;
>  
>  		/* Try again */
>  		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
>  		fd = git_mkstemp_mode(tmp->buf, 0444);
> +	} while (0);
> +
> +	if (fd < 0 && !(flags & HASH_SILENT)) {
> +		if (errno == EACCES)
> +			return error(_("insufficient permission for adding an "
> +				       "object to repository database %s"),
> +				     get_object_directory());

This should be an error_errno() instead, ...

> +		else
> +			return error_errno(_("unable to create temporary file"));

...and we can just fold this whole if/else into one condition with a
briefer message, e.g.:

    error_errno(_("unable to add object to '%s'"), get_object_directory());

Or whatever, unless there's another bug here where you inverted these
conditions, and the "else" really should not use "error_errno" but
"error".... (I don't know...)

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream
  2021-12-21 11:52         ` [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream Han Xin
@ 2021-12-21 14:20           ` Ævar Arnfjörð Bjarmason
  2021-12-21 15:05             ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 14:20 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [...]
> +int write_stream_object_file(struct input_stream *in_stream, size_t len,
> +			     enum object_type type, time_t mtime,
> +			     unsigned flags, struct object_id *oid)
> +{
> +	int fd, ret, flush = 0;
> +	unsigned char compressed[4096];
> +	git_zstream stream;
> +	git_hash_ctx c;
> +	struct object_id parano_oid;
> +	static struct strbuf tmp_file = STRBUF_INIT;
> +	static struct strbuf filename = STRBUF_INIT;
> +	int dirlen;
> +	char hdr[MAX_HEADER_LEN];
> +	int hdrlen = sizeof(hdr);
> +
> +	/* Since "filename" is defined as static, it will be reused. So reset it
> +	 * first before using it. */
> +	strbuf_reset(&filename);
> +	/* When oid is not determined, save tmp file to odb path. */
> +	strbuf_addf(&filename, "%s/", get_object_directory());

I realize this is somewhat following the pattern of code you moved
around earlier, but FWIW I think these sorts of comments are really
over-doing it. I.e. we try not to comment on things that are obvious
from the code itself.

Also René's comment on v6 still applies here:

    Given that this function is only used for huge objects I think making
    the strbufs non-static and releasing them is the best choice here.

I think just making them non-static and doing a strbuf_release() as he
suggested is best here.
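
I.e. something like this on top (a rough sketch; the early-return
paths would need the same strbuf_release() treatment):

    struct strbuf tmp_file = STRBUF_INIT;
    struct strbuf filename = STRBUF_INIT;
    int err;

    /* ... create the tmpfile and deflate the stream into it ... */

    err = finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
                                          mtime, flags);
    strbuf_release(&tmp_file);
    strbuf_release(&filename);
    return err;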

> +
> +	fd = create_tmpfile(&tmp_file, filename.buf, flags);
> +	if (fd < 0)
> +		return -1;
> +
> +	hdrlen = format_object_header(hdr, hdrlen, type, len);
> +
> +	/* Set it up and write header */
> +	setup_stream_and_header(&stream, compressed, sizeof(compressed),
> +				&c, hdr, hdrlen);
> +
> +	/* Then the data itself.. */
> +	do {
> +		unsigned char *in0 = stream.next_in;
> +		if (!stream.avail_in) {
> +			const void *in = in_stream->read(in_stream, &stream.avail_in);
> +			stream.next_in = (void *)in;
> +			in0 = (unsigned char *)in;
> +			/* All data has been read. */
> +			if (len + hdrlen == stream.total_in + stream.avail_in)
> +				flush = Z_FINISH;
> +		}
> +		ret = git_deflate(&stream, flush);
> +		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
> +		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
> +			die(_("unable to write loose object file"));
> +		stream.next_out = compressed;
> +		stream.avail_out = sizeof(compressed);
> +	} while (ret == Z_OK || ret == Z_BUF_ERROR);
> +
> +	if (ret != Z_STREAM_END)
> +		die(_("unable to deflate new object streamingly (%d)"), ret);
> +	ret = git_deflate_end_gently(&stream);
> +	if (ret != Z_OK)
> +		die(_("deflateEnd on object streamingly failed (%d)"), ret);

nit: let's say "unable to stream deflate new object" or something, and
not use the confusing (invented?) word "streamingly".

> +	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +
> +	close_loose_object(fd);
> +
> +	oidcpy(oid, &parano_oid);

I see there's still quite a bit of duplication between this and
write_loose_object(), but maybe it's not easy to factor out.

> +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
> +		unlink_or_warn(tmp_file.buf);
> +		return 0;
> +	}
> +
> +	loose_object_path(the_repository, &filename, oid);
> +
> +	/* We finally know the object path, and create the missing dir. */
> +	dirlen = directory_size(filename.buf);
> +	if (dirlen) {
> +		struct strbuf dir = STRBUF_INIT;
> +		strbuf_add(&dir, filename.buf, dirlen - 1);

Just a minor nit, but I noticed we could have this on top, i.e. this
"remove the slash" is now what 1/3 users of it wan:
	
	 object-file.c | 10 +++++-----
	 1 file changed, 5 insertions(+), 5 deletions(-)
	
	diff --git a/object-file.c b/object-file.c
	index 77a3217fd0e..b0dea96906e 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1878,13 +1878,13 @@ static void close_loose_object(int fd)
	 		die_errno(_("error when closing loose object file"));
	 }
	 
	-/* Size of directory component, including the ending '/' */
	+/* Size of directory component, excluding the ending '/' */
	 static inline int directory_size(const char *filename)
	 {
	 	const char *s = strrchr(filename, '/');
	 	if (!s)
	 		return 0;
	-	return s - filename + 1;
	+	return s - filename;
	 }
	 
	 /*
	@@ -1901,7 +1901,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
	 
	 	strbuf_reset(tmp);
	 	strbuf_add(tmp, filename, dirlen);
	-	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
	+	strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
	 	fd = git_mkstemp_mode(tmp->buf, 0444);
	 	do {
	 		if (fd >= 0 || !dirlen || errno != ENOENT)
	@@ -1913,7 +1913,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
	 		 * scratch.
	 		 */
	 		strbuf_reset(tmp);
	-		strbuf_add(tmp, filename, dirlen - 1);
	+		strbuf_add(tmp, filename, dirlen);
	 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
	 			break;
	 		if (adjust_shared_perm(tmp->buf))
	@@ -2100,7 +2100,7 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
	 	dirlen = directory_size(filename.buf);
	 	if (dirlen) {
	 		struct strbuf dir = STRBUF_INIT;
	-		strbuf_add(&dir, filename.buf, dirlen - 1);
	+		strbuf_add(&dir, filename.buf, dirlen);
	 
	 		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
	 			ret = error_errno(_("unable to create directory %s"), dir.buf);

On my platform (linux) it's not needed either way, a "mkdir foo" works
as well as "mkdir foo/", but maybe some oS's have trouble with it.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 2/5] object-file API: add a format_object_header() function
  2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
@ 2021-12-21 14:30           ` René Scharfe
  2022-02-01 14:28             ` C99 %z (was: [PATCH v7 2/5] object-file API: add a format_object_header() function) Ævar Arnfjörð Bjarmason
  2021-12-31  3:12           ` [PATCH v7 2/5] object-file API: add a format_object_header() function Jiang Xin
  1 sibling, 1 reply; 211+ messages in thread
From: René Scharfe @ 2021-12-21 14:30 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

Am 21.12.21 um 12:51 schrieb Han Xin:
> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>
> Add a convenience function to wrap the xsnprintf() command that
> generates loose object headers. This code was copy/pasted in various
> parts of the codebase, let's define it in one place and re-use it from
> there.
>
> All except one caller of it had a valid "enum object_type" for us,
> it's only write_object_file_prepare() which might need to deal with
> "git hash-object --literally" and a potential garbage type. Let's have
> the primary API use an "enum object_type", and define an *_extended()
> function that can take an arbitrary "const char *" for the type.
>
> See [1] for the discussion that prompted this patch, i.e. new code in
> object-file.c that wanted to copy/paste the xsnprintf() invocation.
>
> 1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/index-pack.c |  3 +--
>  bulk-checkin.c       |  4 ++--
>  cache.h              | 21 +++++++++++++++++++++
>  http-push.c          |  2 +-
>  object-file.c        | 14 +++++++++++---
>  5 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index c23d01de7d..4a765ddae6 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
>  	int hdrlen;
>
>  	if (!is_delta_type(type)) {
> -		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
> -				   type_name(type),(uintmax_t)size) + 1;
> +		hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)size);
                                                                      ^^^^^^^^^^^
This explicit cast is unnecessary.  It was needed with xsnprintf(), but
that implementation detail is handled inside the new helper function.

(format_object_header() takes a size_t; even if unsigned long would be
wider than that on some weird architecture, casting the size to
uintmax_t will not avoid the implicit truncation happening during the
function call.)
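
In other words, the call can simply drop the cast:

    hdrlen = format_object_header(hdr, sizeof(hdr), type, size);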

>  		the_hash_algo->init_fn(&c);
>  		the_hash_algo->update_fn(&c, hdr, hdrlen);
>  	} else
> diff --git a/bulk-checkin.c b/bulk-checkin.c
> index 8785b2ac80..1733a1de4f 100644
> --- a/bulk-checkin.c
> +++ b/bulk-checkin.c
> @@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
>  	if (seekback == (off_t) -1)
>  		return error("cannot find the current offset");
>
> -	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
> -			       type_name(type), (uintmax_t)size) + 1;
> +	header_len = format_object_header((char *)obuf, sizeof(obuf),
> +					 type, (uintmax_t)size);
                                               ^^^^^^^^^^^
Same here, just that size is already of type size_t, so a cast makes
even less sense.

>  	the_hash_algo->init_fn(&ctx);
>  	the_hash_algo->update_fn(&ctx, obuf, header_len);
>
> diff --git a/cache.h b/cache.h
> index cfba463aa9..64071a8d80 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -1310,6 +1310,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
>  						    unsigned long bufsiz,
>  						    struct strbuf *hdrbuf);
>
> +/**
> + * format_object_header() is a thin wrapper around xsnprintf() that
> + * writes the initial "<type> <obj-len>" part of the loose object
> + * header. It returns the size that snprintf() returns + 1.
> + *
> + * The format_object_header_extended() function allows for writing a
> + * type_name that's not one of the "enum object_type" types. This is
> + * used for "git hash-object --literally". Pass in OBJ_NONE as the
> + * type, and a non-NULL "type_str" to do that.
> + *
> + * format_object_header() is a convenience wrapper for
> + * format_object_header_extended().
> + */
> +int format_object_header_extended(char *str, size_t size, enum object_type type,
> +				 const char *type_str, size_t objsize);
> +static inline int format_object_header(char *str, size_t size,
> +				      enum object_type type, size_t objsize)
> +{
> +	return format_object_header_extended(str, size, type, NULL, objsize);
> +}
> +
>  /**
>   * parse_loose_header() parses the starting "<type> <len>\0" of an
>   * object. If it doesn't follow that format -1 is returned. To check
> diff --git a/http-push.c b/http-push.c
> index 3309aaf004..f55e316ff4 100644
> --- a/http-push.c
> +++ b/http-push.c
> @@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
>  	git_zstream stream;
>
>  	unpacked = read_object_file(&request->obj->oid, &type, &len);
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> +	hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)len);
                                                              ^^^^^^^^^^^
Same here; len is of type unsigned long.

>
>  	/* Set it up */
>  	git_deflate_init(&stream, zlib_compression_level);
> diff --git a/object-file.c b/object-file.c
> index eb1426f98c..6bba4766f9 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
>  	return ret;
>  }
>
> +int format_object_header_extended(char *str, size_t size, enum object_type type,
> +				 const char *typestr, size_t objsize)
> +{
> +	const char *s = type == OBJ_NONE ? typestr : type_name(type);
> +
> +	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
                                                      ^^^^^^^^^^^
This cast is necessary to match PRIuMAX.  And that is used because the z
modifier (as in e.g. printf("%zu", sizeof(size_t));) was only added in
C99 and not all platforms may have it.  (Perhaps this cautious approach
is worth revisiting separately, now that some time has passed, but this
patch series should still use PRIuMAX, as it does.)
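
For context, the two spellings side by side (a tiny standalone
example; the "z" modifier is the C99-only part):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        size_t objsize = 42;
        printf("%zu\n", objsize);                     /* needs C99 */
        printf("%" PRIuMAX "\n", (uintmax_t)objsize); /* portable */
        return 0;
    }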

> +}
> +
>  /*
>   * With an in-core object data in "map", rehash it to make sure the
>   * object name actually matches "oid" to detect object corruption.
> @@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
>  		return -1;
>
>  	/* Generate the header */
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
> +	hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size);
>
>  	/* Sha1.. */
>  	r->hash_algo->init_fn(&c);
> @@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
>  	git_hash_ctx c;
>
>  	/* Generate the header */
> -	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
> +	*hdrlen = format_object_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
>
>  	/* Sha1.. */
>  	algo->init_fn(&c);
> @@ -2006,7 +2014,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	buf = read_object(the_repository, oid, &type, &len);
>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));
> -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
> +	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
>  	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
>  	free(buf);
>

No explicit cast in these three cases -- good.  They all pass an
unsigned long as last parameter btw.

René

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
@ 2021-12-21 14:43             ` René Scharfe
  2021-12-21 15:04               ` Ævar Arnfjörð Bjarmason
  2021-12-22 11:15               ` Jiang Xin
  2021-12-22 11:29             ` Jiang Xin
  1 sibling, 2 replies; 211+ messages in thread
From: René Scharfe @ 2021-12-21 14:43 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason, Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, Han Xin

Am 21.12.21 um 15:09 schrieb Ævar Arnfjörð Bjarmason:
>
> On Tue, Dec 21 2021, Han Xin wrote:
>
>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>
>> In dry_run mode, "get_data()" is used to verify the inflation of data,
>> and the returned buffer will not be used at all and will be freed
>> immediately. Even in dry_run mode, it is dangerous to allocate a
>> full-size buffer for a large blob object. Therefore, only allocate a
>> low memory footprint when calling "get_data()" in dry_run mode.
>>
>> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>> ---
>>  builtin/unpack-objects.c            | 23 +++++++++---
>>  t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
>>  2 files changed, 74 insertions(+), 6 deletions(-)
>>  create mode 100755 t/t5590-unpack-non-delta-objects.sh
>>
>> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>> index 4a9466295b..9104eb48da 100644
>> --- a/builtin/unpack-objects.c
>> +++ b/builtin/unpack-objects.c
>> @@ -96,15 +96,21 @@ static void use(int bytes)
>>  	display_throughput(progress, consumed_bytes);
>>  }
>>
>> -static void *get_data(unsigned long size)
>> +static void *get_data(size_t size, int dry_run)
>>  {
>>  	git_zstream stream;
>> -	void *buf = xmallocz(size);
>> +	size_t bufsize;
>> +	void *buf;
>>
>>  	memset(&stream, 0, sizeof(stream));
>> +	if (dry_run && size > 8192)
>> +		bufsize = 8192;
>> +	else
>> +		bufsize = size;
>> +	buf = xmallocz(bufsize);
>
> Maybe I'm misunderstanding this, but the commit message says it would be
> dangerous to allocate a very large buffer, but here we only limit the
> size under "dry_run".

This patch reduces the memory usage of dry runs, as its commit message
says.  The memory usage of one type of actual (non-dry) unpack is reduced
by patch 5.

> Removing that "&& size > 8192" makes all the tests pass still, so there
> seems to be some missing coverage here in any case.

How would you test that an 8KB buffer is allocated even though a smaller
one would suffice?  And why?  Wasting a few KB shouldn't be noticeable.

René

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 14:43             ` René Scharfe
@ 2021-12-21 15:04               ` Ævar Arnfjörð Bjarmason
  2021-12-22 11:15               ` Jiang Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 15:04 UTC (permalink / raw)
  To: René Scharfe
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Derrick Stolee, Han Xin


On Tue, Dec 21 2021, René Scharfe wrote:

> Am 21.12.21 um 15:09 schrieb Ævar Arnfjörð Bjarmason:
>>
>> On Tue, Dec 21 2021, Han Xin wrote:
>>
>>> From: Han Xin <hanxin.hx@alibaba-inc.com>
>>>
>>> In dry_run mode, "get_data()" is used to verify the inflation of data,
>>> and the returned buffer will not be used at all and will be freed
>>> immediately. Even in dry_run mode, it is dangerous to allocate a
>>> full-size buffer for a large blob object. Therefore, only allocate a
>>> low memory footprint when calling "get_data()" in dry_run mode.
>>>
>>> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>>> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>>> ---
>>>  builtin/unpack-objects.c            | 23 +++++++++---
>>>  t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
>>>  2 files changed, 74 insertions(+), 6 deletions(-)
>>>  create mode 100755 t/t5590-unpack-non-delta-objects.sh
>>>
>>> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
>>> index 4a9466295b..9104eb48da 100644
>>> --- a/builtin/unpack-objects.c
>>> +++ b/builtin/unpack-objects.c
>>> @@ -96,15 +96,21 @@ static void use(int bytes)
>>>  	display_throughput(progress, consumed_bytes);
>>>  }
>>>
>>> -static void *get_data(unsigned long size)
>>> +static void *get_data(size_t size, int dry_run)
>>>  {
>>>  	git_zstream stream;
>>> -	void *buf = xmallocz(size);
>>> +	size_t bufsize;
>>> +	void *buf;
>>>
>>>  	memset(&stream, 0, sizeof(stream));
>>> +	if (dry_run && size > 8192)
>>> +		bufsize = 8192;
>>> +	else
>>> +		bufsize = size;
>>> +	buf = xmallocz(bufsize);
>>
>> Maybe I'm misunderstanding this, but the commit message says it would be
>> dangerous to allocate a very larger buffer, but here we only limit the
>> size under "dry_run".
>
> This patch reduces the memory usage of dry runs, as its commit message
> says.  The memory usage of one type of actual (non-dry) unpack is reduced
> by patch 5.
>
>> Removing that "&& size > 8192" makes all the tests pass still, so there
>> seems to be some missing coverage here in any case.
>
> How would you test that an 8KB buffer is allocated even though a smaller
> one would suffice?  And why?  Wasting a few KB shouldn't be noticeable.

That doesn't sound like it needs to be tested. I was just trying to grok
what this was all doing. Thanks!

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream
  2021-12-21 14:20           ` Ævar Arnfjörð Bjarmason
@ 2021-12-21 15:05             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 15:05 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Ævar Arnfjörð Bjarmason wrote:

> On Tue, Dec 21 2021, Han Xin wrote:

>> +	/* Then the data itself.. */
>> +	do {
>> +		unsigned char *in0 = stream.next_in;
>> +		if (!stream.avail_in) {
>> +			const void *in = in_stream->read(in_stream, &stream.avail_in);
>> +			stream.next_in = (void *)in;
>> +			in0 = (unsigned char *)in;
>> +			/* All data has been read. */
>> +			if (len + hdrlen == stream.total_in + stream.avail_in)
>> +				flush = Z_FINISH;
>> +		}
>> +		ret = git_deflate(&stream, flush);
>> +		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
>> +		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
>> +			die(_("unable to write loose object file"));
>> +		stream.next_out = compressed;
>> +		stream.avail_out = sizeof(compressed);
>> +	} while (ret == Z_OK || ret == Z_BUF_ERROR);
>> +
>> +	if (ret != Z_STREAM_END)
>> +		die(_("unable to deflate new object streamingly (%d)"), ret);
>> +	ret = git_deflate_end_gently(&stream);
>> +	if (ret != Z_OK)
>> +		die(_("deflateEnd on object streamingly failed (%d)"), ret);
>
> nit: let's say "unable to stream deflate new object" or something, and
> not use the confusing (invented?) word "streamingly".
>
>> +	the_hash_algo->final_oid_fn(&parano_oid, &c);
>> +
>> +	close_loose_object(fd);
>> +
>> +	oidcpy(oid, &parano_oid);
>
> I see there's still quite a bit of duplication between this and
> write_loose_object(), but maybe it's not easy to factor out.

For what it's worth I tried to do that and the result doesn't really
seem worth it. I.e. something like the below. The inner loop of the
do/while looks like it could get a similar treatment, but likewise
doesn't seem worth the effort.

diff --git a/object-file.c b/object-file.c
index b0dea96906e..7fc2363cfa1 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1957,6 +1957,46 @@ static void setup_stream_and_header(git_zstream *stream,
 	the_hash_algo->update_fn(c, hdr, hdrlen);
 }
 
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     enum object_type type, size_t len,
+				     char *hdr, int *hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename, flags);
+	if (fd < 0)
+		return -1;
+
+	if (type != OBJ_NONE)
+		*hdrlen = format_object_header(hdr, *hdrlen, type, len);
+
+	/* Set it up and write header */
+	setup_stream_and_header(stream, buf, buflen, c, hdr, *hdrlen);
+
+	return fd;
+
+}
+
+static void end_loose_object_common(int ret, git_hash_ctx *c,
+				    git_zstream *stream,
+				    struct object_id *parano_oid,
+				    const struct object_id *expected_oid,
+				    const char *zstream_end_fmt,
+				    const char *z_ok_fmt)
+{
+	if (ret != Z_STREAM_END)
+		die(_(zstream_end_fmt), ret, expected_oid);
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		die(_(z_ok_fmt), ret, expected_oid);
+	the_hash_algo->final_oid_fn(parano_oid, c);
+}
+
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1970,15 +2010,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	static struct strbuf filename = STRBUF_INIT;
 
 	loose_object_path(the_repository, &filename, oid);
-
-	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, OBJ_NONE, 0, hdr, &hdrlen);
 	if (fd < 0)
 		return -1;
 
-	/* Set it up and write header */
-	setup_stream_and_header(&stream, compressed, sizeof(compressed),
-				&c, hdr, hdrlen);
-
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
@@ -1992,14 +2029,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-		    ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
+				N_("unable to deflate new object %s (%d)"),
+				N_("deflateEnd on object %s failed (%d)"));
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
@@ -2049,16 +2081,12 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
 	/* When oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
 
-	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, type, len, hdr, &hdrlen);
 	if (fd < 0)
 		return -1;
 
-	hdrlen = format_object_header(hdr, hdrlen, type, len);
-
-	/* Set it up and write header */
-	setup_stream_and_header(&stream, compressed, sizeof(compressed),
-				&c, hdr, hdrlen);
-
 	/* Then the data itself.. */
 	do {
 		unsigned char *in0 = stream.next_in;
@@ -2078,12 +2106,9 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK || ret == Z_BUF_ERROR);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object streamingly (%d)"), ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object streamingly failed (%d)"), ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	end_loose_object_common(ret, &c, &stream, &parano_oid, NULL,
+				N_("unable to deflate new object streamingly (%d)"),
+				N_("deflateEnd on object streamingly failed (%d)"));
 
 	close_loose_object(fd);
 

^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2021-12-21 15:06           ` Ævar Arnfjörð Bjarmason
  2021-12-31  3:19           ` Jiang Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-12-21 15:06 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Han Xin


On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> By implementing a zstream version of input_stream interface, we can use
> a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpacking non-delta objects from a stream instead of from an
> entire buffer incurs a ~10% performance penalty. Therefore, only unpack
> objects larger than "core.bigFileStreamingThreshold" via zstream. See
> the following benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git'
>
>     Summary
>       './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
>         1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
>         1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
>         1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Derrick Stolee <stolee@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  Documentation/config/core.txt       | 11 +++++
>  builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
>  cache.h                             |  1 +
>  config.c                            |  5 ++
>  environment.c                       |  1 +
>  t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
>  6 files changed, 125 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index c04f62a54a..601b7a2418 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
>  +
>  Common unit suffixes of 'k', 'm', or 'g' are supported.
>  
> +core.bigFileStreamingThreshold::
> +	Files larger than this will be streamed out to a temporary
> +	object file while being hashed, which will then be renamed
> +	in-place to a loose object, particularly if the
> +	`core.bigFileThreshold` setting dictates that they're always
> +	written out as loose objects.
> ++
> +Default is 128 MiB on all platforms.
> ++
> +Common unit suffixes of 'k', 'm', or 'g' are supported.
> +
>  core.excludesFile::
>  	Specifies the pathname to the file that contains patterns to
>  	describe paths that are not meant to be tracked, in addition
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 9104eb48da..72d8616e00 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -331,11 +331,82 @@ static void added_object(unsigned nr, enum object_type type,
>  	}
>  }
>  
> +struct input_zstream_data {
> +	git_zstream *zstream;
> +	unsigned char buf[8192];
> +	int status;
> +};
> +
> +static const void *feed_input_zstream(struct input_stream *in_stream,
> +				      unsigned long *readlen)
> +{
> +	struct input_zstream_data *data = in_stream->data;
> +	git_zstream *zstream = data->zstream;
> +	void *in = fill(1);
> +
> +	if (!len || data->status == Z_STREAM_END) {
> +		*readlen = 0;
> +		return NULL;
> +	}
> +
> +	zstream->next_out = data->buf;
> +	zstream->avail_out = sizeof(data->buf);
> +	zstream->next_in = in;
> +	zstream->avail_in = len;
> +
> +	data->status = git_inflate(zstream, 0);
> +	use(len - zstream->avail_in);
> +	*readlen = sizeof(data->buf) - zstream->avail_out;
> +
> +	return data->buf;
> +}
> +
> +static void write_stream_blob(unsigned nr, size_t size)
> +{
> +	git_zstream zstream;
> +	struct input_zstream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_input_zstream,
> +		.data = &data,
> +	};
> +
> +	memset(&zstream, 0, sizeof(zstream));
> +	memset(&data, 0, sizeof(data));

nit/style: both of these memset can be replaced by "{ 0 }", e.g. "git_zstream zstream = { 0 }".

> +	data.zstream = &zstream;
> +	git_inflate_init(&zstream);
> +
> +	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
> +				     &obj_list[nr].oid))

So at the end of this series we never pass in anything but a blob here,
mtime is always 0 etc. So there was no reason to create a factored-out
finalize_object_file_with_mtime() earlier in the series.

Well, I don't mind finalize_object_file_with_mtime() existing, but
let's not pretend this is more generalized than it is. We're unlikely to
ever want to do this for non-blobs.

This on top of this series (and my local WIP fixups as I'm reviewing
this, so it won't cleanly apply, but the idea should be clear) makes
this simpler:
	
	diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
	index 2f8d34a2e47..a3a1d4b266f 100644
	--- a/builtin/unpack-objects.c
	+++ b/builtin/unpack-objects.c
	@@ -375,8 +375,7 @@ static void write_stream_blob(unsigned nr, size_t size)
	 	data.zstream = &zstream;
	 	git_inflate_init(&zstream);
	 
	-	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
	-				     &obj_list[nr].oid))
	+	if (write_stream_object_file(&in_stream, size, &obj_list[nr].oid))
	 		die(_("failed to write object in stream"));
	 
	 	if (zstream.total_out != size || data.status != Z_STREAM_END)
	diff --git a/object-file.c b/object-file.c
	index 7fc2363cfa1..0572b34fc5a 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -2061,8 +2061,7 @@ static int freshen_packed_object(const struct object_id *oid)
	 }
	 
	 int write_stream_object_file(struct input_stream *in_stream, size_t len,
	-			     enum object_type type, time_t mtime,
	-			     unsigned flags, struct object_id *oid)
	+			     struct object_id *oid)
	 {
	 	int fd, ret, flush = 0;
	 	unsigned char compressed[4096];
	@@ -2081,9 +2080,9 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
	 	/* When oid is not determined, save tmp file to odb path. */
	 	strbuf_addf(&filename, "%s/", get_object_directory());
	 
	-	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
	+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
	 				       &stream, compressed, sizeof(compressed),
	-				       &c, type, len, hdr, &hdrlen);
	+				       &c, OBJ_BLOB, len, hdr, &hdrlen);
	 	if (fd < 0)
	 		return -1;
	 
	@@ -2135,7 +2134,7 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
	 		strbuf_release(&dir);
	 	}
	 
	-	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
	+	return finalize_object_file(tmp_file.buf, filename.buf);
	 }
	 
	 int write_object_file_flags(const void *buf, unsigned long len,
	diff --git a/object-store.h b/object-store.h
	index 87d370d39ca..1362b58a4d3 100644
	--- a/object-store.h
	+++ b/object-store.h
	@@ -257,8 +257,7 @@ int hash_write_object_file_literally(const void *buf, unsigned long len,
	 				     unsigned flags);
	 
	 int write_stream_object_file(struct input_stream *in_stream, size_t len,
	-			     enum object_type type, time_t mtime,
	-			     unsigned flags, struct object_id *oid);
	+			     struct object_id *oid);
	 
	 /*
	  * Add an object file to the in-memory object store, without writing it
	

> +		die(_("failed to write object in stream"));
> diff --git a/environment.c b/environment.c
> index 0d06a31024..04bba593de 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -47,6 +47,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
>  size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
>  size_t delta_base_cache_limit = 96 * 1024 * 1024;
>  unsigned long big_file_threshold = 512 * 1024 * 1024;
> +unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
>  int pager_use_color = 1;
>  const char *editor_program;
>  const char *askpass_program;
> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> index 48c4fb1ba3..8436cbf8db 100755
> --- a/t/t5590-unpack-non-delta-objects.sh
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -13,6 +13,11 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
>  prepare_dest () {
>  	test_when_finished "rm -rf dest.git" &&
>  	git init --bare dest.git
> +	if test -n "$1"
> +	then
> +		git -C dest.git config core.bigFileStreamingThreshold $1
> +		git -C dest.git config core.bigFileThreshold $1
> +	fi

All of this new code is missing "&&" to chain commands and test for failures.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 14:43             ` René Scharfe
  2021-12-21 15:04               ` Ævar Arnfjörð Bjarmason
@ 2021-12-22 11:15               ` Jiang Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-12-22 11:15 UTC (permalink / raw)
  To: René Scharfe
  Cc: Ævar Arnfjörð Bjarmason, Han Xin, Junio C Hamano,
	Git List, Jeff King, Jiang Xin, Philip Oakley, Derrick Stolee,
	Han Xin

On Wed, Dec 22, 2021 at 9:53 AM René Scharfe <l.s.r@web.de> wrote:
>
> Am 21.12.21 um 15:09 schrieb Ævar Arnfjörð Bjarmason:
> > Maybe I'm misunderstanding this, but the commit message says it would be
> > dangerous to allocate a very large buffer, but here we only limit the
> > size under "dry_run".
>
> This patch reduces the memory usage of dry runs, as its commit message
> says.  The memory usage of one type of actual (non-dry) unpack is reduced
> by patch 5.
>

For Han Xin and me, it is very challenging to write a better commit log
in English.  Since the commit has been moved to the beginning of the
series, its commit log should be rewritten as follows:

unpack-objects.c: low memory footprint for get_data() in dry_run mode

As the name implies, "get_data(size)" will allocate and return a buffer
of the given size. Allocating memory for a large blob object may cause
the system to run out of memory. Before replacing the calls of
"get_data()" to resolve the unpack issue for large blob objects,
refactor "get_data()" to reduce its memory footprint in dry_run mode,
because in dry_run mode "get_data()" is only used to check the
integrity of the data and the returned buffer is not used at all.

Therefore, add the flag "dry_run" as an additional parameter of
"get_data()" and reuse a small buffer in dry_run mode. Because in
dry_run mode the returned buffer does not hold the entire data the
caller asked for, release the buffer and return NULL instead.

Han Xin, I think you can try to free the allocated buffer for dry_run
mode inside "get_data()".
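
A minimal sketch of that idea (the tail of "get_data()"; "dry_run" here
is assumed to be the file-scope flag in builtin/unpack-objects.c):

	git_inflate_end(&stream);
	if (dry_run)
		FREE_AND_NULL(buf);
	return buf;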

--
Jiang Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
  2021-12-21 14:43             ` René Scharfe
@ 2021-12-22 11:29             ` Jiang Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-12-22 11:29 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Derrick Stolee, René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 8:37 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Tue, Dec 21 2021, Han Xin wrote:
>
> I commented on this "find" usage in an earlier round, I think there's a
> much easier way to do this. You're really just going back and forth
> between checking whether or not all the objects are loose.
>
> I think that the below fix-up on top of this series is a better way to
> do that, and more accurate. I.e. in your test here you check "!
> test_cmp", which means that we could have some packed and some loose,
> but really what you're meaning to check is a flip-flop between "all
> loose?" and "no loose?".
>
> In addition to that there was no reason to hardcode "main", we can just
> use HEAD. All in all I think the below fix-up makes sense:
>
> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> index 8436cbf8db6..d78bb89225d 100755
> --- a/t/t5590-unpack-non-delta-objects.sh
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -5,9 +5,6 @@
>
>  test_description='Test unpack-objects with non-delta objects'
>
> -GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
> -export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
> -
>  . ./test-lib.sh
>
>  prepare_dest () {
> @@ -20,16 +17,22 @@ prepare_dest () {
>         fi
>  }
>
> +assert_no_loose () {
> +       glob=dest.git/objects/?? &&
> +       echo "$glob" >expect &&
> +       echo $glob >actual &&

Incompatible with zsh. This may work:

    eval "echo $glob" >actual &&
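
The whole helper would then read something like this (a sketch):

    assert_no_loose () {
    	glob=dest.git/objects/?? &&
    	echo "$glob" >expect &&
    	eval "echo $glob" >actual &&
    	test_cmp expect actual
    }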

--
Jiang Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version
  2021-12-21 14:16           ` Ævar Arnfjörð Bjarmason
@ 2021-12-22 12:02             ` Jiang Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-12-22 12:02 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Derrick Stolee, René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 8:40 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Tue, Dec 21 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> > [...]
> > @@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >               strbuf_reset(tmp);
> >               strbuf_add(tmp, filename, dirlen - 1);
> >               if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> > -                     return -1;
> > +                     break;
> >               if (adjust_shared_perm(tmp->buf))
> > -                     return -1;
> > +                     break;
> >
> >               /* Try again */
> >               strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
> >               fd = git_mkstemp_mode(tmp->buf, 0444);
> > +     } while (0);
> > +
> > +     if (fd < 0 && !(flags & HASH_SILENT)) {
> > +             if (errno == EACCES)
> > +                     return error(_("insufficient permission for adding an "
> > +                                    "object to repository database %s"),
> > +                                  get_object_directory());
>
> This should be an error_errno() instead, ...

We already know the errno (EACCES) and output a decent error message,
so using error() is OK.  BTW, it's just a refactoring by copy & paste.

>
> > +             else
> > +                     return error_errno(_("unable to create temporary file"));
>
> ...and we can just fold this whole if/else into one condition with a
> briefer message, e.g.:
>
>     error_errno(_("unable to add object to '%s'"), get_object_directory());
>
> Or whatever, unless there's another bug here where you inverted these
> conditions, and the "else" really should not use "error_errno" but
> "error".... (I don't know...)
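
That folded version would be something like (a rough sketch on top of
this patch, not tested):

	if (fd < 0 && !(flags & HASH_SILENT))
		return error_errno(_("unable to add object to '%s'"),
				   get_object_directory());

	return fd;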

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data()
  2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
  2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
@ 2021-12-31  3:06           ` Jiang Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-12-31  3:06 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 2:33 AM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> In dry_run mode, "get_data()" is used to verify the inflation of data,
> and the returned buffer will not be used at all and will be freed
> immediately. Even in dry_run mode, it is dangerous to allocate a
> full-size buffer for a large blob object. Therefore, only allocate a
> low memory footprint when calling "get_data()" in dry_run mode.
>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c            | 23 +++++++++---
>  t/t5590-unpack-non-delta-objects.sh | 57 +++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+), 6 deletions(-)
>  create mode 100755 t/t5590-unpack-non-delta-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..9104eb48da 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,21 @@ static void use(int bytes)
>         display_throughput(progress, consumed_bytes);
>  }
>
> -static void *get_data(unsigned long size)
> +static void *get_data(size_t size, int dry_run)

After an offline talk with Han Xin, we feel it is not necessary to pass
"dry_run" as an argument; "get_data()" can use the file-scope static
variable directly.

>  {
>         git_zstream stream;
> -       void *buf = xmallocz(size);
> +       size_t bufsize;
> +       void *buf;
>
>         memset(&stream, 0, sizeof(stream));
> +       if (dry_run && size > 8192)

Use the file-scope static variable "dry_run".
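
I.e. the allocation part of "get_data()" would become something like
this (a sketch, with "dry_run" being the existing file-scope static):

	static void *get_data(unsigned long size)
	{
		git_zstream stream;
		unsigned long bufsize;
		void *buf;

		memset(&stream, 0, sizeof(stream));
		if (dry_run && size > 8192)
			bufsize = 8192;
		else
			bufsize = size;
		buf = xmallocz(bufsize);
		/* ... inflate into the (reused) buffer as before ... */
	}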

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 2/5] object-file API: add a format_object_header() function
  2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
  2021-12-21 14:30           ` René Scharfe
@ 2021-12-31  3:12           ` Jiang Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-12-31  3:12 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 2:56 AM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>
> Add a convenience function to wrap the xsnprintf() command that
> generates loose object headers. This code was copy/pasted in various
> parts of the codebase, let's define it in one place and re-use it from
> there.
>
> All except one caller of it had a valid "enum object_type" for us,
> it's only write_object_file_prepare() which might need to deal with
> "git hash-object --literally" and a potential garbage type. Let's have
> the primary API use an "enum object_type", and define an *_extended()
> function that can take an arbitrary "const char *" for the type.
>
> See [1] for the discussion that prompted this patch, i.e. new code in
> object-file.c that wanted to copy/paste the xsnprintf() invocation.
>
> 1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/index-pack.c |  3 +--
>  bulk-checkin.c       |  4 ++--
>  cache.h              | 21 +++++++++++++++++++++
>  http-push.c          |  2 +-
>  object-file.c        | 14 +++++++++++---
>  5 files changed, 36 insertions(+), 8 deletions(-)

After an offline review with Han Xin, we feel it's better to move this
fixup commit to the end of this series; it will then also fix an
additional "xsnprintf()" call we introduce in this series.
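
For reference, the convenience function being discussed boils down to
something like this sketch (the exact implementation, including the
*_extended() variant for "git hash-object --literally", is in the
patch itself):

	int format_object_header(char *str, size_t size,
				 enum object_type type, size_t objsize)
	{
		const char *name = type_name(type);

		if (!name)
			BUG("could not get a type name for type %d", type);
		return xsnprintf(str, size, "%s %"PRIuMAX,
				 name, (uintmax_t)objsize) + 1;
	}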

--
Jiang Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2021-12-21 15:06           ` Ævar Arnfjörð Bjarmason
@ 2021-12-31  3:19           ` Jiang Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Jiang Xin @ 2021-12-31  3:19 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Han Xin

On Wed, Dec 22, 2021 at 2:56 AM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> By implementing a zstream version of the input_stream interface, we can use
> a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpacking non-delta objects from a stream instead of from an
> entire buffer has about a 10% performance penalty. Therefore, only unpack
> objects larger than the "core.BigFileStreamingThreshold" in zstream. See
> the following benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git'
>
>     Summary
>       './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
>         1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
>         1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
>         1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Derrick Stolee <stolee@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  Documentation/config/core.txt       | 11 +++++
>  builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
>  cache.h                             |  1 +
>  config.c                            |  5 ++
>  environment.c                       |  1 +
>  t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
>  6 files changed, 125 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index c04f62a54a..601b7a2418 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
>  +
>  Common unit suffixes of 'k', 'm', or 'g' are supported.
>
> +core.bigFileStreamingThreshold::
> +       Files larger than this will be streamed out to a temporary
> +       object file while being hashed, which will then be renamed
> +       in-place to a loose object, particularly if the
> +       `core.bigFileThreshold` setting dictates that they're always
> +       written out as loose objects.

Han Xin told me the reason for introducing another git config variable,
but I feel it is not good to introduce an application-specific config
variable as "core.XXX" and parse it in "config.c".

So in patch v8, we will still reuse the config variable
"core.bigFileThreshold", and will introduce an application-specific
config variable, such as "unpack.bigFileThreshold", and parse the new
config in "builtin/unpack-objects.c".
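
Parsing such a variable in the builtin could be as simple as this
sketch (the name "unpack.bigFileThreshold" is only the proposal above,
not in any patch yet):

	static int git_unpack_config(const char *var, const char *value, void *cb)
	{
		if (!strcmp(var, "unpack.bigfilethreshold")) {
			big_file_threshold = git_config_ulong(var, value);
			return 0;
		}
		return git_default_config(var, value, cb);
	}

and then calling git_config(git_unpack_config, NULL) from
cmd_unpack_objects() instead of the plain git_default_config.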

--
Jiang Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v8 0/6] unpack large blobs in stream
  2021-12-17 11:26       ` Han Xin
                           ` (5 preceding siblings ...)
  2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
                             ` (5 more replies)
  2022-01-08  8:54         ` [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
                           ` (5 subsequent siblings)
  12 siblings, 6 replies; 211+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v7:
* Use functions "assert_no_loose()" and "assert_no_pack()" to do tests instead
  of "find", suggested by Ævar Arnfjörð Bjarmason[1].

* "get_data()" now uses the global "dry_run" and will release the buffer
  before returning.

* Add a new commit "object-file.c: remove the slash for directory_size()",
  suggested by Ævar Arnfjörð Bjarmason[2].

* Add "int is_finished" to "struct input_stream", which tells us whether
  there is a next buffer in the stream.

* Remove the config "core.bigFileStreamingThreshold" introduced in v5, and keep
  using "core.bigFileThreshold". Until now, the config variable has been used in
  the cases listed in "unpack-objects: unpack_non_delta_entry() read data in a
  stream"; this new case belongs to the packfile category.

* Remove an unnecessary explicit cast in "object-file API: add a
  format_object_header() function", suggested by René Scharfe[3].

1. https://lore.kernel.org/git/211221.86bl1arqls.gmgdl@evledraar.gmail.com/
2. https://lore.kernel.org/git/211221.8635mmrpps.gmgdl@evledraar.gmail.com/
3. https://lore.kernel.org/git/b2dee243-1a38-531e-02b1-ffd66c465fa5@web.de/

Han Xin (5):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: remove the slash for directory_size()
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: unpack_non_delta_entry() read data in a stream

Ævar Arnfjörð Bjarmason (1):
  object-file API: add a format_object_header() function

 builtin/index-pack.c            |   3 +-
 builtin/unpack-objects.c        | 110 +++++++++++--
 bulk-checkin.c                  |   4 +-
 cache.h                         |  21 +++
 http-push.c                     |   2 +-
 object-file.c                   | 272 ++++++++++++++++++++++++++------
 object-store.h                  |   9 ++
 t/t5329-unpack-large-objects.sh |  69 ++++++++
 8 files changed, 422 insertions(+), 68 deletions(-)
 create mode 100755 t/t5329-unpack-large-objects.sh

Range-diff against v7:
1:  a8f232f553 < -:  ---------- unpack-objects.c: add dry_run mode for get_data()
-:  ---------- > 1:  bd34da5816 unpack-objects: low memory footprint for get_data() in dry_run mode
3:  a571b8f16c ! 2:  f9a4365a7d object-file.c: refactor write_loose_object() to reuse in stream version
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    object-file.c: refactor write_loose_object() to reuse in stream version
    +    object-file.c: refactor write_loose_object() to several steps
     
    -    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    -    entire contents of a blob object, no matter how big it is. This
    -    implementation may consume all the memory and cause OOM.
    +    When writing a large blob using "write_loose_object()", we have to pass
    +    a buffer with the whole content of the blob, and this behavior will
    +    consume lots of memory and may cause OOM. We will introduce a stream
    +    version function ("stream_loose_object()") in a later commit to resolve
    +    this issue.
     
    -    This can be improved by feeding data to "stream_loose_object()" in
    -    stream instead of read into the whole buf.
    +    Before introducing a stream version function for writing loose objects,
    +    do some refactoring on "write_loose_object()" to reuse code for both
    +    versions.
     
    -    As this new method "stream_loose_object()" has many similarities with
    -    "write_loose_object()", we split up "write_loose_object()" into some
    -    steps:
    -     1. Figuring out a path for the (temp) object file.
    -     2. Creating the tempfile.
    -     3. Setting up zlib and write header.
    -     4. Write object data and handle errors.
    -     5. Optionally, do someting after write, maybe force a loose object if
    -    "mtime".
    +    Rewrite "write_loose_object()" as follows:
    +
    +     1. Figure out a path for the (temp) object file. This step is only
    +        used in "write_loose_object()".
    +
    +     2. Move common steps for starting to write loose objects into a new
    +        function "start_loose_object_common()".
    +
    +     3. Compress data.
    +
    +     4. Move common steps for ending zlib stream into a new function
    +        "end_loose_object_common()".
    +
    +     5. Close fd and finalize the object file.
     
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
    +    Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
      	return fd;
      }
      
    -+static void setup_stream_and_header(git_zstream *stream,
    -+				    unsigned char *compressed,
    -+				    unsigned long compressed_size,
    -+				    git_hash_ctx *c,
    -+				    char *hdr,
    -+				    int hdrlen)
    ++static int start_loose_object_common(struct strbuf *tmp_file,
    ++				     const char *filename, unsigned flags,
    ++				     git_zstream *stream,
    ++				     unsigned char *buf, size_t buflen,
    ++				     git_hash_ctx *c,
    ++				     enum object_type type, size_t len,
    ++				     char *hdr, int hdrlen)
     +{
    -+	/* Set it up */
    ++	int fd;
    ++
    ++	fd = create_tmpfile(tmp_file, filename, flags);
    ++	if (fd < 0)
    ++		return -1;
    ++
    ++	/*  Setup zlib stream for compression */
     +	git_deflate_init(stream, zlib_compression_level);
    -+	stream->next_out = compressed;
    -+	stream->avail_out = compressed_size;
    ++	stream->next_out = buf;
    ++	stream->avail_out = buflen;
     +	the_hash_algo->init_fn(c);
     +
    -+	/* First header.. */
    ++	/*  Start to feed header to zlib stream */
     +	stream->next_in = (unsigned char *)hdr;
     +	stream->avail_in = hdrlen;
     +	while (git_deflate(stream, 0) == Z_OK)
     +		; /* nothing */
     +	the_hash_algo->update_fn(c, hdr, hdrlen);
    ++
    ++	return fd;
    ++}
    ++
    ++static void end_loose_object_common(int ret, git_hash_ctx *c,
    ++				    git_zstream *stream,
    ++				    struct object_id *parano_oid,
    ++				    const struct object_id *expected_oid,
    ++				    const char *die_msg1_fmt,
    ++				    const char *die_msg2_fmt)
    ++{
    ++	if (ret != Z_STREAM_END)
    ++		die(_(die_msg1_fmt), ret, expected_oid);
    ++	ret = git_deflate_end_gently(stream);
    ++	if (ret != Z_OK)
    ++		die(_(die_msg2_fmt), ret, expected_oid);
    ++	the_hash_algo->final_oid_fn(parano_oid, c);
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -	stream.next_out = compressed;
     -	stream.avail_out = sizeof(compressed);
     -	the_hash_algo->init_fn(&c);
    -+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
    -+	if (fd < 0)
    -+		return -1;
    - 
    +-
     -	/* First header.. */
     -	stream.next_in = (unsigned char *)hdr;
     -	stream.avail_in = hdrlen;
     -	while (git_deflate(&stream, 0) == Z_OK)
     -		; /* nothing */
     -	the_hash_algo->update_fn(&c, hdr, hdrlen);
    -+	/* Set it up and write header */
    -+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
    -+				&c, hdr, hdrlen);
    ++	/* Common steps for write_loose_object and stream_loose_object to
    ++	 * start writing loose object:
    ++	 *
    ++	 *  - Create tmpfile for the loose object.
    ++	 *  - Setup zlib stream for compression.
    ++	 *  - Start to feed header to zlib stream.
    ++	 */
    ++	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
    ++				       &stream, compressed, sizeof(compressed),
    ++				       &c, OBJ_NONE, 0, hdr, hdrlen);
    ++	if (fd < 0)
    ++		return -1;
      
      	/* Then the data itself.. */
      	stream.next_in = (void *)buf;
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    + 		stream.avail_out = sizeof(compressed);
    + 	} while (ret == Z_OK);
    + 
    +-	if (ret != Z_STREAM_END)
    +-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
    +-		    ret);
    +-	ret = git_deflate_end_gently(&stream);
    +-	if (ret != Z_OK)
    +-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
    +-		    ret);
    +-	the_hash_algo->final_oid_fn(&parano_oid, &c);
    ++	/* Common steps for write_loose_object and stream_loose_object to
    ++	 * end writing loose object:
    ++	 *
    ++	 *  - End the compression of zlib stream.
    ++	 *  - Get the calculated oid to "parano_oid".
    ++	 */
    ++	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
    ++				N_("unable to deflate new object %s (%d)"),
    ++				N_("deflateEnd on object %s failed (%d)"));
    ++
    + 	if (!oideq(oid, &parano_oid))
    + 		die(_("confused by unstable object source data for %s"),
    + 		    oid_to_hex(oid));
      
      	close_loose_object(fd);
      
-:  ---------- > 3:  18dd21122d object-file.c: remove the slash for directory_size()
-:  ---------- > 4:  964715451b object-file.c: add "stream_loose_object()" to handle large object
-:  ---------- > 5:  3f620466fe unpack-objects: unpack_non_delta_entry() read data in a stream
2:  0d2e0f3a00 ! 6:  8073a3888d object-file API: add a format_object_header() function
    @@ builtin/index-pack.c: static void *unpack_entry_data(off_t offset, unsigned long
      	if (!is_delta_type(type)) {
     -		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
     -				   type_name(type),(uintmax_t)size) + 1;
    -+		hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)size);
    ++		hdrlen = format_object_header(hdr, sizeof(hdr), type, size);
      		the_hash_algo->init_fn(&c);
      		the_hash_algo->update_fn(&c, hdr, hdrlen);
      	} else
    @@ bulk-checkin.c: static int deflate_to_pack(struct bulk_checkin_state *state,
     -	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
     -			       type_name(type), (uintmax_t)size) + 1;
     +	header_len = format_object_header((char *)obuf, sizeof(obuf),
    -+					 type, (uintmax_t)size);
    ++					 type, size);
      	the_hash_algo->init_fn(&ctx);
      	the_hash_algo->update_fn(&ctx, obuf, header_len);
      
    @@ http-push.c: static void start_put(struct transfer_request *request)
      
      	unpacked = read_object_file(&request->obj->oid, &type, &len);
     -	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
    -+	hdrlen = format_object_header(hdr, sizeof(hdr), type, (uintmax_t)len);
    ++	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
      
      	/* Set it up */
      	git_deflate_init(&stream, zlib_compression_level);
    @@ object-file.c: static void write_object_file_prepare(const struct git_hash_algo
      
      	/* Sha1.. */
      	algo->init_fn(&c);
    +@@ object-file.c: int stream_loose_object(struct input_stream *in_stream, size_t len,
    + 
    + 	/* Since oid is not determined, save tmp file to odb path. */
    + 	strbuf_addf(&filename, "%s/", get_object_directory());
    +-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
    ++	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
    + 
    + 	/* Common steps for write_loose_object and stream_loose_object to
    + 	 * start writing loose object:
     @@ object-file.c: int force_object_loose(const struct object_id *oid, time_t mtime)
      	buf = read_object(the_repository, oid, &type, &len);
      	if (!buf)
4:  1de06a8f5c < -:  ---------- object-file.c: add "write_stream_object_file()" to support read in stream
5:  e7b4e426ef < -:  ---------- unpack-objects: unpack_non_delta_entry() read data in a stream
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode
  2021-12-17 11:26       ` Han Xin
                           ` (6 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08 12:28           ` René Scharfe
  2022-01-08  8:54         ` [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
                           ` (4 subsequent siblings)
  12 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a buffer
of the given size. Allocating memory for a large blob object may cause
the system to run out of memory. Before replacing the calls of
"get_data()" used to unpack large blob objects in later commits,
refactor "get_data()" to reduce its memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as the zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 39 ++++++++++++++++++-------
 t/t5329-unpack-large-objects.sh | 52 +++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 11 deletions(-)
 create mode 100755 t/t5329-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..c6d6c17072 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,31 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress a zstream from stdin and return the specified size of data.
+ * The caller is responsible for freeing the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer, which is reused to hold temporary zstream output,
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +140,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
new file mode 100755
index 0000000000..39c7a62d94
--- /dev/null
+++ b/t/t5329-unpack-large-objects.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+assert_no_loose () {
+	glob=dest.git/objects/?? &&
+	echo "$glob" >expect &&
+	eval "echo $glob" >actual &&
+	test_cmp expect actual
+}
+
+assert_no_pack () {
+	rmdir dest.git/objects/pack
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs test)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	assert_no_loose &&
+	assert_no_pack
+'
+
+test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps
  2021-12-17 11:26       ` Han Xin
                           ` (7 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08 12:28           ` René Scharfe
  2022-01-08  8:54         ` [PATCH v8 3/6] object-file.c: remove the slash for directory_size() Han Xin
                           ` (3 subsequent siblings)
  12 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a stream
version function ("stream_loose_object()") in a later commit to resolve
this issue.

Before introducing a stream version function for writing loose objects,
do some refactoring on "write_loose_object()" to reuse code for both
versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 149 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 105 insertions(+), 44 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb1426f98c..5d163081b1 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1743,6 +1743,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	algo->final_oid_fn(oid, &c);
 }
 
+/*
+ * Move the just written object with proper mtime into its final resting place.
+ */
+static int finalize_object_file_with_mtime(const char *tmpfile,
+					   const char *filename,
+					   time_t mtime,
+					   unsigned flags)
+{
+	struct utimbuf utb;
+
+	if (mtime) {
+		utb.actime = mtime;
+		utb.modtime = mtime;
+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
+			warning_errno(_("failed utime() on %s"), tmpfile);
+	}
+	return finalize_object_file(tmpfile, filename);
+}
+
 /*
  * Move the just written object into its final resting place.
  */
@@ -1828,7 +1847,8 @@ static inline int directory_size(const char *filename)
  * We want to avoid cross-directory filename renames, because those
  * can have problems on various filesystems (FAT, NFS, Coda).
  */
-static int create_tmpfile(struct strbuf *tmp, const char *filename)
+static int create_tmpfile(struct strbuf *tmp, const char *filename,
+			  unsigned flags)
 {
 	int fd, dirlen = directory_size(filename);
 
@@ -1836,7 +1856,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	strbuf_add(tmp, filename, dirlen);
 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
-	if (fd < 0 && dirlen && errno == ENOENT) {
+	do {
+		if (fd >= 0 || !dirlen || errno != ENOENT)
+			break;
 		/*
 		 * Make sure the directory exists; note that the contents
 		 * of the buffer are undefined after mkstemp returns an
@@ -1846,17 +1868,72 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 		strbuf_reset(tmp);
 		strbuf_add(tmp, filename, dirlen - 1);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
-			return -1;
+			break;
 		if (adjust_shared_perm(tmp->buf))
-			return -1;
+			break;
 
 		/* Try again */
 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 		fd = git_mkstemp_mode(tmp->buf, 0444);
+	} while (0);
+
+	if (fd < 0 && !(flags & HASH_SILENT)) {
+		if (errno == EACCES)
+			return error(_("insufficient permission for adding an "
+				       "object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(_("unable to create temporary file"));
 	}
+
 	return fd;
 }
 
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     enum object_type type, size_t len,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename, flags);
+	if (fd < 0)
+		return -1;
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+static void end_loose_object_common(int ret, git_hash_ctx *c,
+				    git_zstream *stream,
+				    struct object_id *parano_oid,
+				    const struct object_id *expected_oid,
+				    const char *die_msg1_fmt,
+				    const char *die_msg2_fmt)
+{
+	if (ret != Z_STREAM_END)
+		die(_(die_msg1_fmt), ret, expected_oid);
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		die(_(die_msg2_fmt), ret, expected_oid);
+	the_hash_algo->final_oid_fn(parano_oid, c);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1871,28 +1948,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose object:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, OBJ_NONE, 0, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1907,30 +1974,24 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-		    ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid to "parano_oid".
+	 */
+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
+				N_("unable to deflate new object %s (%d)"),
+				N_("deflateEnd on object %s failed (%d)"));
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
 	close_loose_object(fd);
 
-	if (mtime) {
-		struct utimbuf utb;
-		utb.actime = mtime;
-		utb.modtime = mtime;
-		if (utime(tmp_file.buf, &utb) < 0 &&
-		    !(flags & HASH_SILENT))
-			warning_errno(_("failed utime() on %s"), tmp_file.buf);
-	}
-
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
+					       mtime, flags);
 }
 
 static int freshen_loose_object(const struct object_id *oid)
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v8 3/6] object-file.c: remove the slash for directory_size()
  2021-12-17 11:26       ` Han Xin
                           ` (8 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08 17:24           ` René Scharfe
  2022-01-08  8:54         ` [PATCH v8 4/6] object-file.c: add "stream_loose_object()" to handle large object Han Xin
                           ` (2 subsequent siblings)
  12 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Since "mkdir foo/" works as well as "mkdir foo", let's remove the
trailing slash from the size returned by directory_size(), as most of
its callers want.

Suggested-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index 5d163081b1..4f0127e823 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1831,13 +1831,13 @@ static void close_loose_object(int fd)
 		die_errno(_("error when closing loose object file"));
 }
 
-/* Size of directory component, including the ending '/' */
+/* Size of directory component, excluding the ending '/' */
 static inline int directory_size(const char *filename)
 {
 	const char *s = strrchr(filename, '/');
 	if (!s)
 		return 0;
-	return s - filename + 1;
+	return s - filename;
 }
 
 /*
@@ -1854,7 +1854,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
 
 	strbuf_reset(tmp);
 	strbuf_add(tmp, filename, dirlen);
-	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
+	strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
 	do {
 		if (fd >= 0 || !dirlen || errno != ENOENT)
@@ -1866,7 +1866,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
 		 * scratch.
 		 */
 		strbuf_reset(tmp);
-		strbuf_add(tmp, filename, dirlen - 1);
+		strbuf_add(tmp, filename, dirlen);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
 			break;
 		if (adjust_shared_perm(tmp->buf))
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v8 4/6] object-file.c: add "stream_loose_object()" to handle large object
  2021-12-17 11:26       ` Han Xin
                           ` (9 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 3/6] object-file.c: remove the slash for directory_size() Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08  8:54         ` [PATCH v8 5/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2022-01-08  8:54         ` [PATCH v8 6/6] object-file API: add a format_object_header() function Han Xin
  12 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using "write_loose_object()",
we have to feed it a buffer the same size as the object, which will
consume lots of memory and may cause OOM. This can be improved by
feeding data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", which is a stream version of
"write_loose_object()" but with a low memory footprint. We will use this
function to unpack large blob objects in a later commit.

Another difference from "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in the ".git/objects/"
directory instead.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   9 +++++
 2 files changed, 110 insertions(+)

diff --git a/object-file.c b/object-file.c
index 4f0127e823..a462a21629 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2012,6 +2012,107 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose object:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, OBJ_BLOB, len, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %lu != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	end_loose_object_common(ret, &c, &stream, oid, NULL,
+				N_("unable to stream deflate new object (%d)"),
+				N_("deflateEnd on stream object failed (%d)"));
+
+	close_loose_object(fd);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..cc41c64d69 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -232,6 +238,9 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v8 5/6] unpack-objects: unpack_non_delta_entry() read data in a stream
  2021-12-17 11:26       ` Han Xin
                           ` (10 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 4/6] object-file.c: add "stream_loose_object()" to handle large object Han Xin
@ 2022-01-08  8:54         ` Han Xin
  2022-01-08  8:54         ` [PATCH v8 6/6] object-file API: add a format_object_header() function Han Xin
  12 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of the input_stream interface, we can
use a small fixed buffer for "unpack_non_delta_entry()". However,
unpacking non-delta objects from a stream instead of from an entire
buffer has about a 10% performance penalty.

    $ hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare
       https://github.com/microsoft/scalar.git;
       cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      ...

    Summary
      './git -C dest.git -c core.bigFileThreshold=512m
      unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~1'

Therefore, only unpack objects larger than the "core.bigFileThreshold"
in a zstream. Until now, the config variable has been used in the
following cases, and our new case belongs to the packfile category.

 * Archive:

   + archive.c: write_entry(): write large blob entries to archive in
     stream.

 * Loose objects:

   + object-file.c: index_fd(): when hashing large files in worktree,
     read files in a stream, and create one packfile per large blob if
     want to save files to git object store.

   + object-file.c: read_loose_object(): when checking loose objects
     using "git-fsck", do not read full content of large loose objects.

 * Packfile:

   + fast-import.c: parse_and_store_blob(): streaming large blob from
     foreign source to packfile.

   + index-pack.c: check_collison(): read and check large blob in stream.

   + index-pack.c: unpack_entry_data(): do not return the entire
     contents of the big blob from packfile, but uses a fixed buf to
     perform some integrity checks on the object.

   + pack-check.c: verify_packfile(): used by "git-fsck" and will call
     check_object_signature() to check large blob in pack with the
     streaming interface.

   + pack-objects.c: get_object_details(): set "no_try_delta" for large
     blobs when counting objects.

   + pack-objects.c: write_no_reuse_object(): streaming large blob to
     pack.

   + unpack-objects.c: unpack_non_delta_entry(): unpack large blob in
     stream from packfile.

 * Others:

   + diff.c: diff_populate_filespec(): treat large blob file as binary.

   + streaming.c: istream_source(): as a helper of "open_istream()" to
     select the proper streaming interface for reading a large blob
     from a packfile.
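
For example (a hypothetical setup, not part of this patch), a receiver
that wants any blob larger than 100 MB to be streamed to a loose object
instead of being buffered in memory could run:

    $ git config core.bigFileThreshold 100m
    $ git unpack-objects <big.pack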

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 71 ++++++++++++++++++++++++++++++++-
 t/t5329-unpack-large-objects.sh | 23 +++++++++--
 2 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index c6d6c17072..e9ec2b349d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -343,11 +343,80 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (buf)
 		write_object(nr, type, buf, size);
 }
diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
index 39c7a62d94..6f3bfb3df7 100755
--- a/t/t5329-unpack-large-objects.sh
+++ b/t/t5329-unpack-large-objects.sh
@@ -9,7 +9,11 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 assert_no_loose () {
@@ -37,16 +41,29 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
 	assert_no_loose &&
 	assert_no_pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	assert_no_pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	assert_no_loose
+'
+
 test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v8 6/6] object-file API: add a format_object_header() function
  2021-12-17 11:26       ` Han Xin
                           ` (11 preceding siblings ...)
  2022-01-08  8:54         ` [PATCH v8 5/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2022-01-08  8:54         ` Han Xin
  12 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-08  8:54 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe
  Cc: Han Xin

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Add a convenience function to wrap the xsnprintf() call that generates
loose object headers. This code was copy/pasted in various parts of
the codebase; let's define it in one place and re-use it from there.

All but one caller had a valid "enum object_type" for us; it's only
write_object_file_prepare() which might need to deal with "git
hash-object --literally" and a potential garbage type. Let's have the
primary API take an "enum object_type", and define an *_extended()
function that can take an arbitrary "const char *" for the type.

See [1] for the discussion that prompted this patch, i.e. new code in
object-file.c that wanted to copy/paste the xsnprintf() invocation.

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
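
For illustration, a typical converted call site then boils down to the
following (mirroring the index-pack.c hunk below; "type", "c" and
"size" are the caller's locals):

    char hdr[32];
    int hdrlen;

    hdrlen = format_object_header(hdr, sizeof(hdr), type, size);
    the_hash_algo->init_fn(&c);
    the_hash_algo->update_fn(&c, hdr, hdrlen);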

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/index-pack.c |  3 +--
 bulk-checkin.c       |  4 ++--
 cache.h              | 21 +++++++++++++++++++++
 http-push.c          |  2 +-
 object-file.c        | 16 ++++++++++++----
 5 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c23d01de7d..8a6ce77940 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
 	int hdrlen;
 
 	if (!is_delta_type(type)) {
-		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
-				   type_name(type),(uintmax_t)size) + 1;
+		hdrlen = format_object_header(hdr, sizeof(hdr), type, size);
 		the_hash_algo->init_fn(&c);
 		the_hash_algo->update_fn(&c, hdr, hdrlen);
 	} else
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 8785b2ac80..9e685f0f1a 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
-			       type_name(type), (uintmax_t)size) + 1;
+	header_len = format_object_header((char *)obuf, sizeof(obuf),
+					 type, size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 
diff --git a/cache.h b/cache.h
index cfba463aa9..64071a8d80 100644
--- a/cache.h
+++ b/cache.h
@@ -1310,6 +1310,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 						    unsigned long bufsiz,
 						    struct strbuf *hdrbuf);
 
+/**
+ * format_object_header() is a thin wrapper around xsnprintf() that
+ * writes the initial "<type> <obj-len>" part of the loose object
+ * header. It returns the size that snprintf() returns + 1.
+ *
+ * The format_object_header_extended() function allows for writing a
+ * type_name that's not one of the "enum object_type" types. This is
+ * used for "git hash-object --literally". Pass in a OBJ_NONE as the
+ * type, and a non-NULL "type_str" to do that.
+ *
+ * format_object_header() is a convenience wrapper for
+ * format_object_header_extended().
+ */
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *type_str, size_t objsize);
+static inline int format_object_header(char *str, size_t size,
+				      enum object_type type, size_t objsize)
+{
+	return format_object_header_extended(str, size, type, NULL, objsize);
+}
+
 /**
  * parse_loose_header() parses the starting "<type> <len>\0" of an
  * object. If it doesn't follow that format -1 is returned. To check
diff --git a/http-push.c b/http-push.c
index 3309aaf004..f0c044dcf7 100644
--- a/http-push.c
+++ b/http-push.c
@@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
 	git_zstream stream;
 
 	unpacked = read_object_file(&request->obj->oid, &type, &len);
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 
 	/* Set it up */
 	git_deflate_init(&stream, zlib_compression_level);
diff --git a/object-file.c b/object-file.c
index a462a21629..d384ef2952 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *typestr, size_t objsize)
+{
+	const char *s = type == OBJ_NONE ? typestr : type_name(type);
+
+	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = format_object_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2027,7 +2035,7 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
 
 	/* Since oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
 
 	/* Common steps for write_loose_object and stream_loose_object to
 	 * start writing loose oject:
@@ -2168,7 +2176,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-01-08  8:54         ` [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
@ 2022-01-08 12:28           ` René Scharfe
  2022-01-11 10:41             ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: René Scharfe @ 2022-01-08 12:28 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

On 08.01.22 at 09:54, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> As the name implies, "get_data(size)" will allocate and return a given
> size of memory. Allocating memory for a large blob object may cause the
> system to run out of memory. Before preparing to replace calling of
> "get_data()" to unpack large blob objects in latter commits, refactor
> "get_data()" to reduce memory footprint for dry_run mode.
>
> Because in dry_run mode, "get_data()" is only used to check the
> integrity of data, and the returned buffer is not used at all, we can
> allocate a smaller buffer and reuse it as zstream output. Therefore,
> in dry_run mode, "get_data()" will release the allocated buffer and
> return NULL instead of returning garbage data.
>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c        | 39 ++++++++++++++++++-------
>  t/t5329-unpack-large-objects.sh | 52 +++++++++++++++++++++++++++++++++
>  2 files changed, 80 insertions(+), 11 deletions(-)
>  create mode 100755 t/t5329-unpack-large-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..c6d6c17072 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,31 @@ static void use(int bytes)
>  	display_throughput(progress, consumed_bytes);
>  }
>
> +/*
> + * Decompress zstream from stdin and return specific size of data.
> + * The caller is responsible to free the returned buffer.
> + *
> + * But for dry_run mode, "get_data()" is only used to check the
> + * integrity of data, and the returned buffer is not used at all.
> + * Therefore, in dry_run mode, "get_data()" will release the small
> + * allocated buffer which is reused to hold temporary zstream output
> + * and return NULL instead of returning garbage data.
> + */
>  static void *get_data(unsigned long size)
>  {
>  	git_zstream stream;
> -	void *buf = xmallocz(size);
> +	unsigned long bufsize;
> +	void *buf;
>
>  	memset(&stream, 0, sizeof(stream));
> +	if (dry_run && size > 8192)
> +		bufsize = 8192;
> +	else
> +		bufsize = size;
> +	buf = xmallocz(bufsize);
>
>  	stream.next_out = buf;
> -	stream.avail_out = size;
> +	stream.avail_out = bufsize;
>  	stream.next_in = fill(1);
>  	stream.avail_in = len;
>  	git_inflate_init(&stream);
> @@ -124,8 +140,15 @@ static void *get_data(unsigned long size)
>  		}
>  		stream.next_in = fill(1);
>  		stream.avail_in = len;
> +		if (dry_run) {
> +			/* reuse the buffer in dry_run mode */
> +			stream.next_out = buf;
> +			stream.avail_out = bufsize;
> +		}
>  	}
>  	git_inflate_end(&stream);
> +	if (dry_run)
> +		FREE_AND_NULL(buf);
>  	return buf;
>  }
>
> @@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>  {
>  	void *buf = get_data(size);
>
> -	if (!dry_run && buf)
> +	if (buf)
>  		write_object(nr, type, buf, size);
> -	else
> -		free(buf);
>  }
>
>  static int resolve_against_held(unsigned nr, const struct object_id *base,
> @@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  		oidread(&base_oid, fill(the_hash_algo->rawsz));
>  		use(the_hash_algo->rawsz);
>  		delta_data = get_data(delta_size);
> -		if (dry_run || !delta_data) {
> -			free(delta_data);
> +		if (!delta_data)
>  			return;
> -		}
>  		if (has_object_file(&base_oid))
>  			; /* Ok we have this one */
>  		else if (resolve_against_held(nr, &base_oid,
> @@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  			die("offset value out of bound for delta base object");
>
>  		delta_data = get_data(delta_size);
> -		if (dry_run || !delta_data) {
> -			free(delta_data);
> +		if (!delta_data)
>  			return;
> -		}
>  		lo = 0;
>  		hi = nr;
>  		while (lo < hi) {

Nice!

> diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
> new file mode 100755
> index 0000000000..39c7a62d94
> --- /dev/null
> +++ b/t/t5329-unpack-large-objects.sh
> @@ -0,0 +1,52 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='git unpack-objects with large objects'
> +
> +. ./test-lib.sh
> +
> +prepare_dest () {
> +	test_when_finished "rm -rf dest.git" &&
> +	git init --bare dest.git
> +}
> +
> +assert_no_loose () {
> +	glob=dest.git/objects/?? &&
> +	echo "$glob" >expect &&
> +	eval "echo $glob" >actual &&
> +	test_cmp expect actual
> +}
> +
> +assert_no_pack () {
> +	rmdir dest.git/objects/pack

I would expect a function whose name starts with "assert" to have no
side effects.  It doesn't matter here, because it's called only at the
very end, but that might change.  You can use test_dir_is_empty instead
of rmdir.
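
For example (test_dir_is_empty is the existing helper from
test-lib-functions.sh):

	assert_no_pack () {
		test_dir_is_empty dest.git/objects/pack
	}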

> +}
> +
> +test_expect_success "create large objects (1.5 MB) and PACK" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	PACK=$(echo HEAD | git pack-objects --revs test)
> +'
> +
> +test_expect_success 'set memory limitation to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'unpack-objects failed under memory limitation' '
> +	prepare_dest &&
> +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> +	grep "fatal: attempting to allocate" err
> +'
> +
> +test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
> +	prepare_dest &&
> +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
> +	assert_no_loose &&
> +	assert_no_pack
> +'
> +
> +test_done

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps
  2022-01-08  8:54         ` [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
@ 2022-01-08 12:28           ` René Scharfe
  2022-01-11 10:33             ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: René Scharfe @ 2022-01-08 12:28 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

On 08.01.22 at 09:54, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When writing a large blob using "write_loose_object()", we have to pass
> a buffer with the whole content of the blob, and this behavior will
> consume lots of memory and may cause OOM. We will introduce a stream
> version function ("stream_loose_object()") in a later commit to resolve
> this issue.
>
> Before introducing a stream version function for writing loose objects,
> do some refactoring on "write_loose_object()" to reuse code for both
> versions.
>
> Rewrite "write_loose_object()" as follows:
>
>  1. Figure out a path for the (temp) object file. This step is only
>     used in "write_loose_object()".
>
>  2. Move common steps for starting to write loose objects into a new
>     function "start_loose_object_common()".
>
>  3. Compress data.
>
>  4. Move common steps for ending zlib stream into a new function
>     "end_loose_object_common()".
>
>  5. Close fd and finalize the object file.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 149 +++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 105 insertions(+), 44 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb1426f98c..5d163081b1 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1743,6 +1743,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
>  	algo->final_oid_fn(oid, &c);
>  }
>
> +/*
> + * Move the just written object with proper mtime into its final resting place.
> + */
> +static int finalize_object_file_with_mtime(const char *tmpfile,
> +					   const char *filename,
> +					   time_t mtime,
> +					   unsigned flags)

This function is called only once after your series.  Should it be used by
stream_loose_object()?  Probably not -- the latter doesn't have a way to
force a certain modification time and its caller doesn't need one.  So
creating finalize_object_file_with_mtime() seems unnecessary for this
series.

> +{
> +	struct utimbuf utb;
> +
> +	if (mtime) {
> +		utb.actime = mtime;
> +		utb.modtime = mtime;
> +		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
> +			warning_errno(_("failed utime() on %s"), tmpfile);
> +	}
> +	return finalize_object_file(tmpfile, filename);
> +}
> +
>  /*
>   * Move the just written object into its final resting place.
>   */
> @@ -1828,7 +1847,8 @@ static inline int directory_size(const char *filename)
>   * We want to avoid cross-directory filename renames, because those
>   * can have problems on various filesystems (FAT, NFS, Coda).
>   */
> -static int create_tmpfile(struct strbuf *tmp, const char *filename)
> +static int create_tmpfile(struct strbuf *tmp, const char *filename,
> +			  unsigned flags)

create_tmpfile() is not mentioned in the commit message, yet it's
changed here.  Hrm.

>  {
>  	int fd, dirlen = directory_size(filename);
>
> @@ -1836,7 +1856,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	strbuf_add(tmp, filename, dirlen);
>  	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
>  	fd = git_mkstemp_mode(tmp->buf, 0444);
> -	if (fd < 0 && dirlen && errno == ENOENT) {
> +	do {
> +		if (fd >= 0 || !dirlen || errno != ENOENT)
> +			break;

Why turn this branch into a loop?  Is this done to mkdir multiple
components, e.g. with filename being "a/b/c/file" to create "a", "a/b",
and "a/b/c"?  It's only used for loose objects, so a fan-out directory
(e.g. ".git/objects/ff") can certainly be missing, but can their parent
be missing as well sometimes?  If that's the point then such a fix
would be worth its own patch.  (Which probably would benefit from using
safe_create_leading_directories()).
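
A sketch of that hypothetical deep-creation variant, should it ever be
needed (shared-permission handling omitted for brevity):

	fd = git_mkstemp_mode(tmp->buf, 0444);
	if (fd < 0 && dirlen && errno == ENOENT) {
		/* mkstemp may have clobbered the buffer; rebuild it */
		strbuf_reset(tmp);
		strbuf_add(tmp, filename, dirlen);
		strbuf_addstr(tmp, "tmp_obj_XXXXXX");
		/* creates every missing directory component of tmp->buf */
		if (safe_create_leading_directories(tmp->buf) != SCLD_OK)
			return -1;
		fd = git_mkstemp_mode(tmp->buf, 0444);
	}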

>  		/*
>  		 * Make sure the directory exists; note that the contents
>  		 * of the buffer are undefined after mkstemp returns an
> @@ -1846,17 +1868,72 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  		strbuf_reset(tmp);
>  		strbuf_add(tmp, filename, dirlen - 1);
>  		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> -			return -1;
> +			break;
>  		if (adjust_shared_perm(tmp->buf))
> -			return -1;
> +			break;

Or is it just to replace these returns with a jump to the new error
reporting section?

>
>  		/* Try again */
>  		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
>  		fd = git_mkstemp_mode(tmp->buf, 0444);

In that case a break would be missing here.

> +	} while (0);
> +
> +	if (fd < 0 && !(flags & HASH_SILENT)) {
> +		if (errno == EACCES)
> +			return error(_("insufficient permission for adding an "
> +				       "object to repository database %s"),
> +				     get_object_directory());
> +		else
> +			return error_errno(_("unable to create temporary file"));
>  	}

Why move this error reporting code into create_tmpfile()?  This function
has a single caller both before and after your series, so the code could
just as well stay at its call-site, avoiding the need to add the flags
parameter.

> +
>  	return fd;
>  }
>
> +static int start_loose_object_common(struct strbuf *tmp_file,
> +				     const char *filename, unsigned flags,
> +				     git_zstream *stream,
> +				     unsigned char *buf, size_t buflen,
> +				     git_hash_ctx *c,
> +				     enum object_type type, size_t len,

The parameters type and len are not used by this function and thus can
be dropped.

> +				     char *hdr, int hdrlen)
> +{
> +	int fd;
> +
> +	fd = create_tmpfile(tmp_file, filename, flags);
> +	if (fd < 0)
> +		return -1;
> +
> +	/*  Setup zlib stream for compression */
> +	git_deflate_init(stream, zlib_compression_level);
> +	stream->next_out = buf;
> +	stream->avail_out = buflen;
> +	the_hash_algo->init_fn(c);
> +
> +	/*  Start to feed header to zlib stream */
> +	stream->next_in = (unsigned char *)hdr;
> +	stream->avail_in = hdrlen;
> +	while (git_deflate(stream, 0) == Z_OK)
> +		; /* nothing */
> +	the_hash_algo->update_fn(c, hdr, hdrlen);
> +
> +	return fd;
> +}
> +
> +static void end_loose_object_common(int ret, git_hash_ctx *c,
> +				    git_zstream *stream,
> +				    struct object_id *parano_oid,
> +				    const struct object_id *expected_oid,
> +				    const char *die_msg1_fmt,
> +				    const char *die_msg2_fmt)

Hmm, the signature needs as many lines as the function body.

> +{
> +	if (ret != Z_STREAM_END)
> +		die(_(die_msg1_fmt), ret, expected_oid);
> +	ret = git_deflate_end_gently(stream);
> +	if (ret != Z_OK)
> +		die(_(die_msg2_fmt), ret, expected_oid);

These format strings cannot be checked by the compiler.

Considering those two together I think I'd either unify the error
messages and move their strings here (losing the ability for users
to see if streaming was used) or not extract the function and
duplicate its few shared lines.  Just a feeling, though.
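
To make the first option concrete, a rough sketch with the strings
moved into end_loose_object_common() (as a side effect, literal
formats would also let the compiler flag that the hunk above passes
"ret" and "expected_oid" in the opposite order of what "%s (%d)"
expects, and without oid_to_hex()):

	if (ret != Z_STREAM_END)
		die(_("unable to deflate new object %s (%d)"),
		    oid_to_hex(expected_oid), ret);
	ret = git_deflate_end_gently(stream);
	if (ret != Z_OK)
		die(_("deflateEnd on object %s failed (%d)"),
		    oid_to_hex(expected_oid), ret);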

> +	the_hash_algo->final_oid_fn(parano_oid, c);
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>  			      int hdrlen, const void *buf, unsigned long len,
>  			      time_t mtime, unsigned flags)
> @@ -1871,28 +1948,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>  	loose_object_path(the_repository, &filename, oid);
>
> -	fd = create_tmpfile(&tmp_file, filename.buf);
> -	if (fd < 0) {
> -		if (flags & HASH_SILENT)
> -			return -1;
> -		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> -		else
> -			return error_errno(_("unable to create temporary file"));
> -	}
> -
> -	/* Set it up */
> -	git_deflate_init(&stream, zlib_compression_level);
> -	stream.next_out = compressed;
> -	stream.avail_out = sizeof(compressed);
> -	the_hash_algo->init_fn(&c);
> -
> -	/* First header.. */
> -	stream.next_in = (unsigned char *)hdr;
> -	stream.avail_in = hdrlen;
> -	while (git_deflate(&stream, 0) == Z_OK)
> -		; /* nothing */
> -	the_hash_algo->update_fn(&c, hdr, hdrlen);
> +	/* Common steps for write_loose_object and stream_loose_object to
> +	 * start writing loose oject:
> +	 *
> +	 *  - Create tmpfile for the loose object.
> +	 *  - Setup zlib stream for compression.
> +	 *  - Start to feed header to zlib stream.
> +	 */
> +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
> +				       &stream, compressed, sizeof(compressed),
> +				       &c, OBJ_NONE, 0, hdr, hdrlen);
> +	if (fd < 0)
> +		return -1;
>
>  	/* Then the data itself.. */
>  	stream.next_in = (void *)buf;
> @@ -1907,30 +1974,24 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		stream.avail_out = sizeof(compressed);
>  	} while (ret == Z_OK);
>
> -	if (ret != Z_STREAM_END)
> -		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> -		    ret);
> -	ret = git_deflate_end_gently(&stream);
> -	if (ret != Z_OK)
> -		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> -		    ret);
> -	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +	/* Common steps for write_loose_object and stream_loose_object to
> +	 * end writing loose oject:
> +	 *
> +	 *  - End the compression of zlib stream.
> +	 *  - Get the calculated oid to "parano_oid".
> +	 */
> +	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
> +				N_("unable to deflate new object %s (%d)"),
> +				N_("deflateEnd on object %s failed (%d)"));
> +
>  	if (!oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>
>  	close_loose_object(fd);
>
> -	if (mtime) {
> -		struct utimbuf utb;
> -		utb.actime = mtime;
> -		utb.modtime = mtime;
> -		if (utime(tmp_file.buf, &utb) < 0 &&
> -		    !(flags & HASH_SILENT))
> -			warning_errno(_("failed utime() on %s"), tmp_file.buf);
> -	}
> -
> -	return finalize_object_file(tmp_file.buf, filename.buf);
> +	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
> +					       mtime, flags);
>  }
>
>  static int freshen_loose_object(const struct object_id *oid)

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v8 3/6] object-file.c: remove the slash for directory_size()
  2022-01-08  8:54         ` [PATCH v8 3/6] object-file.c: remove the slash for directory_size() Han Xin
@ 2022-01-08 17:24           ` René Scharfe
  2022-01-11 10:14             ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: René Scharfe @ 2022-01-08 17:24 UTC (permalink / raw)
  To: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Ævar Arnfjörð Bjarmason,
	Derrick Stolee
  Cc: Han Xin

On 08.01.22 at 09:54, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Since "mkdir foo/" works as well as "mkdir foo", let's remove the end
> slash as many users of it want.
>
> Suggested-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 5d163081b1..4f0127e823 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1831,13 +1831,13 @@ static void close_loose_object(int fd)
>  		die_errno(_("error when closing loose object file"));
>  }
>
> -/* Size of directory component, including the ending '/' */
> +/* Size of directory component, excluding the ending '/' */
>  static inline int directory_size(const char *filename)
>  {
>  	const char *s = strrchr(filename, '/');
>  	if (!s)
>  		return 0;
> -	return s - filename + 1;
> +	return s - filename;

This will return zero both for "filename" and "/filename".  Hmm.  Since
it's only used for loose object files we can assume that at least one
slash is present, so this removal of functionality is not actually a
problem.  But I don't understand its benefit.
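
To spell it out, with this patch applied:

	directory_size("tmp_obj_XXXXXX")    == 0  /* no slash; 0 before too */
	directory_size("/tmp_obj_XXXXXX")   == 0  /* was 1 before           */
	directory_size("ff/tmp_obj_XXXXXX") == 2  /* was 3 before           */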

>  }
>
>  /*
> @@ -1854,7 +1854,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
>
>  	strbuf_reset(tmp);
>  	strbuf_add(tmp, filename, dirlen);
> -	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
> +	strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
>  	fd = git_mkstemp_mode(tmp->buf, 0444);
>  	do {
>  		if (fd >= 0 || !dirlen || errno != ENOENT)
> @@ -1866,7 +1866,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
>  		 * scratch.
>  		 */
>  		strbuf_reset(tmp);
> -		strbuf_add(tmp, filename, dirlen - 1);
> +		strbuf_add(tmp, filename, dirlen);
>  		if (mkdir(tmp->buf, 0777) && errno != EEXIST)

This code makes sure that mkdir(2) is called without the trailing slash,
both with or without this patch.  From the commit message above I
somehow expected a change in this regard -- but again I wouldn't
understand its benefit.

Is this change really needed?  Is streaming unpack not possible with the
original directory_size() function?

>  			break;
>  		if (adjust_shared_perm(tmp->buf))

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v8 3/6] object-file.c: remove the slash for directory_size()
  2022-01-08 17:24           ` René Scharfe
@ 2022-01-11 10:14             ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-11 10:14 UTC (permalink / raw)
  To: René Scharfe
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee, Han Xin

On Sun, Jan 9, 2022 at 1:24 AM René Scharfe <l.s.r@web.de> wrote:
>
> On 08.01.22 at 09:54, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > Since "mkdir foo/" works as well as "mkdir foo", let's remove the end
> > slash as many users of it want.
> >
> > Suggested-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 8 ++++----
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 5d163081b1..4f0127e823 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1831,13 +1831,13 @@ static void close_loose_object(int fd)
> >               die_errno(_("error when closing loose object file"));
> >  }
> >
> > -/* Size of directory component, including the ending '/' */
> > +/* Size of directory component, excluding the ending '/' */
> >  static inline int directory_size(const char *filename)
> >  {
> >       const char *s = strrchr(filename, '/');
> >       if (!s)
> >               return 0;
> > -     return s - filename + 1;
> > +     return s - filename;
>
> This will return zero both for "filename" and "/filename".  Hmm.  Since
> it's only used for loose object files we can assume that at least one
> slash is present, so this removal of functionality is not actually a
> problem.  But I don't understand its benefit.
>
> >  }
> >
> >  /*
> > @@ -1854,7 +1854,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
> >
> >       strbuf_reset(tmp);
> >       strbuf_add(tmp, filename, dirlen);
> > -     strbuf_addstr(tmp, "tmp_obj_XXXXXX");
> > +     strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
> >       fd = git_mkstemp_mode(tmp->buf, 0444);
> >       do {
> >               if (fd >= 0 || !dirlen || errno != ENOENT)
> > @@ -1866,7 +1866,7 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename,
> >                * scratch.
> >                */
> >               strbuf_reset(tmp);
> > -             strbuf_add(tmp, filename, dirlen - 1);
> > +             strbuf_add(tmp, filename, dirlen);
> >               if (mkdir(tmp->buf, 0777) && errno != EEXIST)
>
> This code makes sure that mkdir(2) is called without the trailing slash,
> both with or without this patch.  From the commit message above I
> somehow expected a change in this regard -- but again I wouldn't
> understand its benefit.
>
> Is this change really needed?  Is streaming unpack not possible with the
> original directory_size() function?
>

*nod*
Streaming unpacking still works with the original directory_size().

This patch is more of a code cleanup that removes the extra handling
where the directory size is first increased and then decreased. I'll
seriously consider whether I should remove this patch or move it to
the end of the series.

Thanks
-Han Xin

> >                       break;
> >               if (adjust_shared_perm(tmp->buf))

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps
  2022-01-08 12:28           ` René Scharfe
@ 2022-01-11 10:33             ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-11 10:33 UTC (permalink / raw)
  To: René Scharfe
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee, Han Xin

On Sat, Jan 8, 2022 at 8:28 PM René Scharfe <l.s.r@web.de> wrote:
>
> On 08.01.22 at 09:54, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > When writing a large blob using "write_loose_object()", we have to pass
> > a buffer with the whole content of the blob, and this behavior will
> > consume lots of memory and may cause OOM. We will introduce a stream
> > version function ("stream_loose_object()") in a later commit to resolve
> > this issue.
> >
> > Before introducing a stream version function for writing loose objects,
> > do some refactoring on "write_loose_object()" to reuse code for both
> > versions.
> >
> > Rewrite "write_loose_object()" as follows:
> >
> >  1. Figure out a path for the (temp) object file. This step is only
> >     used in "write_loose_object()".
> >
> >  2. Move common steps for starting to write loose objects into a new
> >     function "start_loose_object_common()".
> >
> >  3. Compress data.
> >
> >  4. Move common steps for ending zlib stream into a new function
> >     "end_loose_object_common()".
> >
> >  5. Close fd and finalize the object file.
> >
> > Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c | 149 +++++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 105 insertions(+), 44 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index eb1426f98c..5d163081b1 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1743,6 +1743,25 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
> >       algo->final_oid_fn(oid, &c);
> >  }
> >
> > +/*
> > + * Move the just written object with proper mtime into its final resting place.
> > + */
> > +static int finalize_object_file_with_mtime(const char *tmpfile,
> > +                                        const char *filename,
> > +                                        time_t mtime,
> > +                                        unsigned flags)
>
> This function is called only once after your series.  Should it be used by
> stream_loose_object()?  Probably not -- the latter doesn't have a way to
> force a certain modification time and its caller doesn't need one.  So
> creating finalize_object_file_with_mtime() seems unnecessary for this
> series.
>

After accepting the suggestion by Ævar Arnfjörð Bjarmason[1] to remove
finalize_object_file_with_mtime() from stream_loose_object(), it seems
to be overkill for write_loose_object() now. I'll put it back into
write_loose_object().

1. https://lore.kernel.org/git/211221.86pmpqq9aj.gmgdl@evledraar.gmail.com/

Thanks
-Han Xin

> > +{
> > +     struct utimbuf utb;
> > +
> > +     if (mtime) {
> > +             utb.actime = mtime;
> > +             utb.modtime = mtime;
> > +             if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
> > +                     warning_errno(_("failed utime() on %s"), tmpfile);
> > +     }
> > +     return finalize_object_file(tmpfile, filename);
> > +}
> > +
> >  /*
> >   * Move the just written object into its final resting place.
> >   */
> > @@ -1828,7 +1847,8 @@ static inline int directory_size(const char *filename)
> >   * We want to avoid cross-directory filename renames, because those
> >   * can have problems on various filesystems (FAT, NFS, Coda).
> >   */
> > -static int create_tmpfile(struct strbuf *tmp, const char *filename)
> > +static int create_tmpfile(struct strbuf *tmp, const char *filename,
> > +                       unsigned flags)
>
> create_tmpfile() is not mentioned in the commit message, yet it's
> changed here.  Hrm.
>
> >  {
> >       int fd, dirlen = directory_size(filename);
> >
> > @@ -1836,7 +1856,9 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       strbuf_add(tmp, filename, dirlen);
> >       strbuf_addstr(tmp, "tmp_obj_XXXXXX");
> >       fd = git_mkstemp_mode(tmp->buf, 0444);
> > -     if (fd < 0 && dirlen && errno == ENOENT) {
> > +     do {
> > +             if (fd >= 0 || !dirlen || errno != ENOENT)
> > +                     break;
>
> Why turn this branch into a loop?  Is this done to mkdir multiple
> components, e.g. with filename being "a/b/c/file" to create "a", "a/b",
> and "a/b/c"?  It's only used for loose objects, so a fan-out directory
> (e.g. ".git/objects/ff") can certainly be missing, but can their parent
> be missing as well sometimes?  If that's the point then such a fix
> would be worth its own patch.  (Which probably would benefit from using
> safe_create_leading_directories()).
>
> >               /*
> >                * Make sure the directory exists; note that the contents
> >                * of the buffer are undefined after mkstemp returns an
> > @@ -1846,17 +1868,72 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >               strbuf_reset(tmp);
> >               strbuf_add(tmp, filename, dirlen - 1);
> >               if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> > -                     return -1;
> > +                     break;
> >               if (adjust_shared_perm(tmp->buf))
> > -                     return -1;
> > +                     break;
>
> Or is it just to replace these returns with a jump to the new error
> reporting section?
>
> >
> >               /* Try again */
> >               strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
> >               fd = git_mkstemp_mode(tmp->buf, 0444);
>
> In that case a break would be missing here.
>
> > +     } while (0);
> > +
> > +     if (fd < 0 && !(flags & HASH_SILENT)) {
> > +             if (errno == EACCES)
> > +                     return error(_("insufficient permission for adding an "
> > +                                    "object to repository database %s"),
> > +                                  get_object_directory());
> > +             else
> > +                     return error_errno(_("unable to create temporary file"));
> >       }
>
> Why move this error reporting code into create_tmpfile()?  This function
> has a single caller both before and after your series, so the code could
> just as well stay at its call-site, avoiding the need to add the flags
> parameter.
>

This is a legacy from v7: now there is no step called "Figure out a
path for the (temp) object file", and this error reporting is only
used in start_loose_object_common(). I will bring it back to what it
was.

Thanks
-Han Xin
> > +
> >       return fd;
> >  }
> >
> > +static int start_loose_object_common(struct strbuf *tmp_file,
> > +                                  const char *filename, unsigned flags,
> > +                                  git_zstream *stream,
> > +                                  unsigned char *buf, size_t buflen,
> > +                                  git_hash_ctx *c,
> > +                                  enum object_type type, size_t len,
>
> The parameters type and len are not used by this function and thus can
> be dropped.
>

*nod*

> > +                                  char *hdr, int hdrlen)
> > +{
> > +     int fd;
> > +
> > +     fd = create_tmpfile(tmp_file, filename, flags);
> > +     if (fd < 0)
> > +             return -1;
> > +
> > +     /*  Setup zlib stream for compression */
> > +     git_deflate_init(stream, zlib_compression_level);
> > +     stream->next_out = buf;
> > +     stream->avail_out = buflen;
> > +     the_hash_algo->init_fn(c);
> > +
> > +     /*  Start to feed header to zlib stream */
> > +     stream->next_in = (unsigned char *)hdr;
> > +     stream->avail_in = hdrlen;
> > +     while (git_deflate(stream, 0) == Z_OK)
> > +             ; /* nothing */
> > +     the_hash_algo->update_fn(c, hdr, hdrlen);
> > +
> > +     return fd;
> > +}
> > +
> > +static void end_loose_object_common(int ret, git_hash_ctx *c,
> > +                                 git_zstream *stream,
> > +                                 struct object_id *parano_oid,
> > +                                 const struct object_id *expected_oid,
> > +                                 const char *die_msg1_fmt,
> > +                                 const char *die_msg2_fmt)
>
> Hmm, the signature needs as many lines as the function body.
>
> > +{
> > +     if (ret != Z_STREAM_END)
> > +             die(_(die_msg1_fmt), ret, expected_oid);
> > +     ret = git_deflate_end_gently(stream);
> > +     if (ret != Z_OK)
> > +             die(_(die_msg2_fmt), ret, expected_oid);
>
> These format strings cannot be checked by the compiler.
>
> Considering those two together I think I'd either unify the error
> messages and move their strings here (losing the ability for users
> to see if streaming was used) or not extract the function and
> duplicate its few shared lines.  Just a feeling, though.
>
> > +     the_hash_algo->final_oid_fn(parano_oid, c);
> > +}
> > +
> >  static int write_loose_object(const struct object_id *oid, char *hdr,
> >                             int hdrlen, const void *buf, unsigned long len,
> >                             time_t mtime, unsigned flags)
> > @@ -1871,28 +1948,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >
> >       loose_object_path(the_repository, &filename, oid);
> >
> > -     fd = create_tmpfile(&tmp_file, filename.buf);
> > -     if (fd < 0) {
> > -             if (flags & HASH_SILENT)
> > -                     return -1;
> > -             else if (errno == EACCES)
> > -                     return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> > -             else
> > -                     return error_errno(_("unable to create temporary file"));
> > -     }
> > -
> > -     /* Set it up */
> > -     git_deflate_init(&stream, zlib_compression_level);
> > -     stream.next_out = compressed;
> > -     stream.avail_out = sizeof(compressed);
> > -     the_hash_algo->init_fn(&c);
> > -
> > -     /* First header.. */
> > -     stream.next_in = (unsigned char *)hdr;
> > -     stream.avail_in = hdrlen;
> > -     while (git_deflate(&stream, 0) == Z_OK)
> > -             ; /* nothing */
> > -     the_hash_algo->update_fn(&c, hdr, hdrlen);
> > +     /* Common steps for write_loose_object and stream_loose_object to
> > +      * start writing loose oject:
> > +      *
> > +      *  - Create tmpfile for the loose object.
> > +      *  - Setup zlib stream for compression.
> > +      *  - Start to feed header to zlib stream.
> > +      */
> > +     fd = start_loose_object_common(&tmp_file, filename.buf, flags,
> > +                                    &stream, compressed, sizeof(compressed),
> > +                                    &c, OBJ_NONE, 0, hdr, hdrlen);
> > +     if (fd < 0)
> > +             return -1;
> >
> >       /* Then the data itself.. */
> >       stream.next_in = (void *)buf;
> > @@ -1907,30 +1974,24 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
> >               stream.avail_out = sizeof(compressed);
> >       } while (ret == Z_OK);
> >
> > -     if (ret != Z_STREAM_END)
> > -             die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> > -                 ret);
> > -     ret = git_deflate_end_gently(&stream);
> > -     if (ret != Z_OK)
> > -             die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> > -                 ret);
> > -     the_hash_algo->final_oid_fn(&parano_oid, &c);
> > +     /* Common steps for write_loose_object and stream_loose_object to
> > +      * end writing loose oject:
> > +      *
> > +      *  - End the compression of zlib stream.
> > +      *  - Get the calculated oid to "parano_oid".
> > +      */
> > +     end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
> > +                             N_("unable to deflate new object %s (%d)"),
> > +                             N_("deflateEnd on object %s failed (%d)"));
> > +
> >       if (!oideq(oid, &parano_oid))
> >               die(_("confused by unstable object source data for %s"),
> >                   oid_to_hex(oid));
> >
> >       close_loose_object(fd);
> >
> > -     if (mtime) {
> > -             struct utimbuf utb;
> > -             utb.actime = mtime;
> > -             utb.modtime = mtime;
> > -             if (utime(tmp_file.buf, &utb) < 0 &&
> > -                 !(flags & HASH_SILENT))
> > -                     warning_errno(_("failed utime() on %s"), tmp_file.buf);
> > -     }
> > -
> > -     return finalize_object_file(tmp_file.buf, filename.buf);
> > +     return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
> > +                                            mtime, flags);
> >  }
> >
> >  static int freshen_loose_object(const struct object_id *oid)

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-01-08 12:28           ` René Scharfe
@ 2022-01-11 10:41             ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-11 10:41 UTC (permalink / raw)
  To: René Scharfe
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee, Han Xin

On Sat, Jan 8, 2022 at 8:28 PM René Scharfe <l.s.r@web.de> wrote:
>
> On 08.01.22 at 09:54, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > +assert_no_loose () {
> > +     glob=dest.git/objects/?? &&
> > +     echo "$glob" >expect &&
> > +     eval "echo $glob" >actual &&
> > +     test_cmp expect actual
> > +}
> > +
> > +assert_no_pack () {
> > +     rmdir dest.git/objects/pack
>
> I would expect a function whose name starts with "assert" to have no
> side effects.  It doesn't matter here, because it's called only at the
> very end, but that might change.  You can use test_dir_is_empty instead
> of rmdir.
>

*nod*
I think it would be better to rename "assert_no_loose()" to "test_no_loose()".
I will remove "assert_no_pack()" and use "test_dir_is_empty()" instead.

Thanks
-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v9 0/5] unpack large blobs in stream
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-02-01 21:24             ` Ævar Arnfjörð Bjarmason
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
  2022-01-20 11:21           ` [PATCH v9 1/5] " Han Xin
                             ` (4 subsequent siblings)
  5 siblings, 2 replies; 211+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Changes since v8:
* Rename "assert_no_loose ()" into "test_no_loose ()" in
  "t5329-unpack-large-objects.sh". Remove "assert_no_pack ()" and use
  "test_dir_is_empty" instead.

* Revert changes to "create_tmpfile()" and error handling is now in
  "start_loose_object_common()".

* Remove "finalize_object_file_with_mtime()" which seems to be an overkill
  for "write_loose_object()" now. 

* Remove the commit "object-file.c: remove the slash for directory_size()",
  it can be in a separate patch if necessary.

Han Xin (4):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: unpack_non_delta_entry() read data in a stream

Ævar Arnfjörð Bjarmason (1):
  object-file API: add a format_object_header() function

 builtin/index-pack.c            |   3 +-
 builtin/unpack-objects.c        | 110 ++++++++++++++--
 bulk-checkin.c                  |   4 +-
 cache.h                         |  21 +++
 http-push.c                     |   2 +-
 object-file.c                   | 220 +++++++++++++++++++++++++++-----
 object-store.h                  |   9 ++
 t/t5328-unpack-large-objects.sh |  65 ++++++++++
 8 files changed, 384 insertions(+), 50 deletions(-)
 create mode 100755 t/t5328-unpack-large-objects.sh

Range-diff against v8:
1:  bd34da5816 ! 1:  6a6c11ba93 unpack-objects: low memory footprint for get_data() in dry_run mode
    @@ builtin/unpack-objects.c: static void unpack_delta_entry(enum object_type type,
      		hi = nr;
      		while (lo < hi) {
     
    - ## t/t5329-unpack-large-objects.sh (new) ##
    + ## t/t5328-unpack-large-objects.sh (new) ##
     @@
     +#!/bin/sh
     +#
    -+# Copyright (c) 2021 Han Xin
    ++# Copyright (c) 2022 Han Xin
     +#
     +
     +test_description='git unpack-objects with large objects'
    @@ t/t5329-unpack-large-objects.sh (new)
     +	git init --bare dest.git
     +}
     +
    -+assert_no_loose () {
    ++test_no_loose () {
     +	glob=dest.git/objects/?? &&
     +	echo "$glob" >expect &&
     +	eval "echo $glob" >actual &&
     +	test_cmp expect actual
     +}
     +
    -+assert_no_pack () {
    -+	rmdir dest.git/objects/pack
    -+}
    -+
     +test_expect_success "create large objects (1.5 MB) and PACK" '
     +	test-tool genrandom foo 1500000 >big-blob &&
     +	test_commit --append foo big-blob &&
    @@ t/t5329-unpack-large-objects.sh (new)
     +test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
     +	prepare_dest &&
     +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    -+	assert_no_loose &&
    -+	assert_no_pack
    ++	test_no_loose &&
    ++	test_dir_is_empty dest.git/objects/pack
     +'
     +
     +test_done
2:  f9a4365a7d ! 2:  bab9e0402f object-file.c: refactor write_loose_object() to several steps
    @@ Commit message
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## object-file.c ##
    -@@ object-file.c: static void write_object_file_prepare(const struct git_hash_algo *algo,
    - 	algo->final_oid_fn(oid, &c);
    - }
    - 
    -+/*
    -+ * Move the just written object with proper mtime into its final resting place.
    -+ */
    -+static int finalize_object_file_with_mtime(const char *tmpfile,
    -+					   const char *filename,
    -+					   time_t mtime,
    -+					   unsigned flags)
    -+{
    -+	struct utimbuf utb;
    -+
    -+	if (mtime) {
    -+		utb.actime = mtime;
    -+		utb.modtime = mtime;
    -+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
    -+			warning_errno(_("failed utime() on %s"), tmpfile);
    -+	}
    -+	return finalize_object_file(tmpfile, filename);
    -+}
    -+
    - /*
    -  * Move the just written object into its final resting place.
    -  */
    -@@ object-file.c: static inline int directory_size(const char *filename)
    -  * We want to avoid cross-directory filename renames, because those
    -  * can have problems on various filesystems (FAT, NFS, Coda).
    -  */
    --static int create_tmpfile(struct strbuf *tmp, const char *filename)
    -+static int create_tmpfile(struct strbuf *tmp, const char *filename,
    -+			  unsigned flags)
    - {
    - 	int fd, dirlen = directory_size(filename);
    - 
    -@@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filename)
    - 	strbuf_add(tmp, filename, dirlen);
    - 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
    - 	fd = git_mkstemp_mode(tmp->buf, 0444);
    --	if (fd < 0 && dirlen && errno == ENOENT) {
    -+	do {
    -+		if (fd >= 0 || !dirlen || errno != ENOENT)
    -+			break;
    - 		/*
    - 		 * Make sure the directory exists; note that the contents
    - 		 * of the buffer are undefined after mkstemp returns an
     @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filename)
    - 		strbuf_reset(tmp);
    - 		strbuf_add(tmp, filename, dirlen - 1);
    - 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
    --			return -1;
    -+			break;
    - 		if (adjust_shared_perm(tmp->buf))
    --			return -1;
    -+			break;
    - 
    - 		/* Try again */
    - 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
    - 		fd = git_mkstemp_mode(tmp->buf, 0444);
    -+	} while (0);
    -+
    -+	if (fd < 0 && !(flags & HASH_SILENT)) {
    -+		if (errno == EACCES)
    -+			return error(_("insufficient permission for adding an "
    -+				       "object to repository database %s"),
    -+				     get_object_directory());
    -+		else
    -+			return error_errno(_("unable to create temporary file"));
    - 	}
    -+
      	return fd;
      }
      
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     +				     git_zstream *stream,
     +				     unsigned char *buf, size_t buflen,
     +				     git_hash_ctx *c,
    -+				     enum object_type type, size_t len,
     +				     char *hdr, int hdrlen)
     +{
     +	int fd;
     +
    -+	fd = create_tmpfile(tmp_file, filename, flags);
    -+	if (fd < 0)
    -+		return -1;
    ++	fd = create_tmpfile(tmp_file, filename);
    ++	if (fd < 0) {
    ++		if (flags & HASH_SILENT)
    ++			return -1;
    ++		else if (errno == EACCES)
    ++			return error(_("insufficient permission for adding "
    ++				       "an object to repository database %s"),
    ++				     get_object_directory());
    ++		else
    ++			return error_errno(
    ++				_("unable to create temporary file"));
    ++	}
     +
     +	/*  Setup zlib stream for compression */
     +	git_deflate_init(stream, zlib_compression_level);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     +	 */
     +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
     +				       &stream, compressed, sizeof(compressed),
    -+				       &c, OBJ_NONE, 0, hdr, hdrlen);
    ++				       &c, hdr, hdrlen);
     +	if (fd < 0)
     +		return -1;
      
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	if (!oideq(oid, &parano_oid))
      		die(_("confused by unstable object source data for %s"),
      		    oid_to_hex(oid));
    - 
    - 	close_loose_object(fd);
    - 
    --	if (mtime) {
    --		struct utimbuf utb;
    --		utb.actime = mtime;
    --		utb.modtime = mtime;
    --		if (utime(tmp_file.buf, &utb) < 0 &&
    --		    !(flags & HASH_SILENT))
    --			warning_errno(_("failed utime() on %s"), tmp_file.buf);
    --	}
    --
    --	return finalize_object_file(tmp_file.buf, filename.buf);
    -+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
    -+					       mtime, flags);
    - }
    - 
    - static int freshen_loose_object(const struct object_id *oid)
3:  18dd21122d < -:  ---------- object-file.c: remove the slash for directory_size()
4:  964715451b ! 3:  dd13614985 object-file.c: add "stream_loose_object()" to handle large object
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	 */
     +	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
     +				       &stream, compressed, sizeof(compressed),
    -+				       &c, OBJ_BLOB, len, hdr, hdrlen);
    ++				       &c, hdr, hdrlen);
     +	if (fd < 0) {
     +		err = -1;
     +		goto cleanup;
5:  3f620466fe ! 4:  cd84e27b08 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      		write_object(nr, type, buf, size);
      }
     
    - ## t/t5329-unpack-large-objects.sh ##
    -@@ t/t5329-unpack-large-objects.sh: test_description='git unpack-objects with large objects'
    + ## t/t5328-unpack-large-objects.sh ##
    +@@ t/t5328-unpack-large-objects.sh: test_description='git unpack-objects with large objects'
      
      prepare_dest () {
      	test_when_finished "rm -rf dest.git" &&
    @@ t/t5329-unpack-large-objects.sh: test_description='git unpack-objects with large
     +	fi
      }
      
    - assert_no_loose () {
    -@@ t/t5329-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1MB' '
    + test_no_loose () {
    +@@ t/t5328-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1MB' '
      '
      
      test_expect_success 'unpack-objects failed under memory limitation' '
    @@ t/t5329-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1
     -	prepare_dest &&
     +	prepare_dest 2m &&
      	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    - 	assert_no_loose &&
    - 	assert_no_pack
    + 	test_no_loose &&
    + 	test_dir_is_empty dest.git/objects/pack
      '
      
     +test_expect_success 'unpack big object in stream' '
     +	prepare_dest 1m &&
     +	git -C dest.git unpack-objects <test-$PACK.pack &&
    -+	assert_no_pack
    ++	test_dir_is_empty dest.git/objects/pack
     +'
     +
     +test_expect_success 'do not unpack existing large objects' '
     +	prepare_dest 1m &&
     +	git -C dest.git index-pack --stdin <test-$PACK.pack &&
     +	git -C dest.git unpack-objects <test-$PACK.pack &&
    -+	assert_no_loose
    ++	test_no_loose
     +'
     +
      test_done
6:  8073a3888d = 5:  59f0ad95c7 object-file API: add a format_object_header() function
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v9 1/5] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 2/5] object-file.c: refactor write_loose_object() to several steps Han Xin
                             ` (3 subsequent siblings)
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
size of memory. Allocating memory for a large blob object may cause the
system to run out of memory. Before preparing to replace calling of
"get_data()" to unpack large blob objects in latter commits, refactor
"get_data()" to reduce memory footprint for dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.
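
To make that concrete, here is a minimal standalone sketch (not part
of the patch) of the dry_run flow: inflating a stream of known size
through a small output window that is reset on every iteration, so
the full object is never held in memory. It uses raw zlib rather than
git's git_zstream wrappers, and fill_input() is a hypothetical
stand-in for the fill()/use() input helpers (assumed, like fill(1),
to always return at least one byte):

    #include <zlib.h>

    /* Hypothetical input helper standing in for fill()/use(). */
    extern unsigned char *fill_input(unsigned int *avail);

    static int check_zstream(unsigned long size)
    {
            unsigned char out[8192]; /* small window, reused each round */
            z_stream zs = { 0 };
            int ret;

            if (inflateInit(&zs) != Z_OK)
                    return -1;
            do {
                    if (!zs.avail_in)
                            zs.next_in = fill_input(&zs.avail_in);
                    /* discard the previous window instead of growing */
                    zs.next_out = out;
                    zs.avail_out = sizeof(out);
                    ret = inflate(&zs, Z_NO_FLUSH);
            } while (ret == Z_OK);
            inflateEnd(&zs);
            return (ret == Z_STREAM_END && zs.total_out == size) ? 0 : -1;
    }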

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 39 +++++++++++++++++++--------
 t/t5328-unpack-large-objects.sh | 48 +++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 11 deletions(-)
 create mode 100755 t/t5328-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..c6d6c17072 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,31 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress a zstream from stdin and return the specified amount of data.
+ * The caller is responsible for freeing the returned buffer.
+ *
+ * But in dry_run mode, "get_data()" is only used to check the
+ * integrity of the data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer, which is reused to hold temporary zstream output,
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +140,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5328-unpack-large-objects.sh b/t/t5328-unpack-large-objects.sh
new file mode 100755
index 0000000000..45a3316e06
--- /dev/null
+++ b/t/t5328-unpack-large-objects.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_no_loose () {
+	glob=dest.git/objects/?? &&
+	echo "$glob" >expect &&
+	eval "echo $glob" >actual &&
+	test_cmp expect actual
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs test)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	test_no_loose &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v9 2/5] object-file.c: refactor write_loose_object() to several steps
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
  2022-01-20 11:21           ` [PATCH v9 1/5] " Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 3/5] object-file.c: add "stream_loose_object()" to handle large object Han Xin
                             ` (2 subsequent siblings)
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a stream
version function ("stream_loose_object()") in latter commit to resolve
this issue.

Before introducing a stream vesion function for writing loose object,
do some refactoring on "write_loose_object()" to reuse code for both
versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new funciton
    "end_loose_object_common()".

 5. Close fd and finalize the object file.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 105 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 75 insertions(+), 30 deletions(-)

diff --git a/object-file.c b/object-file.c
index eb1426f98c..422b43212a 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1857,6 +1857,59 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+static void end_loose_object_common(int ret, git_hash_ctx *c,
+				    git_zstream *stream,
+				    struct object_id *parano_oid,
+				    const struct object_id *expected_oid,
+				    const char *die_msg1_fmt,
+				    const char *die_msg2_fmt)
+{
+	if (ret != Z_STREAM_END)
+		die(_(die_msg1_fmt), ret, expected_oid);
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		die(_(die_msg2_fmt), ret, expected_oid);
+	the_hash_algo->final_oid_fn(parano_oid, c);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1871,28 +1924,18 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose oject:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1907,14 +1950,16 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-		    ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose oject:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid to "parano_oid".
+	 */
+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
+				N_("unable to deflate new object %s (%d)"),
+				N_("deflateEnd on object %s failed (%d)"));
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v9 3/5] object-file.c: add "stream_loose_object()" to handle large object
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
                             ` (2 preceding siblings ...)
  2022-01-20 11:21           ` [PATCH v9 2/5] object-file.c: refactor write_loose_object() to several steps Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 4/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
  2022-01-20 11:21           ` [PATCH v9 5/5] object-file API: add a format_object_header() function Han Xin
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using "write_loose_object()",
we have to feed it a buffer the same size as the object, which will
consume lots of memory and may cause OOM. This can be improved by
feeding data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", which is a stream version of
"write_loose_object()" but with a low memory footprint. We will use this
function to unpack large blob object in latter commit.

Another difference with "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object, so we have to save the temporary file in ".git/objects/"
directory instead.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   9 +++++
 2 files changed, 110 insertions(+)

diff --git a/object-file.c b/object-file.c
index 422b43212a..a738f47cb2 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1996,6 +1996,107 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose oject:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %lu != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose oject:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	end_loose_object_common(ret, &c, &stream, oid, NULL,
+				N_("unable to stream deflate new object (%d)"),
+				N_("deflateEnd on stream object failed (%d)"));
+
+	close_loose_object(fd);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    const char *type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..cc41c64d69 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -232,6 +238,9 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
+
 int hash_object_file_literally(const void *buf, unsigned long len,
 			       const char *type, struct object_id *oid,
 			       unsigned flags);
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v9 4/5] unpack-objects: unpack_non_delta_entry() read data in a stream
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
                             ` (3 preceding siblings ...)
  2022-01-20 11:21           ` [PATCH v9 3/5] object-file.c: add "stream_loose_object()" to handle large object Han Xin
@ 2022-01-20 11:21           ` Han Xin
  2022-01-20 11:21           ` [PATCH v9 5/5] object-file API: add a format_object_header() function Han Xin
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()". However, unpack
non-delta objects from a stream instead of from an entrie buffer will
have 10% performance penalty.

    $ hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare
       https://github.com/microsoft/scalar.git;
       cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      ...

    Summary
      './git -C dest.git -c core.bigFileThreshold=512m
      unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~1'

Therefore, only unpack objects larger than the "core.bigFileThreshold"
in zstream. Until now, the config variable has been used in the
following cases, and our new case belongs to the packfile category.

 * Archive:

   + archive.c: write_entry(): write large blob entries to archive in
     stream.

 * Loose objects:

   + object-file.c: index_fd(): when hashing large files in worktree,
     read files in a stream, and create one packfile per large blob if
     want to save files to git object store.

   + object-file.c: read_loose_object(): when checking loose objects
     using "git-fsck", do not read full content of large loose objects.

 * Packfile:

   + fast-import.c: parse_and_store_blob(): streaming large blob from
     foreign source to packfile.

   + index-pack.c: check_collison(): read and check large blob in stream.

   + index-pack.c: unpack_entry_data(): do not return the entire
     contents of the big blob from packfile, but uses a fixed buf to
     perform some integrity checks on the object.

   + pack-check.c: verify_packfile(): used by "git-fsck" and will call
     check_object_signature() to check large blob in pack with the
     streaming interface.

   + pack-objects.c: get_object_details(): set "no_try_delta" for large
     blobs when counting objects.

   + pack-objects.c: write_no_reuse_object(): streaming large blob to
     pack.

   + unpack-objects.c: unpack_non_delta_entry(): unpack large blob in
     stream from packfile.

 * Others:

   + diff.c: diff_populate_filespec(): treat large blob file as binary.

   + streaming.c: istream_source(): as a helper of "open_istream()" to
     select proper streaming interface to read large blob from packfile.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 71 ++++++++++++++++++++++++++++++++-
 t/t5328-unpack-large-objects.sh | 23 +++++++++--
 2 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index c6d6c17072..e9ec2b349d 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -343,11 +343,80 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (buf)
 		write_object(nr, type, buf, size);
 }
diff --git a/t/t5328-unpack-large-objects.sh b/t/t5328-unpack-large-objects.sh
index 45a3316e06..f4129979f9 100755
--- a/t/t5328-unpack-large-objects.sh
+++ b/t/t5328-unpack-large-objects.sh
@@ -9,7 +9,11 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 test_no_loose () {
@@ -33,16 +37,29 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
 	test_no_loose &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_no_loose
+'
+
 test_done
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v9 5/5] object-file API: add a format_object_header() function
  2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
                             ` (4 preceding siblings ...)
  2022-01-20 11:21           ` [PATCH v9 4/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
@ 2022-01-20 11:21           ` Han Xin
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-01-20 11:21 UTC (permalink / raw)
  To: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Ævar Arnfjörð Bjarmason, Derrick Stolee,
	René Scharfe, Jiang Xin
  Cc: Han Xin

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Add a convenience function to wrap the xsnprintf() call that
generates loose object headers. This code was copy/pasted in various
parts of the codebase; let's define it in one place and re-use it from
there.

All except one caller of it had a valid "enum object_type" for us;
it's only write_object_file_prepare() which might need to deal with
"git hash-object --literally" and a potential garbage type. Let's have
the primary API use an "enum object_type", and define an *_extended()
function that can take an arbitrary "const char *" for the type.

See [1] for the discussion that prompted this patch, i.e. new code in
object-file.c that wanted to copy/paste the xsnprintf() invocation.

1. https://lore.kernel.org/git/211213.86bl1l9bfz.gmgdl@evledraar.gmail.com/
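
As a quick sketch of the intended call patterns (assuming a size_t
"len" in scope; "garbage-type" is a made-up literal type string):

    char hdr[MAX_HEADER_LEN];
    int hdrlen;

    /* Typical caller: the type is a known "enum object_type". */
    hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);

    /* "git hash-object --literally": an arbitrary type string. */
    hdrlen = format_object_header_extended(hdr, sizeof(hdr), OBJ_NONE,
                                           "garbage-type", len);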

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/index-pack.c |  3 +--
 bulk-checkin.c       |  4 ++--
 cache.h              | 21 +++++++++++++++++++++
 http-push.c          |  2 +-
 object-file.c        | 16 ++++++++++++----
 5 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c23d01de7d..8a6ce77940 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -449,8 +449,7 @@ static void *unpack_entry_data(off_t offset, unsigned long size,
 	int hdrlen;
 
 	if (!is_delta_type(type)) {
-		hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX,
-				   type_name(type),(uintmax_t)size) + 1;
+		hdrlen = format_object_header(hdr, sizeof(hdr), type, size);
 		the_hash_algo->init_fn(&c);
 		the_hash_algo->update_fn(&c, hdr, hdrlen);
 	} else
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 8785b2ac80..9e685f0f1a 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
 	if (seekback == (off_t) -1)
 		return error("cannot find the current offset");
 
-	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
-			       type_name(type), (uintmax_t)size) + 1;
+	header_len = format_object_header((char *)obuf, sizeof(obuf),
+					 type, size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 
diff --git a/cache.h b/cache.h
index cfba463aa9..64071a8d80 100644
--- a/cache.h
+++ b/cache.h
@@ -1310,6 +1310,27 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 						    unsigned long bufsiz,
 						    struct strbuf *hdrbuf);
 
+/**
+ * format_object_header() is a thin wrapper around xsnprintf() that
+ * writes the initial "<type> <obj-len>" part of the loose object
+ * header. It returns the size that xsnprintf() returns + 1.
+ *
+ * The format_object_header_extended() function allows for writing a
+ * type_name that's not one of the "enum object_type" types. This is
+ * used for "git hash-object --literally". Pass in a OBJ_NONE as the
+ * type, and a non-NULL "type_str" to do that.
+ *
+ * format_object_header() is a convenience wrapper for
+ * format_object_header_extended().
+ */
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *type_str, size_t objsize);
+static inline int format_object_header(char *str, size_t size,
+				      enum object_type type, size_t objsize)
+{
+	return format_object_header_extended(str, size, type, NULL, objsize);
+}
+
 /**
  * parse_loose_header() parses the starting "<type> <len>\0" of an
  * object. If it doesn't follow that format -1 is returned. To check
diff --git a/http-push.c b/http-push.c
index 3309aaf004..f0c044dcf7 100644
--- a/http-push.c
+++ b/http-push.c
@@ -363,7 +363,7 @@ static void start_put(struct transfer_request *request)
 	git_zstream stream;
 
 	unpacked = read_object_file(&request->obj->oid, &type, &len);
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 
 	/* Set it up */
 	git_deflate_init(&stream, zlib_compression_level);
diff --git a/object-file.c b/object-file.c
index a738f47cb2..0dce5d2fec 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1006,6 +1006,14 @@ void *xmmap(void *start, size_t length,
 	return ret;
 }
 
+int format_object_header_extended(char *str, size_t size, enum object_type type,
+				 const char *typestr, size_t objsize)
+{
+	const char *s = type == OBJ_NONE ? typestr : type_name(type);
+
+	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
+}
+
 /*
  * With an in-core object data in "map", rehash it to make sure the
  * object name actually matches "oid" to detect object corruption.
@@ -1034,7 +1042,7 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
 		return -1;
 
 	/* Generate the header */
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(obj_type), (uintmax_t)size) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size);
 
 	/* Sha1.. */
 	r->hash_algo->init_fn(&c);
@@ -1734,7 +1742,7 @@ static void write_object_file_prepare(const struct git_hash_algo *algo,
 	git_hash_ctx c;
 
 	/* Generate the header */
-	*hdrlen = xsnprintf(hdr, *hdrlen, "%s %"PRIuMAX , type, (uintmax_t)len)+1;
+	*hdrlen = format_object_header_extended(hdr, *hdrlen, OBJ_NONE, type, len);
 
 	/* Sha1.. */
 	algo->init_fn(&c);
@@ -2011,7 +2019,7 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
 
 	/* Since oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
 
 	/* Common steps for write_loose_object and stream_loose_object to
 	 * start writing loose oject:
@@ -2152,7 +2160,7 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
 	buf = read_object(the_repository, oid, &type, &len);
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
-	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
+	hdrlen = format_object_header(hdr, sizeof(hdr), type, len);
 	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
 	free(buf);
 
-- 
2.34.1.52.gc288e771b4.agit.6.5.6


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* C99 %z (was: [PATCH v7 2/5] object-file API: add a format_object_header() function)
  2021-12-21 14:30           ` René Scharfe
@ 2022-02-01 14:28             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-01 14:28 UTC (permalink / raw)
  To: René Scharfe
  Cc: Han Xin, Junio C Hamano, Git List, Jeff King, Jiang Xin,
	Philip Oakley, Derrick Stolee, Han Xin


On Tue, Dec 21 2021, René Scharfe wrote:

> Am 21.12.21 um 12:51 schrieb Han Xin:
>> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>> [...]
>>  		the_hash_algo->init_fn(&c);
>>  		the_hash_algo->update_fn(&c, hdr, hdrlen);
>>  	} else
>> diff --git a/bulk-checkin.c b/bulk-checkin.c
>> index 8785b2ac80..1733a1de4f 100644
>> --- a/bulk-checkin.c
>> +++ b/bulk-checkin.c
>> @@ -220,8 +220,8 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
>>  	if (seekback == (off_t) -1)
>>  		return error("cannot find the current offset");
>>
>> -	header_len = xsnprintf((char *)obuf, sizeof(obuf), "%s %" PRIuMAX,
>> -			       type_name(type), (uintmax_t)size) + 1;
>> +	header_len = format_object_header((char *)obuf, sizeof(obuf),
>> +					 type, (uintmax_t)size);
>                                                ^^^^^^^^^^^
> Same here, just that size is already of type size_t, so a cast makes
> even less sense.

Thanks, this and the below is something I made sure to include in a
re-roll I'm about to send (to do these cleanups in object-file.c
separately from Han Xin's series).

>> +int format_object_header_extended(char *str, size_t size, enum object_type type,
>> +				 const char *typestr, size_t objsize)
>> +{
>> +	const char *s = type == OBJ_NONE ? typestr : type_name(type);
>> +
>> +	return xsnprintf(str, size, "%s %"PRIuMAX, s, (uintmax_t)objsize) + 1;
>                                                       ^^^^^^^^^^^
> This cast is necessary to match PRIuMAX.  And that is used because the z
> modifier (as in e.g. printf("%zu", sizeof(size_t));) was only added in
> C99 and not all platforms may have it.  (Perhaps this cautious approach
> is worth revisiting separately, now that some time has passed, but this
> patch series should still use PRIuMAX, as it does.)

I tried to use %z recently and found that the CI breaks on Windows, but
this was a few months ago. But I think the status of that particular C99
feature is that we can't use it freely, unfortunately. I may be wrong
about that; I haven't looked at it in any detail beyond running into
those CI errors.
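
For reference, a self-contained sketch contrasting the portable idiom
the series uses with the C99 form discussed above ("%zu" needs a C99
printf in the platform libc, which per the CI findings can't yet be
assumed):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            size_t objsize = 1234;

            /* portable: cast to uintmax_t and print with PRIuMAX */
            printf("blob %" PRIuMAX "\n", (uintmax_t)objsize);

            /* C99: simpler, but breaks where %z is unsupported */
            printf("blob %zu\n", objsize);
            return 0;
    }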

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v9 0/5] unpack large blobs in stream
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
@ 2022-02-01 21:24             ` Ævar Arnfjörð Bjarmason
  2022-02-02  8:32               ` Han Xin
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-01 21:24 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Jiang Xin, Han Xin


On Thu, Jan 20 2022, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> Changes since v8:
> * Rename "assert_no_loose ()" into "test_no_loose ()" in
>   "t5329-unpack-large-objects.sh". Remove "assert_no_pack ()" and use
>   "test_dir_is_empty" instead.
>
> * Revert changes to "create_tmpfile()" and error handling is now in
>   "start_loose_object_common()".
>
> * Remove "finalize_object_file_with_mtime()" which seems to be an overkill
>   for "write_loose_object()" now. 
>
> * Remove the commit "object-file.c: remove the slash for directory_size()",
>   it can be in a separate patch if necessary.
>
> Han Xin (4):
>   unpack-objects: low memory footprint for get_data() in dry_run mode
>   object-file.c: refactor write_loose_object() to several steps
>   object-file.c: add "stream_loose_object()" to handle large object
>   unpack-objects: unpack_non_delta_entry() read data in a stream
>
> Ævar Arnfjörð Bjarmason (1):
>   object-file API: add a format_object_header() function

I sent
https://lore.kernel.org/git/cover-00.10-00000000000-20220201T144803Z-avarab@gmail.com/
today which suggests splitting out the 5/5 cleanup you'd integrated.

I then rebased these patches of yours on top of that, the result is
here:
https://github.com/avar/git/tree/han-xin-avar/unpack-loose-object-streaming-9

The range-diff to your version is below. There are a few unrelated
fixes/nits in it.

I think with/without basing this on top of my series above your patches
here look good with the nits pointed out in the diff below addressed
(and some don't need to be). I.e. the dependency on it is rather
trivial, and the two could be split up.

What do you think is a good way to proceed? I could just submit the
below as a proposed v10 if you'd like & agree...

1:  553a9377eb3 ! 1:  61fcfe7b840 unpack-objects: low memory footprint for get_data() in dry_run mode
    @@ Commit message
         unpack-objects: low memory footprint for get_data() in dry_run mode
     
         As the name implies, "get_data(size)" will allocate and return a given
    -    size of memory. Allocating memory for a large blob object may cause the
    +    amount of memory. Allocating memory for a large blob object may cause the
         system to run out of memory. Before preparing to replace calling of
         "get_data()" to unpack large blob objects in latter commits, refactor
         "get_data()" to reduce memory footprint for dry_run mode.
    @@ Commit message
     
         Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## builtin/unpack-objects.c ##
     @@ builtin/unpack-objects.c: static void use(int bytes)
    @@ t/t5328-unpack-large-objects.sh (new)
     +
     +test_no_loose () {
     +	glob=dest.git/objects/?? &&
    -+	echo "$glob" >expect &&
    -+	eval "echo $glob" >actual &&
    ++	echo $glob >expect &&
    ++	echo "$glob" >actual &&
     +	test_cmp expect actual
     +}
     +
-:  ----------- > 2:  c6b0437db03 object-file.c: do fsync() and close() before post-write die()
2:  88c91affd61 ! 3:  77bcfe3da6f object-file.c: refactor write_loose_object() to several steps
    @@ Commit message
         When writing a large blob using "write_loose_object()", we have to pass
         a buffer with the whole content of the blob, and this behavior will
         consume lots of memory and may cause OOM. We will introduce a stream
    -    version function ("stream_loose_object()") in latter commit to resolve
    +    version function ("stream_loose_object()") in later commit to resolve
         this issue.
     
    -    Before introducing a stream vesion function for writing loose object,
    -    do some refactoring on "write_loose_object()" to reuse code for both
    -    versions.
    +    Before introducing that streaming function, do some refactoring on
    +    "write_loose_object()" to reuse code for both versions.
     
         Rewrite "write_loose_object()" as follows:
     
    @@ Commit message
     
          3. Compress data.
     
    -     4. Move common steps for ending zlib stream into a new funciton
    +     4. Move common steps for ending zlib stream into a new function
             "end_loose_object_common()".
     
          5. Close fd and finalize the object file.
    @@ Commit message
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## object-file.c ##
     @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filename)
      	return fd;
      }
      
    ++/**
    ++ * Common steps for loose object writers to start writing loose
    ++ * objects:
    ++ *
    ++ * - Create tmpfile for the loose object.
    ++ * - Setup zlib stream for compression.
    ++ * - Start to feed header to zlib stream.
    ++ *
    ++ * Returns a "fd", which should later be provided to
    ++ * end_loose_object_common().
    ++ */
     +static int start_loose_object_common(struct strbuf *tmp_file,
     +				     const char *filename, unsigned flags,
     +				     git_zstream *stream,
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     +	return fd;
     +}
     +
    -+static void end_loose_object_common(int ret, git_hash_ctx *c,
    ++/**
    ++ * Common steps for loose object writers to end writing loose objects:
    ++ *
    ++ * - End the compression of zlib stream.
    ++ * - Get the calculated oid to "parano_oid".
    ++ * - fsync() and close() the "fd"
    ++ */
    ++static void end_loose_object_common(int fd, int ret, git_hash_ctx *c,
     +				    git_zstream *stream,
     +				    struct object_id *parano_oid,
     +				    const struct object_id *expected_oid,
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     +	if (ret != Z_OK)
     +		die(_(die_msg2_fmt), ret, expected_oid);
     +	the_hash_algo->final_oid_fn(parano_oid, c);
    ++
    ++	/*
    ++	 * We already did a write_buffer() to the "fd", let's fsync()
    ++	 * and close().
    ++	 *
    ++	 * We might still die() on a subsequent sanity check, but
    ++	 * let's not add to that confusion by not flushing any
    ++	 * outstanding writes to disk first.
    ++	 */
    ++	close_loose_object(fd);
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -	while (git_deflate(&stream, 0) == Z_OK)
     -		; /* nothing */
     -	the_hash_algo->update_fn(&c, hdr, hdrlen);
    -+	/* Common steps for write_loose_object and stream_loose_object to
    -+	 * start writing loose oject:
    -+	 *
    -+	 *  - Create tmpfile for the loose object.
    -+	 *  - Setup zlib stream for compression.
    -+	 *  - Start to feed header to zlib stream.
    -+	 */
     +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
     +				       &stream, compressed, sizeof(compressed),
     +				       &c, hdr, hdrlen);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
     -		    ret);
     -	the_hash_algo->final_oid_fn(&parano_oid, &c);
    -+	/* Common steps for write_loose_object and stream_loose_object to
    -+	 * end writing loose oject:
    -+	 *
    -+	 *  - End the compression of zlib stream.
    -+	 *  - Get the calculated oid to "parano_oid".
    -+	 */
    -+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
    +-
    +-	/*
    +-	 * We already did a write_buffer() to the "fd", let's fsync()
    +-	 * and close().
    +-	 *
    +-	 * We might still die() on a subsequent sanity check, but
    +-	 * let's not add to that confusion by not flushing any
    +-	 * outstanding writes to disk first.
    +-	 */
    +-	close_loose_object(fd);
    ++	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
     +				N_("unable to deflate new object %s (%d)"),
     +				N_("deflateEnd on object %s failed (%d)"));
    -+
    + 
      	if (!oideq(oid, &parano_oid))
      		die(_("confused by unstable object source data for %s"),
    - 		    oid_to_hex(oid));
3:  054a00ed21d ! 4:  71c10e734d1 object-file.c: add "stream_loose_object()" to handle large object
    @@ Commit message
     
         Add a new function "stream_loose_object()", which is a stream version of
         "write_loose_object()" but with a low memory footprint. We will use this
    -    function to unpack large blob object in latter commit.
    +    function to unpack large blob object in later commit.
     
         Another difference with "write_loose_object()" is that we have no chance
         to run "write_object_file_prepare()" to calculate the oid in advance.
         In "write_loose_object()", we know the oid and we can write the
         temporary file in the same directory as the final object, but for an
         object with an undetermined oid, we don't know the exact directory for
    -    the object, so we have to save the temporary file in ".git/objects/"
    -    directory instead.
    +    the object.
    +
    +    Still, we need to save the temporary file we're preparing
    +    somewhere. We'll do that in the top-level ".git/objects/"
    +    directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
    +    streamed it we'll know the OID, and will move it to its canonical
    +    path.
     
         "freshen_packed_object()" or "freshen_loose_object()" will be called
         inside "stream_loose_object()" after obtaining the "oid".
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +
     +	/* Since oid is not determined, save tmp file to odb path. */
     +	strbuf_addf(&filename, "%s/", get_object_directory());
    -+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
    ++	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
     +
     +	/* Common steps for write_loose_object and stream_loose_object to
     +	 * start writing loose oject:
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	 *  - End the compression of zlib stream.
     +	 *  - Get the calculated oid.
     +	 */
    -+	end_loose_object_common(ret, &c, &stream, oid, NULL,
    ++	end_loose_object_common(fd, ret, &c, &stream, oid, NULL,
     +				N_("unable to stream deflate new object (%d)"),
     +				N_("deflateEnd on stream object failed (%d)"));
     +
    -+	close_loose_object(fd);
    -+
     +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
     +		unlink_or_warn(tmp_file.buf);
     +		goto cleanup;
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +}
     +
      int write_object_file_flags(const void *buf, unsigned long len,
    - 			    const char *type, struct object_id *oid,
    + 			    enum object_type type, struct object_id *oid,
      			    unsigned flags)
     
      ## object-store.h ##
    @@ object-store.h: static inline int write_object_file(const void *buf, unsigned lo
      
     +int stream_loose_object(struct input_stream *in_stream, size_t len,
     +			struct object_id *oid);
    -+
    - int hash_object_file_literally(const void *buf, unsigned long len,
    - 			       const char *type, struct object_id *oid,
    - 			       unsigned flags);
    + int hash_write_object_file_literally(const void *buf, unsigned long len,
    + 				     const char *type, struct object_id *oid,
    + 				     unsigned flags);
-:  ----------- > 5:  3c1d788d69d core doc: modernize core.bigFileThreshold documentation
4:  6bcba6bce66 ! 6:  8b83f6d6b83 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    unpack-objects: unpack_non_delta_entry() read data in a stream
    +    unpack-objects: use stream_loose_object() to unpack large objects
     
    -    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    -    entire contents of a blob object, no matter how big it is. This
    -    implementation may consume all the memory and cause OOM.
    +    Make use of the stream_loose_object() function introduced in the
    +    preceding commit to unpack large objects. Before this we'd need to
    +    malloc() the size of the blob before unpacking it, which could cause
    +    OOM with very large blobs.
     
    -    By implementing a zstream version of input_stream interface, we can use
    -    a small fixed buffer for "unpack_non_delta_entry()". However, unpack
    -    non-delta objects from a stream instead of from an entrie buffer will
    -    have 10% performance penalty.
    +    We could use this new interface to unpack all blobs, but doing so
    +    would result in a performance penalty of around 10%, as the below
    +    "hyperfine" benchmark will show. We therefore limit this to files
    +    larger than "core.bigFileThreshold":
     
             $ hyperfine \
               --setup \
    @@ Commit message
                         -c core.bigFileThreshold=16k unpack-objects
                         <small.pack' in 'HEAD~1'
     
    -    Therefore, only unpack objects larger than the "core.bigFileThreshold"
    -    in zstream. Until now, the config variable has been used in the
    -    following cases, and our new case belongs to the packfile category.
    +    An earlier version of this patch introduced a new
    +    "core.bigFileStreamingThreshold" instead of re-using the existing
    +    "core.bigFileThreshold" variable[1]. As noted in a detailed overview
    +    of its users in [2] using it has several different meanings.
     
    -     * Archive:
    +    Still, we consider it good enough to simply re-use it. While it's
    +    possible that someone might want to e.g. consider objects "small" for
    +    the purposes of diffing but "big" for the purposes of writing them
    +    such use-cases are probably too obscure to worry about. We can always
    +    split up "core.bigFileThreshold" in the future if there's a need for
    +    that.
     
    -       + archive.c: write_entry(): write large blob entries to archive in
    -         stream.
    -
    -     * Loose objects:
    -
    -       + object-file.c: index_fd(): when hashing large files in worktree,
    -         read files in a stream, and create one packfile per large blob if
    -         want to save files to git object store.
    -
    -       + object-file.c: read_loose_object(): when checking loose objects
    -         using "git-fsck", do not read full content of large loose objects.
    -
    -     * Packfile:
    -
    -       + fast-import.c: parse_and_store_blob(): streaming large blob from
    -         foreign source to packfile.
    -
    -       + index-pack.c: check_collison(): read and check large blob in stream.
    -
    -       + index-pack.c: unpack_entry_data(): do not return the entire
    -         contents of the big blob from packfile, but uses a fixed buf to
    -         perform some integrity checks on the object.
    -
    -       + pack-check.c: verify_packfile(): used by "git-fsck" and will call
    -         check_object_signature() to check large blob in pack with the
    -         streaming interface.
    -
    -       + pack-objects.c: get_object_details(): set "no_try_delta" for large
    -         blobs when counting objects.
    -
    -       + pack-objects.c: write_no_reuse_object(): streaming large blob to
    -         pack.
    -
    -       + unpack-objects.c: unpack_non_delta_entry(): unpack large blob in
    -         stream from packfile.
    -
    -     * Others:
    -
    -       + diff.c: diff_populate_filespec(): treat large blob file as binary.
    -
    -       + streaming.c: istream_source(): as a helper of "open_istream()" to
    -         select proper streaming interface to read large blob from packfile.
    +    1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
    +    2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
     
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    + ## Documentation/config/core.txt ##
    +@@ Documentation/config/core.txt: usage, at the slight expense of increased disk usage.
    + * Will be generally be streamed when written, which avoids excessive
    + memory usage, at the cost of some fixed overhead. Commands that make
    + use of this include linkgit:git-archive[1],
    +-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
    +-linkgit:git-fsck[1].
    ++linkgit:git-fast-import[1], linkgit:git-index-pack[1],
    ++linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
    + 
    + core.excludesFile::
    + 	Specifies the pathname to the file that contains patterns to
    +
      ## builtin/unpack-objects.c ##
     @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type type,
      	}
5:  1bfaf89ee0b < -:  ----------- object-file API: add a format_object_header() function

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v9 0/5] unpack large blobs in stream
  2022-02-01 21:24             ` Ævar Arnfjörð Bjarmason
@ 2022-02-02  8:32               ` Han Xin
  2022-02-02 10:59                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-02-02  8:32 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Jiang Xin, Han Xin

On Wed, Feb 2, 2022 at 5:28 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Thu, Jan 20 2022, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > Changes since v8:
> > * Rename "assert_no_loose ()" into "test_no_loose ()" in
> >   "t5329-unpack-large-objects.sh". Remove "assert_no_pack ()" and use
> >   "test_dir_is_empty" instead.
> >
> > * Revert changes to "create_tmpfile()" and error handling is now in
> >   "start_loose_object_common()".
> >
> > * Remove "finalize_object_file_with_mtime()" which seems to be an overkill
> >   for "write_loose_object()" now.
> >
> > * Remove the commit "object-file.c: remove the slash for directory_size()",
> >   it can be in a separate patch if necessary.
> >
> > Han Xin (4):
> >   unpack-objects: low memory footprint for get_data() in dry_run mode
> >   object-file.c: refactor write_loose_object() to several steps
> >   object-file.c: add "stream_loose_object()" to handle large object
> >   unpack-objects: unpack_non_delta_entry() read data in a stream
> >
> > Ævar Arnfjörð Bjarmason (1):
> >   object-file API: add a format_object_header() function
>
> I sent
> https://lore.kernel.org/git/cover-00.10-00000000000-20220201T144803Z-avarab@gmail.com/
> today which suggests splitting out the 5/5 cleanup you'd integrated.
>
> I then rebased these patches of yours on top of that, the result is
> here:
> https://github.com/avar/git/tree/han-xin-avar/unpack-loose-object-streaming-9
>
> The range-diff to your version is below. There's a few unrelated
> fixes/nits in it.
>
> I think with/without basing this on top of my series above your patches
> here look good with the nits pointed out in the diff below addressed
> (and some don't need to be). I.e. the dependency on it is rather
> trivial, and the two could be split up.
>
> What do you think is a good way to proceed? I could just submit the
> below as a proposed v10 if you'd like & agree...
>

Yes, thanks for the suggestions, and I'm glad you're willing to do so.

Thanks.
-Han Xin

> 1:  553a9377eb3 ! 1:  61fcfe7b840 unpack-objects: low memory footprint for get_data() in dry_run mode
>     @@ Commit message
>          unpack-objects: low memory footprint for get_data() in dry_run mode
>
>          As the name implies, "get_data(size)" will allocate and return a given
>     -    size of memory. Allocating memory for a large blob object may cause the
>     +    amount of memory. Allocating memory for a large blob object may cause the
>          system to run out of memory. In preparation for replacing the use of
>          "get_data()" to unpack large blob objects in later commits, refactor
>          "get_data()" to reduce its memory footprint in dry_run mode.
>     @@ Commit message
>
>          Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>          Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>     +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>
>       ## builtin/unpack-objects.c ##
>      @@ builtin/unpack-objects.c: static void use(int bytes)
>     @@ t/t5328-unpack-large-objects.sh (new)
>      +
>      +test_no_loose () {
>      +  glob=dest.git/objects/?? &&
>     -+  echo "$glob" >expect &&
>     -+  eval "echo $glob" >actual &&
>     ++  echo $glob >expect &&
>     ++  echo "$glob" >actual &&
>      +  test_cmp expect actual
>      +}
>      +

I have a small doubt about this: it works fine with dash, but not with
others like zsh. Wouldn't it be better to stay compatible, or would
that introduce other issues that I don't know about?

Thanks.
-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v9 0/5] unpack large blobs in stream
  2022-02-02  8:32               ` Han Xin
@ 2022-02-02 10:59                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-02 10:59 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jeff King, Jiang Xin, Philip Oakley,
	Derrick Stolee, René Scharfe, Jiang Xin, Han Xin


On Wed, Feb 02 2022, Han Xin wrote:

> On Wed, Feb 2, 2022 at 5:28 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>>
>>
>> On Thu, Jan 20 2022, Han Xin wrote:
>>
>> > From: Han Xin <hanxin.hx@alibaba-inc.com>
>> >
>> > Changes since v8:
>> > * Rename "assert_no_loose ()" into "test_no_loose ()" in
>> >   "t5329-unpack-large-objects.sh". Remove "assert_no_pack ()" and use
>> >   "test_dir_is_empty" instead.
>> >
>> > * Revert changes to "create_tmpfile()" and error handling is now in
>> >   "start_loose_object_common()".
>> >
>> > * Remove "finalize_object_file_with_mtime()" which seems to be an overkill
>> >   for "write_loose_object()" now.
>> >
>> > * Remove the commit "object-file.c: remove the slash for directory_size()",
>> >   it can be in a separate patch if necessary.
>> >
>> > Han Xin (4):
>> >   unpack-objects: low memory footprint for get_data() in dry_run mode
>> >   object-file.c: refactor write_loose_object() to several steps
>> >   object-file.c: add "stream_loose_object()" to handle large object
>> >   unpack-objects: unpack_non_delta_entry() read data in a stream
>> >
>> > Ævar Arnfjörð Bjarmason (1):
>> >   object-file API: add a format_object_header() function
>>
>> I sent
>> https://lore.kernel.org/git/cover-00.10-00000000000-20220201T144803Z-avarab@gmail.com/
>> today which suggests splitting out the 5/5 cleanup you'd integrated.
>>
>> I then rebased these patches of yours on top of that, the result is
>> here:
>> https://github.com/avar/git/tree/han-xin-avar/unpack-loose-object-streaming-9
>>
>> The range-diff to your version is below. There's a few unrelated
>> fixes/nits in it.
>>
>> I think with/without basing this on top of my series above your patches
>> here look good with the nits pointed out in the diff below addressed
>> (and some don't need to be). I.e. the dependency on it is rather
>> trivial, and the two could be split up.
>>
>> What do you think is a good way to proceed? I could just submit the
>> below as a proposed v10 if you'd like & agree...
>>
>
> Yes, thanks for the suggestions, and I'm glad you're willing to do so.

Willdo.

>> 1:  553a9377eb3 ! 1:  61fcfe7b840 unpack-objects: low memory footprint for get_data() in dry_run mode
>>     @@ Commit message
>>          unpack-objects: low memory footprint for get_data() in dry_run mode
>>
>>          As the name implies, "get_data(size)" will allocate and return a given
>>     -    size of memory. Allocating memory for a large blob object may cause the
>>     +    amount of memory. Allocating memory for a large blob object may cause the
>>          system to run out of memory. In preparation for replacing the use of
>>          "get_data()" to unpack large blob objects in later commits, refactor
>>          "get_data()" to reduce its memory footprint in dry_run mode.
>>     @@ Commit message
>>
>>          Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
>>          Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
>>     +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>>
>>       ## builtin/unpack-objects.c ##
>>      @@ builtin/unpack-objects.c: static void use(int bytes)
>>     @@ t/t5328-unpack-large-objects.sh (new)
>>      +
>>      +test_no_loose () {
>>      +  glob=dest.git/objects/?? &&
>>     -+  echo "$glob" >expect &&
>>     -+  eval "echo $glob" >actual &&
>>     ++  echo $glob >expect &&
>>     ++  echo "$glob" >actual &&
>>      +  test_cmp expect actual
>>      +}
>>      +
>
> I have a small doubt about this: it works fine with dash, but not with
> others like zsh. Wouldn't it be better to stay compatible, or would
> that introduce other issues that I don't know about?

Ah, I hadn't spotted that zsh issue. I don't think the test suite will
run on it in general, but in any case I'll fix this. (The difference is
that zsh, unlike dash, doesn't subject unquoted parameter expansions to
globbing, and by default it errors out on globs that match nothing, so
the comparison doesn't behave as intended there.)

There are a few other tests that do this just by piping "find" to
"wc -l"; it's probably better to just follow that pattern. I think the
eval works, but it struck me as a bit unusual.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v10 0/6] unpack-objects: support streaming large objects to disk
  2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
  2022-02-01 21:24             ` Ævar Arnfjörð Bjarmason
@ 2022-02-04 14:07             ` Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
                                 ` (6 more replies)
  1 sibling, 7 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-04 14:07 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley,
	Ævar Arnfjörð Bjarmason

This is a v10 re-roll of Han Xin's series[1] to stream large objects
to disk in "git unpack-objects". The v9 iteration had integrated a
proposed cleanup patch of mine, which is now part of its own series,
on which this series now depends: [2]. This v10 is sent with Han Xin's
approval[3].

Changes since v9:

 * Now based on [2]
 * Small grammar/typo fixes in commit messages
 * Replaced an echo/eval pattern in a test with a $(find ... | wc -l)
   comparison, which is a pattern we already use in another test for
   the same (or similar) assertion.
 * I added a new 2/6 to do an fsync() before an oideq() assertion. I
   don't think it matters in practice, but it allows 3/6 to be smaller
   by having the code, now a utility function, share more logic between
   its two callers.
 * Changed inline comments in 3/6 to API docs where appropriate; the
   helper function now gets a "fd" per 2/6.
 * 4/6 could use the format_object_header() function in the base
   topic, and now does so (instead of that conversion coming later in
   v9).
 * A new 5/6 updates the core.bigFileThreshold documentation to
   account for 12 years of behavior changes we hadn't documented.
 * The updated 6/6 now links to those docs, and I removed a very
   detailed accounting of all in-tree uses of core.bigFileThreshold
   from the commit message. I think linking to the summary docs should
   suffice, and for anyone digging in the future 5/6 links to the more
   detailed summary in the old patch.

More generally I've been heavily involved in the review for the past
iterations, and I think that, barring any last-minute nits in this v10,
this topic should be ready to advance. As the above summary shows we're
down to typo fixes, doc and test tweaks etc. at this point.

The core functionality being added here isn't changed in any
meaningful way, and has had a lot of careful review already.

1. https://lore.kernel.org/git/20220120112114.47618-1-chiyutianyi@gmail.com/
2. https://lore.kernel.org/git/cover-v2-00.11-00000000000-20220204T135005Z-avarab@gmail.com/
3. https://lore.kernel.org/git/CAO0brD2Pe0aKSiBphZS861gC=nZk+q2GtXDN4pPjAQnPdns3TA@mail.gmail.com/

Han Xin (4):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: use stream_loose_object() to unpack large objects

Ævar Arnfjörð Bjarmason (2):
  object-file.c: do fsync() and close() before post-write die()
  core doc: modernize core.bigFileThreshold documentation

 Documentation/config/core.txt   |  33 +++--
 builtin/unpack-objects.c        | 110 ++++++++++++++--
 object-file.c                   | 221 +++++++++++++++++++++++++++-----
 object-store.h                  |   8 ++
 t/t5328-unpack-large-objects.sh |  62 +++++++++
 5 files changed, 381 insertions(+), 53 deletions(-)
 create mode 100755 t/t5328-unpack-large-objects.sh

Range-diff against v9:
1:  553a9377eb3 ! 1:  e46eb75b98f unpack-objects: low memory footprint for get_data() in dry_run mode
    @@ Commit message
         unpack-objects: low memory footprint for get_data() in dry_run mode
     
         As the name implies, "get_data(size)" will allocate and return a given
    -    size of memory. Allocating memory for a large blob object may cause the
    +    amount of memory. Allocating memory for a large blob object may cause the
         system to run out of memory. In preparation for replacing the use of
         "get_data()" to unpack large blob objects in later commits, refactor
         "get_data()" to reduce its memory footprint in dry_run mode.
    @@ Commit message
         in dry_run mode, "get_data()" will release the allocated buffer and
         return NULL instead of returning garbage data.
     
    +    The "find [...]objects/?? -type f | wc -l" test idiom being used here
    +    is adapted from the same "find" use added to another test in
    +    d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).
    +
         Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## builtin/unpack-objects.c ##
     @@ builtin/unpack-objects.c: static void use(int bytes)
    @@ t/t5328-unpack-large-objects.sh (new)
     +}
     +
     +test_no_loose () {
    -+	glob=dest.git/objects/?? &&
    -+	echo "$glob" >expect &&
    -+	eval "echo $glob" >actual &&
    -+	test_cmp expect actual
    ++	test $(find dest.git/objects/?? -type f | wc -l) = 0
     +}
     +
     +test_expect_success "create large objects (1.5 MB) and PACK" '
-:  ----------- > 2:  48bf9090058 object-file.c: do fsync() and close() before post-write die()
2:  88c91affd61 ! 3:  0e33d2a6e35 object-file.c: refactor write_loose_object() to several steps
    @@ Commit message
         When writing a large blob using "write_loose_object()", we have to pass
         a buffer with the whole content of the blob, and this behavior will
         consume lots of memory and may cause OOM. We will introduce a stream
    -    version function ("stream_loose_object()") in latter commit to resolve
     +    version function ("stream_loose_object()") in a later commit to resolve
         this issue.
     
    -    Before introducing a stream vesion function for writing loose object,
    -    do some refactoring on "write_loose_object()" to reuse code for both
    -    versions.
    +    Before introducing that streaming function, do some refactoring on
    +    "write_loose_object()" to reuse code for both versions.
     
         Rewrite "write_loose_object()" as follows:
     
    @@ Commit message
     
          3. Compress data.
     
    -     4. Move common steps for ending zlib stream into a new funciton
    +     4. Move common steps for ending zlib stream into a new function
             "end_loose_object_common()".
     
          5. Close fd and finalize the object file.
    @@ Commit message
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## object-file.c ##
     @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filename)
      	return fd;
      }
      
    ++/**
    ++ * Common steps for loose object writers to start writing loose
    ++ * objects:
    ++ *
    ++ * - Create tmpfile for the loose object.
    ++ * - Setup zlib stream for compression.
    ++ * - Start to feed header to zlib stream.
    ++ *
    ++ * Returns a "fd", which should later be provided to
    ++ * end_loose_object_common().
    ++ */
     +static int start_loose_object_common(struct strbuf *tmp_file,
     +				     const char *filename, unsigned flags,
     +				     git_zstream *stream,
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     +	return fd;
     +}
     +
    -+static void end_loose_object_common(int ret, git_hash_ctx *c,
    ++/**
    ++ * Common steps for loose object writers to end writing loose objects:
    ++ *
     ++ * - End the compression of the zlib stream.
     ++ * - Get the calculated oid into "parano_oid".
     ++ * - fsync() and close() the "fd".
    ++ */
    ++static void end_loose_object_common(int fd, int ret, git_hash_ctx *c,
     +				    git_zstream *stream,
     +				    struct object_id *parano_oid,
     +				    const struct object_id *expected_oid,
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     +	if (ret != Z_OK)
     +		die(_(die_msg2_fmt), ret, expected_oid);
     +	the_hash_algo->final_oid_fn(parano_oid, c);
    ++
    ++	/*
    ++	 * We already did a write_buffer() to the "fd", let's fsync()
    ++	 * and close().
    ++	 *
    ++	 * We might still die() on a subsequent sanity check, but
    ++	 * let's not add to that confusion by not flushing any
    ++	 * outstanding writes to disk first.
    ++	 */
    ++	close_loose_object(fd);
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -	while (git_deflate(&stream, 0) == Z_OK)
     -		; /* nothing */
     -	the_hash_algo->update_fn(&c, hdr, hdrlen);
    -+	/* Common steps for write_loose_object and stream_loose_object to
    -+	 * start writing loose oject:
    -+	 *
    -+	 *  - Create tmpfile for the loose object.
    -+	 *  - Setup zlib stream for compression.
    -+	 *  - Start to feed header to zlib stream.
    -+	 */
     +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
     +				       &stream, compressed, sizeof(compressed),
     +				       &c, hdr, hdrlen);
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
     -		    ret);
     -	the_hash_algo->final_oid_fn(&parano_oid, &c);
    -+	/* Common steps for write_loose_object and stream_loose_object to
    -+	 * end writing loose oject:
    -+	 *
    -+	 *  - End the compression of zlib stream.
    -+	 *  - Get the calculated oid to "parano_oid".
    -+	 */
    -+	end_loose_object_common(ret, &c, &stream, &parano_oid, oid,
    +-
    +-	/*
    +-	 * We already did a write_buffer() to the "fd", let's fsync()
    +-	 * and close().
    +-	 *
    +-	 * We might still die() on a subsequent sanity check, but
    +-	 * let's not add to that confusion by not flushing any
    +-	 * outstanding writes to disk first.
    +-	 */
    +-	close_loose_object(fd);
    ++	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
     +				N_("unable to deflate new object %s (%d)"),
     +				N_("deflateEnd on object %s failed (%d)"));
    -+
    + 
      	if (!oideq(oid, &parano_oid))
      		die(_("confused by unstable object source data for %s"),
    - 		    oid_to_hex(oid));
3:  054a00ed21d ! 4:  9644df5c744 object-file.c: add "stream_loose_object()" to handle large object
    @@ Commit message
     
         Add a new function "stream_loose_object()", which is a stream version of
         "write_loose_object()" but with a low memory footprint. We will use this
    -    function to unpack large blob object in latter commit.
     +    function to unpack large blob objects in a later commit.
     
         Another difference from "write_loose_object()" is that we have no chance
         to run "write_object_file_prepare()" to calculate the oid in advance.
         In "write_loose_object()", we know the oid and we can write the
         temporary file in the same directory as the final object, but for an
         object with an undetermined oid, we don't know the exact directory for
    -    the object, so we have to save the temporary file in ".git/objects/"
    -    directory instead.
    +    the object.
    +
    +    Still, we need to save the temporary file we're preparing
    +    somewhere. We'll do that in the top-level ".git/objects/"
    +    directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
    +    streamed it we'll know the OID, and will move it to its canonical
    +    path.
     
         "freshen_packed_object()" or "freshen_loose_object()" will be called
         inside "stream_loose_object()" after obtaining the "oid".
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +
     +	/* Since oid is not determined, save tmp file to odb path. */
     +	strbuf_addf(&filename, "%s/", get_object_directory());
    -+	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX, type_name(OBJ_BLOB), len) + 1;
    ++	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
     +
     +	/* Common steps for write_loose_object and stream_loose_object to
      +	 * start writing loose object:
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	 *  - End the compression of zlib stream.
     +	 *  - Get the calculated oid.
     +	 */
    -+	end_loose_object_common(ret, &c, &stream, oid, NULL,
    ++	end_loose_object_common(fd, ret, &c, &stream, oid, NULL,
     +				N_("unable to stream deflate new object (%d)"),
     +				N_("deflateEnd on stream object failed (%d)"));
     +
    -+	close_loose_object(fd);
    -+
     +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
     +		unlink_or_warn(tmp_file.buf);
     +		goto cleanup;
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +}
     +
      int write_object_file_flags(const void *buf, unsigned long len,
    - 			    const char *type, struct object_id *oid,
    + 			    enum object_type type, struct object_id *oid,
      			    unsigned flags)
     
      ## object-store.h ##
    @@ object-store.h: struct object_directory {
      	struct object_directory *, 1, fspathhash, fspatheq)
      
     @@ object-store.h: static inline int write_object_file(const void *buf, unsigned long len,
    - 	return write_object_file_flags(buf, len, type, oid, 0);
    - }
    - 
    + int write_object_file_literally(const void *buf, unsigned long len,
    + 				const char *type, struct object_id *oid,
    + 				unsigned flags);
     +int stream_loose_object(struct input_stream *in_stream, size_t len,
     +			struct object_id *oid);
    -+
    - int hash_object_file_literally(const void *buf, unsigned long len,
    - 			       const char *type, struct object_id *oid,
    - 			       unsigned flags);
    + 
    + /*
    +  * Add an object file to the in-memory object store, without writing it
-:  ----------- > 5:  4550f3a2745 core doc: modernize core.bigFileThreshold documentation
4:  6bcba6bce66 ! 6:  6a70e49a346 unpack-objects: unpack_non_delta_entry() read data in a stream
    @@ Metadata
     Author: Han Xin <hanxin.hx@alibaba-inc.com>
     
      ## Commit message ##
    -    unpack-objects: unpack_non_delta_entry() read data in a stream
    +    unpack-objects: use stream_loose_object() to unpack large objects
     
    -    We used to call "get_data()" in "unpack_non_delta_entry()" to read the
    -    entire contents of a blob object, no matter how big it is. This
    -    implementation may consume all the memory and cause OOM.
    +    Make use of the stream_loose_object() function introduced in the
    +    preceding commit to unpack large objects. Before this we'd need to
    +    malloc() the size of the blob before unpacking it, which could cause
    +    OOM with very large blobs.
     
    -    By implementing a zstream version of input_stream interface, we can use
    -    a small fixed buffer for "unpack_non_delta_entry()". However, unpack
    -    non-delta objects from a stream instead of from an entrie buffer will
    -    have 10% performance penalty.
    +    We could use this new interface to unpack all blobs, but doing so
    +    would result in a performance penalty of around 10%, as the below
    +    "hyperfine" benchmark will show. We therefore limit this to files
    +    larger than "core.bigFileThreshold":
     
             $ hyperfine \
               --setup \
    @@ Commit message
                         -c core.bigFileThreshold=16k unpack-objects
                         <small.pack' in 'HEAD~1'
     
    -    Therefore, only unpack objects larger than the "core.bigFileThreshold"
    -    in zstream. Until now, the config variable has been used in the
    -    following cases, and our new case belongs to the packfile category.
    +    An earlier version of this patch introduced a new
    +    "core.bigFileStreamingThreshold" instead of re-using the existing
    +    "core.bigFileThreshold" variable[1]. As noted in a detailed overview
    +    of its users in [2] using it has several different meanings.
     
    -     * Archive:
    +    Still, we consider it good enough to simply re-use it. While it's
    +    possible that someone might want to e.g. consider objects "small" for
     +    the purposes of diffing but "big" for the purposes of writing them,
    +    such use-cases are probably too obscure to worry about. We can always
    +    split up "core.bigFileThreshold" in the future if there's a need for
    +    that.
     
    -       + archive.c: write_entry(): write large blob entries to archive in
    -         stream.
    -
    -     * Loose objects:
    -
    -       + object-file.c: index_fd(): when hashing large files in worktree,
    -         read files in a stream, and create one packfile per large blob if
    -         want to save files to git object store.
    -
    -       + object-file.c: read_loose_object(): when checking loose objects
    -         using "git-fsck", do not read full content of large loose objects.
    -
    -     * Packfile:
    -
    -       + fast-import.c: parse_and_store_blob(): streaming large blob from
    -         foreign source to packfile.
    -
    -       + index-pack.c: check_collison(): read and check large blob in stream.
    -
    -       + index-pack.c: unpack_entry_data(): do not return the entire
    -         contents of the big blob from packfile, but uses a fixed buf to
    -         perform some integrity checks on the object.
    -
    -       + pack-check.c: verify_packfile(): used by "git-fsck" and will call
    -         check_object_signature() to check large blob in pack with the
    -         streaming interface.
    -
    -       + pack-objects.c: get_object_details(): set "no_try_delta" for large
    -         blobs when counting objects.
    -
    -       + pack-objects.c: write_no_reuse_object(): streaming large blob to
    -         pack.
    -
    -       + unpack-objects.c: unpack_non_delta_entry(): unpack large blob in
    -         stream from packfile.
    -
    -     * Others:
    -
    -       + diff.c: diff_populate_filespec(): treat large blob file as binary.
    -
    -       + streaming.c: istream_source(): as a helper of "open_istream()" to
    -         select proper streaming interface to read large blob from packfile.
    +    1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
    +    2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
     
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
     
    + ## Documentation/config/core.txt ##
    +@@ Documentation/config/core.txt: usage, at the slight expense of increased disk usage.
     + * Will generally be streamed when written, which avoids excessive
    + memory usage, at the cost of some fixed overhead. Commands that make
    + use of this include linkgit:git-archive[1],
    +-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
    +-linkgit:git-fsck[1].
    ++linkgit:git-fast-import[1], linkgit:git-index-pack[1],
    ++linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
    + 
    + core.excludesFile::
    + 	Specifies the pathname to the file that contains patterns to
    +
      ## builtin/unpack-objects.c ##
     @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type type,
      	}
5:  1bfaf89ee0b < -:  ----------- object-file API: add a format_object_header() function
-- 
2.35.1.940.ge7a5b4b05f2


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v10 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
@ 2022-02-04 14:07               ` Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 2/6] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
                                 ` (5 subsequent siblings)
  6 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-04 14:07 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Han Xin, Jiang Xin,
	Ævar Arnfjörð Bjarmason

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
amount of memory. Allocating memory for a large blob object may cause the
system to run out of memory. In preparation for replacing the use of
"get_data()" to unpack large blob objects in later commits, refactor
"get_data()" to reduce its memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.

The "find [...]objects/?? -type f | wc -l" test idiom being used here
is adapted from the same "find" use added to another test in
d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c        | 39 ++++++++++++++++++++--------
 t/t5328-unpack-large-objects.sh | 45 +++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 11 deletions(-)
 create mode 100755 t/t5328-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index dbeb0680a58..896ea8aceb4 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,31 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress a zstream from stdin and return a buffer of the given
+ * size. The caller is responsible for freeing the returned buffer.
+ *
+ * In dry_run mode, however, "get_data()" is only used to check the
+ * integrity of the data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer, which is reused to hold temporary zstream output,
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +140,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5328-unpack-large-objects.sh b/t/t5328-unpack-large-objects.sh
new file mode 100755
index 00000000000..1432dfc8386
--- /dev/null
+++ b/t/t5328-unpack-large-objects.sh
@@ -0,0 +1,45 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_no_loose () {
+	test $(find dest.git/objects/?? -type f | wc -l) = 0
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs test)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	test_no_loose &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.35.1.940.ge7a5b4b05f2


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v10 2/6] object-file.c: do fsync() and close() before post-write die()
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
@ 2022-02-04 14:07               ` Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 3/6] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
                                 ` (4 subsequent siblings)
  6 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-04 14:07 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley,
	Ævar Arnfjörð Bjarmason

Change write_loose_object() to do an fsync() and close() before the
oideq() sanity check at the end. This change re-joins code that was
split up by the die() sanity check added in 748af44c63e (sha1_file: be
paranoid when creating loose objects, 2010-02-21).

I don't think that this change matters in itself: if we called die()
it was possible that our data wouldn't fully make it to disk, but in
any case we were writing data that we'd consider corrupted. It's
possible that a subsequent "git fsck" will be less confused now.

The real reason to make this change is that in a subsequent commit
we'll split this code in write_loose_object() into a utility function,
all its callers will want the preceding sanity checks, but not the
"oideq" check. By moving the close_loose_object() earlier it'll be
easier to reason about the introduction of the utility function.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 5c9525479c2..edebdc91221 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2001,12 +2001,21 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
+
+	/*
+	 * We already did a write_buffer() to the "fd", let's fsync()
+	 * and close().
+	 *
+	 * We might still die() on a subsequent sanity check, but
+	 * let's not add to that confusion by not flushing any
+	 * outstanding writes to disk first.
+	 */
+	close_loose_object(fd);
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
-	close_loose_object(fd);
-
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.35.1.940.ge7a5b4b05f2


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v10 3/6] object-file.c: refactor write_loose_object() to several steps
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 2/6] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
@ 2022-02-04 14:07               ` Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 4/6] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
                                 ` (3 subsequent siblings)
  6 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-04 14:07 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Han Xin,
	Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a stream
version function ("stream_loose_object()") in later commit to resolve
this issue.

Before introducing that streaming function, do some refactoring on
"write_loose_object()" to reuse code for both versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.
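
Condensed from the diff below, "write_loose_object()" then boils down
to roughly the following (a sketch; the trailing oideq() sanity check
and the finalization of the object file are elided):

    fd = start_loose_object_common(&tmp_file, filename.buf, flags,
                                   &stream, compressed, sizeof(compressed),
                                   &c, hdr, hdrlen);
    if (fd < 0)
            return -1;

    /* Step 3: compress and hash the object body, writing to "fd". */
    stream.next_in = (void *)buf;
    stream.avail_in = len;
    do {
            unsigned char *in0 = stream.next_in;
            ret = git_deflate(&stream, Z_FINISH);
            the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
            if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
                    die(_("unable to write loose object file"));
            stream.next_out = compressed;
            stream.avail_out = sizeof(compressed);
    } while (ret == Z_OK);

    /* Step 4: end the zlib stream; this now also fsync()s and
     * close()s "fd" (see 2/6). */
    end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
                            N_("unable to deflate new object %s (%d)"),
                            N_("deflateEnd on object %s failed (%d)"));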

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 129 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 40 deletions(-)

diff --git a/object-file.c b/object-file.c
index edebdc91221..f5c579e42e3 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1943,6 +1943,87 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of the zlib stream.
+ * - Get the calculated oid into "parano_oid".
+ * - fsync() and close() the "fd".
+ */
+static void end_loose_object_common(int fd, int ret, git_hash_ctx *c,
+				    git_zstream *stream,
+				    struct object_id *parano_oid,
+				    const struct object_id *expected_oid,
+				    const char *die_msg1_fmt,
+				    const char *die_msg2_fmt)
+{
+	if (ret != Z_STREAM_END)
+		die(_(die_msg1_fmt), ret, expected_oid);
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		die(_(die_msg2_fmt), ret, expected_oid);
+	the_hash_algo->final_oid_fn(parano_oid, c);
+
+	/*
+	 * We already did a write_buffer() to the "fd", let's fsync()
+	 * and close().
+	 *
+	 * We might still die() on a subsequent sanity check, but
+	 * let's not add to that confusion by not flushing any
+	 * outstanding writes to disk first.
+	 */
+	close_loose_object(fd);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1957,28 +2038,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1993,24 +2057,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-		    ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
-
-	/*
-	 * We already did a write_buffer() to the "fd", let's fsync()
-	 * and close().
-	 *
-	 * We might still die() on a subsequent sanity check, but
-	 * let's not add to that confusion by not flushing any
-	 * outstanding writes to disk first.
-	 */
-	close_loose_object(fd);
+	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
+				N_("unable to deflate new object %s (%d)"),
+				N_("deflateEnd on object %s failed (%d)"));
 
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
-- 
2.35.1.940.ge7a5b4b05f2


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v10 4/6] object-file.c: add "stream_loose_object()" to handle large object
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
                                 ` (2 preceding siblings ...)
  2022-02-04 14:07               ` [PATCH v10 3/6] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
@ 2022-02-04 14:07               ` Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 5/6] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
                                 ` (2 subsequent siblings)
  6 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-04 14:07 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Han Xin,
	Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using "write_loose_object",
we have to feed it a buffer of the same size as the object, which
will consume lots of memory and may cause OOM. This can be improved by
feeding data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", which is a stream version of
"write_loose_object()" but with a low memory footprint. We will use this
function to unpack large blob objects in a later commit.

Another difference from "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object.

Still, we need to save the temporary file we're preparing
somewhere. We'll do that in the top-level ".git/objects/"
directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
streamed it we'll know the OID, and will move it to its canonical
path.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |  8 ++++
 2 files changed, 107 insertions(+)

diff --git a/object-file.c b/object-file.c
index f5c579e42e3..2ef0bf0e5c3 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2095,6 +2095,105 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose object:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = Z_FINISH;
+		}
+		ret = git_deflate(&stream, flush);
+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
+			die(_("unable to write loose object file"));
+		stream.next_out = compressed;
+		stream.avail_out = sizeof(compressed);
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	end_loose_object_common(fd, ret, &c, &stream, oid, NULL,
+				N_("unable to stream deflate new object (%d)"),
+				N_("deflateEnd on stream object failed (%d)"));
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path; create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index bd2322ed8ce..1099455bc2e 100644
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -261,6 +267,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it
-- 
2.35.1.940.ge7a5b4b05f2


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v10 5/6] core doc: modernize core.bigFileThreshold documentation
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
                                 ` (3 preceding siblings ...)
  2022-02-04 14:07               ` [PATCH v10 4/6] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
@ 2022-02-04 14:07               ` Ævar Arnfjörð Bjarmason
  2022-02-04 14:07               ` [PATCH v10 6/6] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  6 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-04 14:07 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley,
	Ævar Arnfjörð Bjarmason

The core.bigFileThreshold documentation has been largely unchanged
since 5eef828bc03 (fast-import: Stream very large blobs directly to
pack, 2010-02-01).

But since then this setting has been expanded to affect a lot more
than that description indicated. Most notably in how "git diff" treats
them, see 6bf3b813486 (diff --stat: mark any file larger than
core.bigfilethreshold binary, 2014-08-16).

In addition to that, numerous commands and APIs make use of a
streaming mode for files above this threshold.

So let's attempt to summarize 12 years of changes in behavior, which
can be seen with:

    git log --oneline -Gbig_file_thre 5eef828bc03.. -- '*.c'

To do that, turn this into a bullet-point list. The summary Han Xin
produced in [1] helped a lot, but is a bit too detailed for
documentation aimed at users. Let's instead summarize how
user-observable behavior differs, and generally describe how we tend
to stream these files in various commands.

1. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
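
In code, the streaming users share a simple gate on this threshold; a
schematic fragment (the two callees are made up for illustration, but
"big_file_threshold" is the existing global the parsed config value
ends up in):

    if (type == OBJ_BLOB && size > big_file_threshold) {
            /* too big to buffer whole: go through a small fixed buffer */
            stream_it(); /* hypothetical */
    } else {
            /* small enough: keep the whole object in memory */
            buffer_it(); /* hypothetical */
    }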

Helped-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a1..b6a12218665 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -412,17 +412,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression.  Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms.  This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it,
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+  linkgit:gitattributes[5]). This means that e.g. linkgit:git-log[1]
+  and linkgit:git-diff[1] will not show diffs for files above this limit.
+
+* Will generally be streamed when written, which avoids excessive
+memory usage, at the cost of some fixed overhead. Commands that make
+use of this include linkgit:git-archive[1],
+linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
+linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
-- 
2.35.1.940.ge7a5b4b05f2


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v10 6/6] unpack-objects: use stream_loose_object() to unpack large objects
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
                                 ` (4 preceding siblings ...)
  2022-02-04 14:07               ` [PATCH v10 5/6] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
@ 2022-02-04 14:07               ` Ævar Arnfjörð Bjarmason
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  6 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-02-04 14:07 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Han Xin,
	Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Make use of the stream_loose_object() function introduced in the
preceding commit to unpack large objects. Before this we'd need to
malloc() the size of the blob before unpacking it, which could cause
OOM with very large blobs.

We could use this new interface to unpack all blobs, but doing so
would result in a performance penalty of around 10%, as the
"hyperfine" benchmark below shows. We therefore limit this to files
larger than "core.bigFileThreshold":

    $ hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare
       https://github.com/microsoft/scalar.git;
       cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git' \
      ...

    Summary
      './git -C dest.git -c core.bigFileThreshold=512m
      unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=512m unpack-objects
                <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git
                -c core.bigFileThreshold=16k unpack-objects
                <small.pack' in 'HEAD~1'

An earlier version of this patch introduced a new
"core.bigFileStreamingThreshold" instead of re-using the existing
"core.bigFileThreshold" variable[1]. As noted in a detailed overview
of its users in [2] using it has several different meanings.

Still, we consider it good enough to simply re-use it. While it's
possible that someone might want to e.g. consider objects "small" for
the purposes of diffing but "big" for the purposes of writing them
such use-cases are probably too obscure to worry about. We can always
split up "core.bigFileThreshold" in the future if there's a need for
that.

1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 Documentation/config/core.txt   |  4 +-
 builtin/unpack-objects.c        | 71 ++++++++++++++++++++++++++++++++-
 t/t5328-unpack-large-objects.sh | 23 +++++++++--
 3 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index b6a12218665..5aca987632c 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -436,8 +436,8 @@ usage, at the slight expense of increased disk usage.
 * Will generally be streamed when written, which avoids excessive
 memory usage, at the cost of some fixed overhead. Commands that make
 use of this include linkgit:git-archive[1],
-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-linkgit:git-fsck[1].
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 896ea8aceb4..7ce3cb61086 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -343,11 +343,80 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size);
 	if (buf)
 		write_object(nr, type, buf, size);
 }
diff --git a/t/t5328-unpack-large-objects.sh b/t/t5328-unpack-large-objects.sh
index 1432dfc8386..5c1042b4d91 100755
--- a/t/t5328-unpack-large-objects.sh
+++ b/t/t5328-unpack-large-objects.sh
@@ -9,7 +9,11 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 test_no_loose () {
@@ -30,16 +34,29 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
 	test_no_loose &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	test_no_loose
+'
+
 test_done
-- 
2.35.1.940.ge7a5b4b05f2


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v11 0/8] unpack-objects: support streaming blobs to disk
  2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
                                 ` (5 preceding siblings ...)
  2022-02-04 14:07               ` [PATCH v10 6/6] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23               ` Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
                                   ` (9 more replies)
  6 siblings, 10 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley,
	Ævar Arnfjörð Bjarmason

This series by Han Xin was waiting on some in-flight patches that
landed in 430883a70c7 (Merge branch 'ab/object-file-api-updates',
2022-03-16).

This series teaches "git unpack-objects" to stream objects larger than
core.bigFileThreshold to disk. As 8/8 shows, streaming e.g. a 100MB
blob now uses ~5MB of memory instead of ~105MB. This streaming method
is slower if you've got the memory to handle the blobs in-core, but if
you don't, it allows you to unpack the objects at all, where you might
otherwise OOM.

Changes since v10:

 * Renamed the new test file; its number conflicted with a
   since-landed commit-graph test.

 * Some minor code changes to make diffs to the pre-image smaller
   (e.g. the top of the range-diff below)

 * The whole "find dest.git" to see if we have loose objects is now
   either a test for "do we have objects at all?" (--dry-run mode), or
   uses a simpler implementation. We could use
   "test_stdout_line_count" for that.

 * We also test that when we use "unpack-objects" to stream directly
   to a pack, the result is byte-for-byte the same as the source.

 * A new 4/8 that I added allows for more code sharing in
   object-file.c; our two end-state functions now share more logic.

 * Minor typo/grammar/comment etc. fixes throughout.

 * Updated 8/8 with benchmarks; somewhere along the line we lost the
   command to run the benchmark mentioned in the commit message...

1. https://lore.kernel.org/git/cover-v10-0.6-00000000000-20220204T135538Z-avarab@gmail.com/

Han Xin (4):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: use stream_loose_object() to unpack large objects

Ævar Arnfjörð Bjarmason (4):
  object-file.c: do fsync() and close() before post-write die()
  object-file.c: factor out deflate part of write_loose_object()
  core doc: modernize core.bigFileThreshold documentation
  unpack-objects: refactor away unpack_non_delta_entry()

 Documentation/config/core.txt   |  33 +++--
 builtin/unpack-objects.c        | 109 +++++++++++---
 object-file.c                   | 250 +++++++++++++++++++++++++++-----
 object-store.h                  |   8 +
 t/t5351-unpack-large-objects.sh |  61 ++++++++
 5 files changed, 397 insertions(+), 64 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

Range-diff against v10:
1:  e46eb75b98f ! 1:  2103d5bfd96 unpack-objects: low memory footprint for get_data() in dry_run mode
    @@ builtin/unpack-objects.c: static void use(int bytes)
      {
      	git_zstream stream;
     -	void *buf = xmallocz(size);
    -+	unsigned long bufsize;
    -+	void *buf;
    ++	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
    ++	void *buf = xmallocz(bufsize);
      
      	memset(&stream, 0, sizeof(stream));
    -+	if (dry_run && size > 8192)
    -+		bufsize = 8192;
    -+	else
    -+		bufsize = size;
    -+	buf = xmallocz(bufsize);
      
      	stream.next_out = buf;
     -	stream.avail_out = size;
    @@ builtin/unpack-objects.c: static void unpack_delta_entry(enum object_type type,
      		hi = nr;
      		while (lo < hi) {
     
    - ## t/t5328-unpack-large-objects.sh (new) ##
    + ## t/t5351-unpack-large-objects.sh (new) ##
     @@
     +#!/bin/sh
     +#
    @@ t/t5328-unpack-large-objects.sh (new)
     +	git init --bare dest.git
     +}
     +
    -+test_no_loose () {
    -+	test $(find dest.git/objects/?? -type f | wc -l) = 0
    -+}
    -+
     +test_expect_success "create large objects (1.5 MB) and PACK" '
     +	test-tool genrandom foo 1500000 >big-blob &&
     +	test_commit --append foo big-blob &&
     +	test-tool genrandom bar 1500000 >big-blob &&
     +	test_commit --append bar big-blob &&
    -+	PACK=$(echo HEAD | git pack-objects --revs test)
    ++	PACK=$(echo HEAD | git pack-objects --revs pack)
     +'
     +
     +test_expect_success 'set memory limitation to 1MB' '
    @@ t/t5328-unpack-large-objects.sh (new)
     +
     +test_expect_success 'unpack-objects failed under memory limitation' '
     +	prepare_dest &&
    -+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    ++	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
     +	grep "fatal: attempting to allocate" err
     +'
     +
     +test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
     +	prepare_dest &&
    -+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    -+	test_no_loose &&
    ++	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
    ++	test_stdout_line_count = 0 find dest.git/objects -type f &&
     +	test_dir_is_empty dest.git/objects/pack
     +'
     +
2:  48bf9090058 = 2:  6acd8759772 object-file.c: do fsync() and close() before post-write die()
3:  0e33d2a6e35 = 3:  f7b02c307fc object-file.c: refactor write_loose_object() to several steps
-:  ----------- > 4:  20d97cc2605 object-file.c: factor out deflate part of write_loose_object()
4:  9644df5c744 ! 5:  db40f4160c4 object-file.c: add "stream_loose_object()" to handle large object
    @@ Commit message
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## object-file.c ##
     @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	strbuf_addf(&filename, "%s/", get_object_directory());
     +	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
     +
    -+	/* Common steps for write_loose_object and stream_loose_object to
    -+	 * start writing loose oject:
    ++	/*
    ++	 * Common steps for write_loose_object and stream_loose_object to
    ++	 * start writing loose objects:
     +	 *
     +	 *  - Create tmpfile for the loose object.
     +	 *  - Setup zlib stream for compression.
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	/* Then the data itself.. */
     +	do {
     +		unsigned char *in0 = stream.next_in;
    ++
     +		if (!stream.avail_in && !in_stream->is_finished) {
     +			const void *in = in_stream->read(in_stream, &stream.avail_in);
     +			stream.next_in = (void *)in;
     +			in0 = (unsigned char *)in;
     +			/* All data has been read. */
     +			if (in_stream->is_finished)
    -+				flush = Z_FINISH;
    ++				flush = 1;
     +		}
    -+		ret = git_deflate(&stream, flush);
    -+		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
    -+		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
    -+			die(_("unable to write loose object file"));
    -+		stream.next_out = compressed;
    -+		stream.avail_out = sizeof(compressed);
    ++		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
    ++						compressed, sizeof(compressed));
     +		/*
     +		 * Unlike write_loose_object(), we do not have the entire
     +		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
5:  4550f3a2745 = 6:  d8ae2eadb98 core doc: modernize core.bigFileThreshold documentation
-:  ----------- > 7:  2b403e7cd9c unpack-objects: refactor away unpack_non_delta_entry()
6:  6a70e49a346 ! 8:  5eded902496 unpack-objects: use stream_loose_object() to unpack large objects
    @@ Commit message
         malloc() the size of the blob before unpacking it, which could cause
         OOM with very large blobs.
     
    -    We could use this new interface to unpack all blobs, but doing so
    -    would result in a performance penalty of around 10%, as the below
    -    "hyperfine" benchmark will show. We therefore limit this to files
    -    larger than "core.bigFileThreshold":
    -
    -        $ hyperfine \
    -          --setup \
    -          'if ! test -d scalar.git; then git clone --bare
    -           https://github.com/microsoft/scalar.git;
    -           cp scalar.git/objects/pack/*.pack small.pack; fi' \
    -          --prepare 'rm -rf dest.git && git init --bare dest.git' \
    -          ...
    -
    -        Summary
    -          './git -C dest.git -c core.bigFileThreshold=512m
    -          unpack-objects <small.pack' in 'origin/master'
    -            1.01 ± 0.04 times faster than './git -C dest.git
    -                    -c core.bigFileThreshold=512m unpack-objects
    -                    <small.pack' in 'HEAD~1'
    -            1.01 ± 0.04 times faster than './git -C dest.git
    -                    -c core.bigFileThreshold=512m unpack-objects
    -                    <small.pack' in 'HEAD~0'
    -            1.03 ± 0.10 times faster than './git -C dest.git
    -                    -c core.bigFileThreshold=16k unpack-objects
    -                    <small.pack' in 'origin/master'
    -            1.02 ± 0.07 times faster than './git -C dest.git
    -                    -c core.bigFileThreshold=16k unpack-objects
    -                    <small.pack' in 'HEAD~0'
    -            1.10 ± 0.04 times faster than './git -C dest.git
    -                    -c core.bigFileThreshold=16k unpack-objects
    -                    <small.pack' in 'HEAD~1'
    +    We could use the new streaming interface to unpack all blobs, but
    +    doing so would be much slower, as demonstrated e.g. with this
    +    benchmark using git-hyperfine[0]:
    +
    +            rm -rf /tmp/scalar.git &&
    +            git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git &&
    +            mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack &&
    +            git hyperfine \
    +                    -r 2 --warmup 1 \
    +                    -L rev origin/master,HEAD -L v "10,512,1k,1m" \
    +                    -s 'make' \
    +                    -p 'git init --bare dest.git' \
    +                    -c 'rm -rf dest.git' \
    +                    './git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack'
    +
    +    With this change we'll perform worse in terms of speed at lower
    +    core.bigFileThreshold settings, but in return we get lower memory
    +    use:
    +
    +            Summary
    +              './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran
    +                1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
    +                1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
    +                1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
    +                1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
    +                1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
    +                1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
    +                1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
    +
    +    A better benchmark to demonstrate the benefit is this one, which
    +    creates an artificial repo with 1, 25, 50, 75 and 100MB blobs:
    +
    +            rm -rf /tmp/repo &&
    +            git init /tmp/repo &&
    +            (
    +                    cd /tmp/repo &&
    +                    for i in 1 25 50 75 100
    +                    do
    +                            dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024
    +                    done &&
    +                    git add blob.* &&
    +                    git commit -mblobs &&
    +                    git gc &&
    +                    PACK=$(echo .git/objects/pack/pack-*.pack) &&
    +                    cp "$PACK" my.pack
    +            ) &&
    +            git hyperfine \
    +                    --show-output \
    +                    -L rev origin/master,HEAD -L v "512,50m,100m" \
    +                    -s 'make' \
    +                    -p 'git init --bare dest.git' \
    +                    -c 'rm -rf dest.git' \
    +                    '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum'
    +
    +    Using this test we'll always use >100MB of memory on
    +    origin/master (around ~105MB), but max out at e.g. ~55MB if we set
    +    core.bigFileThreshold=50m.
    +
    +    The relevant "Maximum resident set size" lines were manually added
    +    below the relevant benchmark:
    +
    +      '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran
    +            Maximum resident set size (kbytes): 107080
    +        1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
    +            Maximum resident set size (kbytes): 106968
    +        1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
    +            Maximum resident set size (kbytes): 107032
    +        1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
    +            Maximum resident set size (kbytes): 107072
    +        1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
    +            Maximum resident set size (kbytes): 55704
    +        2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
    +            Maximum resident set size (kbytes): 4564
    +
    +    This shows that if you have enough memory, this new streaming
    +    method is slower the lower you set the streaming threshold, but the
    +    benefit is bounded memory use.
     
         An earlier version of this patch introduced a new
         "core.bigFileStreamingThreshold" instead of re-using the existing
    @@ Commit message
         split up "core.bigFileThreshold" in the future if there's a need for
         that.
     
    +    0. https://github.com/avar/git-hyperfine/
         1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
         2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
     
    @@ Commit message
         Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
         Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## Documentation/config/core.txt ##
     @@ Documentation/config/core.txt: usage, at the slight expense of increased disk usage.
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	return data->buf;
     +}
     +
    -+static void write_stream_blob(unsigned nr, size_t size)
    ++static void stream_blob(unsigned long size, unsigned nr)
     +{
     +	git_zstream zstream = { 0 };
     +	struct input_zstream_data data = { 0 };
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +		.read = feed_input_zstream,
     +		.data = &data,
     +	};
    ++	struct obj_info *info = &obj_list[nr];
     +
     +	data.zstream = &zstream;
     +	git_inflate_init(&zstream);
     +
    -+	if (stream_loose_object(&in_stream, size, &obj_list[nr].oid))
    ++	if (stream_loose_object(&in_stream, size, &info->oid))
     +		die(_("failed to write object in stream"));
     +
     +	if (data.status != Z_STREAM_END)
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
     +	git_inflate_end(&zstream);
     +
     +	if (strict) {
    -+		struct blob *blob =
    -+			lookup_blob(the_repository, &obj_list[nr].oid);
    -+		if (blob)
    -+			blob->object.flags |= FLAG_WRITTEN;
    -+		else
    ++		struct blob *blob = lookup_blob(the_repository, &info->oid);
    ++
    ++		if (!blob)
     +			die(_("invalid blob object from stream"));
    ++		blob->object.flags |= FLAG_WRITTEN;
     +	}
    -+	obj_list[nr].obj = NULL;
    ++	info->obj = NULL;
     +}
     +
    - static void unpack_non_delta_entry(enum object_type type, unsigned long size,
    - 				   unsigned nr)
    + static int resolve_against_held(unsigned nr, const struct object_id *base,
    + 				void *delta_data, unsigned long delta_size)
      {
    --	void *buf = get_data(size);
    -+	void *buf;
    -+
    -+	/* Write large blob in stream without allocating full buffer. */
    -+	if (!dry_run && type == OBJ_BLOB && size > big_file_threshold) {
    -+		write_stream_blob(nr, size);
    -+		return;
    -+	}
    +@@ builtin/unpack-objects.c: static void unpack_one(unsigned nr)
      
    -+	buf = get_data(size);
    - 	if (buf)
    - 		write_object(nr, type, buf, size);
    - }
    + 	switch (type) {
    + 	case OBJ_BLOB:
    ++		if (!dry_run && size > big_file_threshold) {
    ++			stream_blob(size, nr);
    ++			return;
    ++		}
    ++		/* fallthrough */
    + 	case OBJ_COMMIT:
    + 	case OBJ_TREE:
    + 	case OBJ_TAG:
     
    - ## t/t5328-unpack-large-objects.sh ##
    -@@ t/t5328-unpack-large-objects.sh: test_description='git unpack-objects with large objects'
    + ## t/t5351-unpack-large-objects.sh ##
    +@@ t/t5351-unpack-large-objects.sh: test_description='git unpack-objects with large objects'
      
      prepare_dest () {
      	test_when_finished "rm -rf dest.git" &&
     -	git init --bare dest.git
     +	git init --bare dest.git &&
    -+	if test -n "$1"
    -+	then
    -+		git -C dest.git config core.bigFileThreshold $1
    -+	fi
    ++	git -C dest.git config core.bigFileThreshold "$1"
      }
      
    - test_no_loose () {
    -@@ t/t5328-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1MB' '
    + test_expect_success "create large objects (1.5 MB) and PACK" '
    +@@ t/t5351-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1MB' '
      '
      
      test_expect_success 'unpack-objects failed under memory limitation' '
     -	prepare_dest &&
     +	prepare_dest 2m &&
    - 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
    + 	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
      	grep "fatal: attempting to allocate" err
      '
      
      test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
     -	prepare_dest &&
     +	prepare_dest 2m &&
    - 	git -C dest.git unpack-objects -n <test-$PACK.pack &&
    - 	test_no_loose &&
    + 	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
    + 	test_stdout_line_count = 0 find dest.git/objects -type f &&
      	test_dir_is_empty dest.git/objects/pack
      '
      
     +test_expect_success 'unpack big object in stream' '
     +	prepare_dest 1m &&
    -+	git -C dest.git unpack-objects <test-$PACK.pack &&
    ++	git -C dest.git unpack-objects <pack-$PACK.pack &&
     +	test_dir_is_empty dest.git/objects/pack
     +'
     +
     +test_expect_success 'do not unpack existing large objects' '
     +	prepare_dest 1m &&
    -+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
    -+	git -C dest.git unpack-objects <test-$PACK.pack &&
    -+	test_no_loose
    ++	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
    ++	git -C dest.git unpack-objects <pack-$PACK.pack &&
    ++
    ++	# The destination came up with the exact same pack...
    ++	DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
    ++	test_cmp pack-$PACK.pack $DEST_PACK &&
    ++
    ++	# ...and wrote no loose objects
    ++	test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
     +'
     +
      test_done
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v11 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23                 ` Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 2/8] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
                                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Han Xin, Jiang Xin,
	Ævar Arnfjörð Bjarmason

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
amount of memory. Allocating memory for a large blob object may cause the
system to run out of memory. In preparation for replacing the call to
"get_data()" used to unpack large blob objects in later commits, refactor
"get_data()" to reduce its memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.
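
A standalone sketch of the same buffer-reuse idea, using plain zlib
rather than git's zstream wrappers ("check_deflated()" is a made-up
name for illustration, not part of this patch): inflate into a small
fixed-size buffer and keep resetting it, so the whole stream is
integrity-checked without ever holding the inflated data in memory:

    #include <stddef.h>
    #include <zlib.h>

    static int check_deflated(const unsigned char *in, size_t inlen)
    {
        unsigned char out[8192]; /* small output buffer, reused below */
        z_stream z = { 0 };
        int ret;

        if (inflateInit(&z) != Z_OK)
            return -1;
        z.next_in = (unsigned char *)in;
        z.avail_in = (uInt)inlen;
        do {
            /* reuse the buffer: the inflated bytes are discarded */
            z.next_out = out;
            z.avail_out = sizeof(out);
            ret = inflate(&z, Z_NO_FLUSH);
        } while (ret == Z_OK);
        inflateEnd(&z);
        return ret == Z_STREAM_END ? 0 : -1; /* 0 means: stream intact */
    }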

The "find [...]objects/?? -type f | wc -l" test idiom being used here
is adapted from the same "find" use added to another test in
d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c        | 34 ++++++++++++++++++---------
 t/t5351-unpack-large-objects.sh | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 11 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index dbeb0680a58..e3d30025979 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,26 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from stdin and return a buffer of the requested
+ * size. The caller is responsible for freeing the returned buffer.
+ *
+ * But in dry_run mode, "get_data()" is only used to check the
+ * integrity of the data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer, which is reused to hold temporary zstream output,
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +135,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +343,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +374,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +411,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
new file mode 100755
index 00000000000..8d84313221c
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs pack)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+	test_stdout_line_count = 0 find dest.git/objects -type f &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v11 2/8] object-file.c: do fsync() and close() before post-write die()
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23                 ` Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 3/8] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
                                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley,
	Ævar Arnfjörð Bjarmason

Change write_loose_object() to do an fsync() and close() before the
oideq() sanity check at the end. This change re-joins code that was
split up by the die() sanity check added in 748af44c63e (sha1_file: be
paranoid when creating loose objects, 2010-02-21).

I don't think that this change matters in itself: if we called die()
it was possible that our data wouldn't fully make it to disk, but in
any case we were writing data that we'd consider corrupted. It's
possible that a subsequent "git fsck" will be less confused now.

The real reason to make this change is that in a subsequent commit
we'll split this code in write_loose_object() into a utility function;
all its callers will want the preceding sanity checks, but not the
"oideq" check. By moving the close_loose_object() earlier, it'll be
easier to reason about the introduction of the utility function.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index bdc5cbdd386..4c140eda6bf 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2001,12 +2001,21 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
+
+	/*
+	 * We already did a write_buffer() to the "fd", let's fsync()
+	 * and close().
+	 *
+	 * We might still die() on a subsequent sanity check, but
+	 * let's not add to that confusion by not flushing any
+	 * outstanding writes to disk first.
+	 */
+	close_loose_object(fd);
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
-	close_loose_object(fd);
-
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v11 3/8] object-file.c: refactor write_loose_object() to several steps
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 2/8] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23                 ` Ævar Arnfjörð Bjarmason
  2022-03-19 10:11                   ` René Scharfe
  2022-03-19  0:23                 ` [PATCH v11 4/8] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
                                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Han Xin,
	Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer containing the whole content of the blob, and this behavior
will consume lots of memory and may cause OOM. We will introduce a
streaming version of this function ("stream_loose_object()") in a later
commit to resolve this issue.

Before introducing that streaming function, do some refactoring on
"write_loose_object()" to reuse code for both versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.
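
A compact sketch of the resulting shape of "write_loose_object()"
(error handling and the writing-out of the "compressed" buffer are
elided here; the real code is in the diff below):

    fd = start_loose_object_common(&tmp_file, filename.buf, flags,
                                   &stream, compressed, sizeof(compressed),
                                   &c, hdr, hdrlen);    /* steps 1. and 2. */
    if (fd < 0)
        return -1;

    /* step 3.: deflate and hash the data itself */
    stream.next_in = (void *)buf;
    stream.avail_in = len;
    do {
        unsigned char *in0 = stream.next_in;
        ret = git_deflate(&stream, Z_FINISH);
        the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
        /* ... write_buffer() the "compressed" bytes, reset the stream ... */
    } while (ret == Z_OK);

    end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
                            N_("unable to deflate new object %s (%d)"),
                            N_("deflateEnd on object %s failed (%d)"));
                                                        /* steps 4. and 5. */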

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 129 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 40 deletions(-)

diff --git a/object-file.c b/object-file.c
index 4c140eda6bf..4fcaf7a36ce 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1943,6 +1943,87 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "parano_oid".
+ * - fsync() and close() the "fd"
+ */
+static void end_loose_object_common(int fd, int ret, git_hash_ctx *c,
+				    git_zstream *stream,
+				    struct object_id *parano_oid,
+				    const struct object_id *expected_oid,
+				    const char *die_msg1_fmt,
+				    const char *die_msg2_fmt)
+{
+	if (ret != Z_STREAM_END)
+		die(_(die_msg1_fmt), ret, expected_oid);
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		die(_(die_msg2_fmt), ret, expected_oid);
+	the_hash_algo->final_oid_fn(parano_oid, c);
+
+	/*
+	 * We already did a write_buffer() to the "fd", let's fsync()
+	 * and close().
+	 *
+	 * We might still die() on a subsequent sanity check, but
+	 * let's not add to that confusion by not flushing any
+	 * outstanding writes to disk first.
+	 */
+	close_loose_object(fd);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1957,28 +2038,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1993,24 +2057,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		stream.avail_out = sizeof(compressed);
 	} while (ret == Z_OK);
 
-	if (ret != Z_STREAM_END)
-		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
-		    ret);
-	ret = git_deflate_end_gently(&stream);
-	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
-
-	/*
-	 * We already did a write_buffer() to the "fd", let's fsync()
-	 * and close().
-	 *
-	 * We might still die() on a subsequent sanity check, but
-	 * let's not add to that confusion by not flushing any
-	 * outstanding writes to disk first.
-	 */
-	close_loose_object(fd);
+	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
+				N_("unable to deflate new object %s (%d)"),
+				N_("deflateEnd on object %s failed (%d)"));
 
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v11 4/8] object-file.c: factor out deflate part of write_loose_object()
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                   ` (2 preceding siblings ...)
  2022-03-19  0:23                 ` [PATCH v11 3/8] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23                 ` Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 5/8] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
                                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley,
	Ævar Arnfjörð Bjarmason

Split out the part of write_loose_object() that deals with calling
git_deflate() into a utility function; a subsequent commit will
introduce another function that'll make use of it.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index 4fcaf7a36ce..b66dc24e4b8 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1992,6 +1992,28 @@ static int start_loose_object_common(struct strbuf *tmp_file,
 	return fd;
 }
 
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+				     git_zstream *stream, const int flush,
+				     unsigned char *in0, const int fd,
+				     unsigned char *compressed,
+				     const size_t compressed_len)
+{
+	int ret;
+
+	ret = git_deflate(stream, flush ? Z_FINISH : 0);
+	the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+	if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+		die(_("unable to write loose object file"));
+	stream->next_out = compressed;
+	stream->avail_out = compressed_len;
+
+	return ret;
+}
+
 /**
  * Common steps for loose object writers to end writing loose objects:
  *
@@ -2049,12 +2071,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
+
+		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+						compressed, sizeof(compressed));
 	} while (ret == Z_OK);
 
 	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v11 5/8] object-file.c: add "stream_loose_object()" to handle large object
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                   ` (3 preceding siblings ...)
  2022-03-19  0:23                 ` [PATCH v11 4/8] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23                 ` Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 6/8] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
                                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Han Xin,
	Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using
"write_loose_object()", we have to feed it a buffer of the same size
as the object, which will consume lots of memory and may cause OOM.
This can be improved by feeding data to "stream_loose_object()" in a
stream.

Add a new function "stream_loose_object()", which is a stream version of
"write_loose_object()" but with a low memory footprint. We will use this
function to unpack large blob objects in a later commit.

Another difference from "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object.

Still, we need to save the temporary file we're preparing
somewhere. We'll do that in the top-level ".git/objects/"
directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
streamed it, we'll know the OID and will move it to its canonical
path.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c  | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |  8 +++++
 2 files changed, 105 insertions(+)

diff --git a/object-file.c b/object-file.c
index b66dc24e4b8..548fef71b98 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2114,6 +2114,103 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose objects:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = 1;
+		}
+		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+						compressed, sizeof(compressed));
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose objects:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	end_loose_object_common(fd, ret, &c, &stream, oid, NULL,
+				N_("unable to stream deflate new object (%d)"),
+				N_("deflateEnd on stream object failed (%d)"));
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index bd2322ed8ce..1099455bc2e 100644
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -261,6 +267,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v11 6/8] core doc: modernize core.bigFileThreshold documentation
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                   ` (4 preceding siblings ...)
  2022-03-19  0:23                 ` [PATCH v11 5/8] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23                 ` Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 7/8] unpack-objects: refactor away unpack_non_delta_entry() Ævar Arnfjörð Bjarmason
                                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley,
	Ævar Arnfjörð Bjarmason

The core.bigFileThreshold documentation has been largely unchanged
since 5eef828bc03 (fast-import: Stream very large blobs directly to
pack, 2010-02-01).

But since then this setting has been expanded to affect a lot more
than that description indicated. Most notably in how "git diff" treats
them, see 6bf3b813486 (diff --stat: mark any file larger than
core.bigfilethreshold binary, 2014-08-16).

In addition to that, numerous commands and APIs make use of a
streaming mode for files above this threshold.

So let's attempt to summarize 12 years of changes in behavior, which
can be seen with:

    git log --oneline -Gbig_file_thre 5eef828bc03.. -- '*.c'

To do that, turn this into a bullet-point list. The summary Han Xin
produced in [1] helped a lot, but is a bit too detailed for
documentation aimed at users. Let's instead summarize how
user-observable behavior differs, and generally describe how we tend
to stream these files in various commands.

1. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
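
For illustration, this is roughly how such a size with a unit suffix
can be parsed (a standalone sketch, not git's actual config parser,
which also accepts uppercase suffixes and checks for overflow):

	#include <ctype.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Parse "512", "1k", "50m" or "2g" into bytes. */
	static int parse_size(const char *s, unsigned long *out)
	{
		char *end;
		unsigned long v = strtoul(s, &end, 10);
		unsigned long factor = 1;

		if (end == s)
			return -1;
		switch (tolower((unsigned char)*end)) {
		case 'g': factor <<= 10; /* fallthrough */
		case 'm': factor <<= 10; /* fallthrough */
		case 'k': factor <<= 10; end++; break;
		case '\0': break;
		default: return -1;
		}
		if (*end)
			return -1;
		*out = v * factor;
		return 0;
	}

	int main(void)
	{
		unsigned long bytes;

		if (!parse_size("512m", &bytes))
			printf("core.bigFileThreshold=512m -> %lu bytes\n",
			       bytes);
		return 0;
	}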

Helped-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a1..b6a12218665 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -412,17 +412,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression.  Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms.  This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+  linkgit:gitattributes[5]). This means that e.g. linkgit:git-log[1]
+  and linkgit:git-diff[1] will not show diffs for files above this
+  limit.
++
+* Will generally be streamed when written, which avoids excessive
+memory usage, at the cost of some fixed overhead. Commands that make
+use of this include linkgit:git-archive[1],
+linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
+linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v11 7/8] unpack-objects: refactor away unpack_non_delta_entry()
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                   ` (5 preceding siblings ...)
  2022-03-19  0:23                 ` [PATCH v11 6/8] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23                 ` Ævar Arnfjörð Bjarmason
  2022-03-19  0:23                 ` [PATCH v11 8/8] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
                                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley,
	Ævar Arnfjörð Bjarmason

The unpack_one() function will call either a non-trivial
unpack_delta_entry() or a trivial unpack_non_delta_entry(). Let's
inline the latter in the only caller.

Since 21666f1aae4 (convert object type handling from a string to a
number, 2007-02-26) the unpack_non_delta_entry() function has been
rather trivial, and in a preceding commit the "dry_run" condition it
was handling went away.

This is not done as an optimization, as the compiler will easily
discover that it can do the same; rather, this makes a subsequent
commit easier to reason about. As it'll be handling "OBJ_BLOB" in a
special manner let's re-arrange that "case" in preparation for that
change.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index e3d30025979..d374599d544 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -338,15 +338,6 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
-static void unpack_non_delta_entry(enum object_type type, unsigned long size,
-				   unsigned nr)
-{
-	void *buf = get_data(size);
-
-	if (buf)
-		write_object(nr, type, buf, size);
-}
-
 static int resolve_against_held(unsigned nr, const struct object_id *base,
 				void *delta_data, unsigned long delta_size)
 {
@@ -479,12 +470,17 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
-		unpack_non_delta_entry(type, size, nr);
+	{
+		void *buf = get_data(size);
+
+		if (buf)
+			write_object(nr, type, buf, size);
 		return;
+	}
 	case OBJ_REF_DELTA:
 	case OBJ_OFS_DELTA:
 		unpack_delta_entry(type, size, nr);
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v11 8/8] unpack-objects: use stream_loose_object() to unpack large objects
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                   ` (6 preceding siblings ...)
  2022-03-19  0:23                 ` [PATCH v11 7/8] unpack-objects: refactor away unpack_non_delta_entry() Ævar Arnfjörð Bjarmason
@ 2022-03-19  0:23                 ` Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  2022-05-20  3:05                 ` [PATCH 0/1] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
  9 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-19  0:23 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Han Xin,
	Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Make use of the stream_loose_object() function introduced in the
preceding commit to unpack large objects. Before this we'd need to
malloc() the size of the blob before unpacking it, which could cause
OOM with very large blobs.
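
To illustrate why streaming bounds memory, here is a standalone zlib
sketch (not git code): it inflates stdin to stdout through two fixed
8K buffers, so peak memory stays constant no matter how large the
object is:

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	int main(void)
	{
		unsigned char in[8192], out[8192];
		z_stream zs;
		int ret = Z_OK;

		memset(&zs, 0, sizeof(zs));
		if (inflateInit(&zs) != Z_OK)
			return 1;
		do {
			zs.avail_in = fread(in, 1, sizeof(in), stdin);
			zs.next_in = in;
			do {
				zs.next_out = out;
				zs.avail_out = sizeof(out);
				ret = inflate(&zs, Z_NO_FLUSH);
				if (ret == Z_NEED_DICT ||
				    ret == Z_DATA_ERROR ||
				    ret == Z_MEM_ERROR)
					return 1;
				/* Flush whatever fits the small window. */
				fwrite(out, 1, sizeof(out) - zs.avail_out,
				       stdout);
			} while (zs.avail_out == 0);
		} while (ret != Z_STREAM_END && !feof(stdin));
		inflateEnd(&zs);
		return ret == Z_STREAM_END ? 0 : 1;
	}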

We could use the new streaming interface to unpack all blobs, but
doing so would be much slower, as demonstrated e.g. with this
benchmark using git-hyperfine[0]:

	rm -rf /tmp/scalar.git &&
	git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git &&
	mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack &&
	git hyperfine \
		-r 2 --warmup 1 \
		-L rev origin/master,HEAD -L v "10,512,1k,1m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack'

Here, with this change, we'll perform worse in terms of speed the
lower we set core.bigFileThreshold, but we're getting lower memory use
in return:

	Summary
	  './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'

A better benchmark to demonstrate the benefits is this one, which
creates an artificial repo with 1, 25, 50, 75 and 100MB blobs:

	rm -rf /tmp/repo &&
	git init /tmp/repo &&
	(
		cd /tmp/repo &&
		for i in 1 25 50 75 100
		do
			dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024
		done &&
		git add blob.* &&
		git commit -mblobs &&
		git gc &&
		PACK=$(echo .git/objects/pack/pack-*.pack) &&
		cp "$PACK" my.pack
	) &&
	git hyperfine \
		--show-output \
		-L rev origin/master,HEAD -L v "512,50m,100m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum'

Using this test we'll always use >100MB of memory on
origin/master (around ~105MB), but max out at e.g. ~55MB if we set
core.bigFileThreshold=50m.

The relevant "Maximum resident set size" lines were manually added
below the corresponding benchmark:

  '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran
        Maximum resident set size (kbytes): 107080
    1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 106968
    1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 107032
    1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 107072
    1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 55704
    2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 4564

This shows that if you have enough memory, this new streaming method is
slower the lower you set the streaming threshold, but the benefit is
more bounded memory use.

An earlier version of this patch introduced a new
"core.bigFileStreamingThreshold" instead of re-using the existing
"core.bigFileThreshold" variable[1]. As noted in a detailed overview
of its users in [2] using it has several different meanings.

Still, we consider it good enough to simply re-use it. While it's
possible that someone might want to e.g. consider objects "small" for
the purposes of diffing but "big" for the purposes of writing them,
such use-cases are probably too obscure to worry about. We can always
split up "core.bigFileThreshold" in the future if there's a need for
that.

0. https://github.com/avar/git-hyperfine/
1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt   |  4 +-
 builtin/unpack-objects.c        | 67 +++++++++++++++++++++++++++++++++
 t/t5351-unpack-large-objects.sh | 26 +++++++++++--
 3 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index b6a12218665..5aca987632c 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -436,8 +436,8 @@ usage, at the slight expense of increased disk usage.
 * Will generally be streamed when written, which avoids excessive
 memory usage, at the cost of some fixed overhead. Commands that make
 use of this include linkgit:git-archive[1],
-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-linkgit:git-fsck[1].
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index d374599d544..9d7b325c23b 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -338,6 +338,68 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
+}
+
 static int resolve_against_held(unsigned nr, const struct object_id *base,
 				void *delta_data, unsigned long delta_size)
 {
@@ -471,6 +533,11 @@ static void unpack_one(unsigned nr)
 
 	switch (type) {
 	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
 	case OBJ_TAG:
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
index 8d84313221c..461ca060b2b 100755
--- a/t/t5351-unpack-large-objects.sh
+++ b/t/t5351-unpack-large-objects.sh
@@ -9,7 +9,8 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold "$1"
 }
 
 test_expect_success "create large objects (1.5 MB) and PACK" '
@@ -26,16 +27,35 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
 	test_stdout_line_count = 0 find dest.git/objects -type f &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+
+	# The destination came up with the exact same pack...
+	DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
+	test_cmp pack-$PACK.pack $DEST_PACK &&
+
+	# ...and wrote no loose objects
+	test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
+'
+
 test_done
-- 
2.35.1.1438.g8874c8eeb35


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v11 3/8] object-file.c: refactor write_loose_object() to several steps
  2022-03-19  0:23                 ` [PATCH v11 3/8] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
@ 2022-03-19 10:11                   ` René Scharfe
  0 siblings, 0 replies; 211+ messages in thread
From: René Scharfe @ 2022-03-19 10:11 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason, git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, Derrick Stolee, Philip Oakley,
	Han Xin, Jiang Xin

Am 19.03.22 um 01:23 schrieb Ævar Arnfjörð Bjarmason:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> When writing a large blob using "write_loose_object()", we have to pass
> a buffer with the whole content of the blob, and this behavior will
> consume lots of memory and may cause OOM. We will introduce a stream
> version function ("stream_loose_object()") in later commit to resolve
> this issue.
>
> Before introducing that streaming function, do some refactoring on
> "write_loose_object()" to reuse code for both versions.
>
> Rewrite "write_loose_object()" as follows:
>
>  1. Figure out a path for the (temp) object file. This step is only
>     used in "write_loose_object()".
>
>  2. Move common steps for starting to write loose objects into a new
>     function "start_loose_object_common()".
>
>  3. Compress data.
>
>  4. Move common steps for ending zlib stream into a new function
>     "end_loose_object_common()".
>
>  5. Close fd and finalize the object file.
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>  object-file.c | 129 ++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 89 insertions(+), 40 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 4c140eda6bf..4fcaf7a36ce 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1943,6 +1943,87 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>
> +/**
> + * Common steps for loose object writers to start writing loose
> + * objects:
> + *
> + * - Create tmpfile for the loose object.
> + * - Setup zlib stream for compression.
> + * - Start to feed header to zlib stream.
> + *
> + * Returns a "fd", which should later be provided to
> + * end_loose_object_common().
> + */
> +static int start_loose_object_common(struct strbuf *tmp_file,
> +				     const char *filename, unsigned flags,
> +				     git_zstream *stream,
> +				     unsigned char *buf, size_t buflen,
> +				     git_hash_ctx *c,
> +				     char *hdr, int hdrlen)
> +{
> +	int fd;
> +
> +	fd = create_tmpfile(tmp_file, filename);
> +	if (fd < 0) {
> +		if (flags & HASH_SILENT)
> +			return -1;
> +		else if (errno == EACCES)
> +			return error(_("insufficient permission for adding "
> +				       "an object to repository database %s"),
> +				     get_object_directory());
> +		else
> +			return error_errno(
> +				_("unable to create temporary file"));
> +	}
> +
> +	/*  Setup zlib stream for compression */
> +	git_deflate_init(stream, zlib_compression_level);
> +	stream->next_out = buf;
> +	stream->avail_out = buflen;
> +	the_hash_algo->init_fn(c);
> +
> +	/*  Start to feed header to zlib stream */
> +	stream->next_in = (unsigned char *)hdr;
> +	stream->avail_in = hdrlen;
> +	while (git_deflate(stream, 0) == Z_OK)
> +		; /* nothing */
> +	the_hash_algo->update_fn(c, hdr, hdrlen);
> +
> +	return fd;
> +}
> +
> +/**
> + * Common steps for loose object writers to end writing loose objects:
> + *
> + * - End the compression of zlib stream.
> + * - Get the calculated oid to "parano_oid".
> + * - fsync() and close() the "fd"
> + */
> +static void end_loose_object_common(int fd, int ret, git_hash_ctx *c,
> +				    git_zstream *stream,
> +				    struct object_id *parano_oid,
> +				    const struct object_id *expected_oid,
> +				    const char *die_msg1_fmt,
> +				    const char *die_msg2_fmt)
> +{
> +	if (ret != Z_STREAM_END)
> +		die(_(die_msg1_fmt), ret, expected_oid);
> +	ret = git_deflate_end_gently(stream);
> +	if (ret != Z_OK)
> +		die(_(die_msg2_fmt), ret, expected_oid);

stream_loose_object(), added in patch 5, passes NULL as expected_oid,
but these die() messages need a valid value.  end_loose_object_common()
has more parameters than lines of code in its body.  Inline it to allow
fully custom messages?
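
Something along these lines, as a standalone sketch with a minimal
stand-in for git's die(); each caller keeps a complete message of its
own, so nobody has to pass NULL for a "%s" it cannot fill (which would
be undefined behavior):

	#include <stdarg.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Minimal stand-in for git's die() from usage.c. */
	static void die(const char *fmt, ...)
	{
		va_list ap;

		va_start(ap, fmt);
		fputs("fatal: ", stderr);
		vfprintf(stderr, fmt, ap);
		fputc('\n', stderr);
		va_end(ap);
		exit(128);
	}

	int main(void)
	{
		int ret = 0; /* stand-in for a git_deflate_end_gently() result */

		/*
		 * The stream writer has no oid yet, so its message
		 * contains no "%s" at all; the buffer writer, which does
		 * know the oid, passes its own format instead of sharing
		 * this one.
		 */
		if (ret != 0)
			die("deflateEnd on stream object failed (%d)", ret);
		return 0;
	}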

> +	the_hash_algo->final_oid_fn(parano_oid, c);
> +
> +	/*
> +	 * We already did a write_buffer() to the "fd", let's fsync()
> +	 * and close().
> +	 *
> +	 * We might still die() on a subsequent sanity check, but
> +	 * let's not add to that confusion by not flushing any
> +	 * outstanding writes to disk first.
> +	 */
> +	close_loose_object(fd);
> +}
> +
>  static int write_loose_object(const struct object_id *oid, char *hdr,
>  			      int hdrlen, const void *buf, unsigned long len,
>  			      time_t mtime, unsigned flags)
> @@ -1957,28 +2038,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>
>  	loose_object_path(the_repository, &filename, oid);
>
> -	fd = create_tmpfile(&tmp_file, filename.buf);
> -	if (fd < 0) {
> -		if (flags & HASH_SILENT)
> -			return -1;
> -		else if (errno == EACCES)
> -			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
> -		else
> -			return error_errno(_("unable to create temporary file"));
> -	}
> -
> -	/* Set it up */
> -	git_deflate_init(&stream, zlib_compression_level);
> -	stream.next_out = compressed;
> -	stream.avail_out = sizeof(compressed);
> -	the_hash_algo->init_fn(&c);
> -
> -	/* First header.. */
> -	stream.next_in = (unsigned char *)hdr;
> -	stream.avail_in = hdrlen;
> -	while (git_deflate(&stream, 0) == Z_OK)
> -		; /* nothing */
> -	the_hash_algo->update_fn(&c, hdr, hdrlen);
> +	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
> +				       &stream, compressed, sizeof(compressed),
> +				       &c, hdr, hdrlen);
> +	if (fd < 0)
> +		return -1;
>
>  	/* Then the data itself.. */
>  	stream.next_in = (void *)buf;
> @@ -1993,24 +2057,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		stream.avail_out = sizeof(compressed);
>  	} while (ret == Z_OK);
>
> -	if (ret != Z_STREAM_END)
> -		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
> -		    ret);
> -	ret = git_deflate_end_gently(&stream);
> -	if (ret != Z_OK)
> -		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
> -		    ret);
> -	the_hash_algo->final_oid_fn(&parano_oid, &c);
> -
> -	/*
> -	 * We already did a write_buffer() to the "fd", let's fsync()
> -	 * and close().
> -	 *
> -	 * We might still die() on a subsequent sanity check, but
> -	 * let's not add to that confusion by not flushing any
> -	 * outstanding writes to disk first.
> -	 */
> -	close_loose_object(fd);
> +	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
> +				N_("unable to deflate new object %s (%d)"),
> +				N_("deflateEnd on object %s failed (%d)"));
>
>  	if (!oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v12 0/8] unpack-objects: support streaming blobs to disk
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                   ` (7 preceding siblings ...)
  2022-03-19  0:23                 ` [PATCH v11 8/8] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                 ` Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                   ` [PATCH v12 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
                                     ` (8 more replies)
  2022-05-20  3:05                 ` [PATCH 0/1] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
  9 siblings, 9 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

This series by Han Xin was waiting on some in-flight patches that
landed in 430883a70c7 (Merge branch 'ab/object-file-api-updates',
2022-03-16).

This series teaches "git unpack-objects" to stream objects larger than
core.bigFileThreshold to disk. As 8/8 shows streaming e.g. a 100MB
blob now uses ~5MB of memory instead of ~105MB. This streaming method
is slower if you've got memory to handle the blobs in-core, but if you
don't it allows you to unpack objects at all, as you might otherwise
OOM.

Changes since v10[1]:

 * René rightly spotted that the end_loose_object_common() function
   was feeding NULL to a format. That's now fixed, and parts of that
   function were pulled out into the two callers to make the trade-off
   of factoring that logic out worth it.

 * This topic conflicts with ns/batch-fsync in "seen" (see below). I
   moved an inline comment on close_loose_object() around to make the
   conflict easier (and it's better placed with the function anyway,
   as we'll get two callers of it).

For context, this is the --remerge-diff with "seen" after resolving the
conflict. Both textual and semantic (there's a new caller in this
topic) conflicts are caught by the compiler:

	diff --git a/object-file.c b/object-file.c
	remerge CONFLICT (content): Merge conflict in object-file.c
	index 9c640f1f39d..6068f8ec6c4 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -1887,7 +1887,6 @@ void hash_object_file(const struct git_hash_algo *algo, const void *buf,
	 	hash_object_file_literally(algo, buf, len, type_name(type), oid);
	 }

	-<<<<<<< 34ee6a28a54 (unpack-objects: use stream_loose_object() to unpack large objects)
	 /*
	  * We already did a write_buffer() to the "fd", let's fsync()
	  * and close().
	@@ -1896,11 +1895,7 @@ void hash_object_file(const struct git_hash_algo *algo, const void *buf,
	  * subsequent sanity check, but let's not add to that confusion by not
	  * flushing any outstanding writes to disk first.
	  */
	-static void close_loose_object(int fd)
	-=======
	-/* Finalize a file on disk, and close it. */
	 static void close_loose_object(int fd, const char *filename)
	->>>>>>> b1423c89b5a (Merge branch 'ab/reftable-aix-xlc-12' into seen)
	 {
	 	if (the_repository->objects->odb->will_destroy)
	 		goto out;
	@@ -2093,17 +2088,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
	 	ret = end_loose_object_common(&c, &stream, &parano_oid);
	 	if (ret != Z_OK)
	 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret);
	-	close_loose_object(fd);
	+	close_loose_object(fd, tmp_file.buf);

	 	if (!oideq(oid, &parano_oid))
	 		die(_("confused by unstable object source data for %s"),
	 		    oid_to_hex(oid));

	-<<<<<<< 34ee6a28a54 (unpack-objects: use stream_loose_object() to unpack large objects)
	-=======
	-	close_loose_object(fd, tmp_file.buf);
	-
	->>>>>>> b1423c89b5a (Merge branch 'ab/reftable-aix-xlc-12' into seen)
	 	if (mtime) {
	 		struct utimbuf utb;
	 		utb.actime = mtime;
	@@ -2206,7 +2196,7 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
	 	ret = end_loose_object_common(&c, &stream, oid);
	 	if (ret != Z_OK)
	 		die(_("deflateEnd on stream object failed (%d)"), ret);
	-	close_loose_object(fd);
	+	close_loose_object(fd, tmp_file.buf);

	 	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
	 		unlink_or_warn(tmp_file.buf);

1. https://lore.kernel.org/git/cover-v11-0.8-00000000000-20220319T001411Z-avarab@gmail.com/

Han Xin (4):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: use stream_loose_object() to unpack large objects

Ævar Arnfjörð Bjarmason (4):
  object-file.c: do fsync() and close() before post-write die()
  object-file.c: factor out deflate part of write_loose_object()
  core doc: modernize core.bigFileThreshold documentation
  unpack-objects: refactor away unpack_non_delta_entry()

 Documentation/config/core.txt   |  33 +++--
 builtin/unpack-objects.c        | 109 +++++++++++---
 object-file.c                   | 246 +++++++++++++++++++++++++++-----
 object-store.h                  |   8 ++
 t/t5351-unpack-large-objects.sh |  61 ++++++++
 5 files changed, 396 insertions(+), 61 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

Range-diff against v11:
1:  2103d5bfd96 = 1:  e95f6a1cfb6 unpack-objects: low memory footprint for get_data() in dry_run mode
2:  6acd8759772 ! 2:  54060eb8c6b object-file.c: do fsync() and close() before post-write die()
    @@ Commit message
         Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## object-file.c ##
    +@@ object-file.c: void hash_object_file(const struct git_hash_algo *algo, const void *buf,
    + 	hash_object_file_literally(algo, buf, len, type_name(type), oid);
    + }
    + 
    +-/* Finalize a file on disk, and close it. */
    ++/*
    ++ * We already did a write_buffer() to the "fd", let's fsync()
    ++ * and close().
    ++ *
    ++ * Finalize a file on disk, and close it. We might still die() on a
    ++ * subsequent sanity check, but let's not add to that confusion by not
    ++ * flushing any outstanding writes to disk first.
    ++ */
    + static void close_loose_object(int fd)
    + {
    + 	if (the_repository->objects->odb->will_destroy)
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
      		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
      		    ret);
      	the_hash_algo->final_oid_fn(&parano_oid, &c);
    -+
    -+	/*
    -+	 * We already did a write_buffer() to the "fd", let's fsync()
    -+	 * and close().
    -+	 *
    -+	 * We might still die() on a subsequent sanity check, but
    -+	 * let's not add to that confusion by not flushing any
    -+	 * outstanding writes to disk first.
    -+	 */
     +	close_loose_object(fd);
     +
      	if (!oideq(oid, &parano_oid))
3:  f7b02c307fc ! 3:  3dcaa5d6589 object-file.c: refactor write_loose_object() to several steps
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     + * Common steps for loose object writers to end writing loose objects:
     + *
     + * - End the compression of zlib stream.
    -+ * - Get the calculated oid to "parano_oid".
    ++ * - Get the calculated oid to "oid".
     + * - fsync() and close() the "fd"
     + */
    -+static void end_loose_object_common(int fd, int ret, git_hash_ctx *c,
    -+				    git_zstream *stream,
    -+				    struct object_id *parano_oid,
    -+				    const struct object_id *expected_oid,
    -+				    const char *die_msg1_fmt,
    -+				    const char *die_msg2_fmt)
    ++static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
    ++				   struct object_id *oid)
     +{
    -+	if (ret != Z_STREAM_END)
    -+		die(_(die_msg1_fmt), ret, expected_oid);
    ++	int ret;
    ++
     +	ret = git_deflate_end_gently(stream);
     +	if (ret != Z_OK)
    -+		die(_(die_msg2_fmt), ret, expected_oid);
    -+	the_hash_algo->final_oid_fn(parano_oid, c);
    ++		return ret;
    ++	the_hash_algo->final_oid_fn(oid, c);
     +
    -+	/*
    -+	 * We already did a write_buffer() to the "fd", let's fsync()
    -+	 * and close().
    -+	 *
    -+	 * We might still die() on a subsequent sanity check, but
    -+	 * let's not add to that confusion by not flushing any
    -+	 * outstanding writes to disk first.
    -+	 */
    -+	close_loose_object(fd);
    ++	return Z_OK;
     +}
     +
      static int write_loose_object(const struct object_id *oid, char *hdr,
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
      	/* Then the data itself.. */
      	stream.next_in = (void *)buf;
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
    - 		stream.avail_out = sizeof(compressed);
    - 	} while (ret == Z_OK);
    - 
    --	if (ret != Z_STREAM_END)
    --		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
    --		    ret);
    + 	if (ret != Z_STREAM_END)
    + 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
    + 		    ret);
     -	ret = git_deflate_end_gently(&stream);
    --	if (ret != Z_OK)
    ++	ret = end_loose_object_common(&c, &stream, &parano_oid);
    + 	if (ret != Z_OK)
     -		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
     -		    ret);
     -	the_hash_algo->final_oid_fn(&parano_oid, &c);
    --
    --	/*
    --	 * We already did a write_buffer() to the "fd", let's fsync()
    --	 * and close().
    --	 *
    --	 * We might still die() on a subsequent sanity check, but
    --	 * let's not add to that confusion by not flushing any
    --	 * outstanding writes to disk first.
    --	 */
    --	close_loose_object(fd);
    -+	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
    -+				N_("unable to deflate new object %s (%d)"),
    -+				N_("deflateEnd on object %s failed (%d)"));
    ++		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret);
    + 	close_loose_object(fd);
      
      	if (!oideq(oid, &parano_oid))
    - 		die(_("confused by unstable object source data for %s"),
4:  20d97cc2605 ! 4:  03f4e91ac89 object-file.c: factor out deflate part of write_loose_object()
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     +						compressed, sizeof(compressed));
      	} while (ret == Z_OK);
      
    - 	end_loose_object_common(fd, ret, &c, &stream, &parano_oid, oid,
    + 	if (ret != Z_STREAM_END)
5:  db40f4160c4 ! 5:  3d64cf1cf33 object-file.c: add "stream_loose_object()" to handle large object
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	 *  - End the compression of zlib stream.
     +	 *  - Get the calculated oid.
     +	 */
    -+	end_loose_object_common(fd, ret, &c, &stream, oid, NULL,
    -+				N_("unable to stream deflate new object (%d)"),
    -+				N_("deflateEnd on stream object failed (%d)"));
    ++	if (ret != Z_STREAM_END)
    ++		die(_("unable to stream deflate new object (%d)"), ret);
    ++	ret = end_loose_object_common(&c, &stream, oid);
    ++	if (ret != Z_OK)
    ++		die(_("deflateEnd on stream object failed (%d)"), ret);
    ++	close_loose_object(fd);
     +
     +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
     +		unlink_or_warn(tmp_file.buf);
6:  d8ae2eadb98 = 6:  33ffcbbc1f0 core doc: modernize core.bigFileThreshold documentation
7:  2b403e7cd9c = 7:  11f7aa026b4 unpack-objects: refactor away unpack_non_delta_entry()
8:  5eded902496 = 8:  34ee6a28a54 unpack-objects: use stream_loose_object() to unpack large objects
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v12 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                   ` Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                   ` [PATCH v12 2/8] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
                                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Jiang Xin, Ævar Arnfjörð Bjarmason

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
amount of memory. Allocating memory for a large blob object may cause the
system to run out of memory. Before preparing to replace calling of
"get_data()" to unpack large blob objects in latter commits, refactor
"get_data()" to reduce memory footprint for dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.
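
The technique looks like this when sketched against plain zlib (a
standalone illustration, not git code): the output window is rewound
and overwritten on every iteration, so the stream is integrity-checked
without ever keeping the data:

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	int main(void)
	{
		unsigned char in[8192], out[8192];
		z_stream zs;
		int ret = Z_OK;

		memset(&zs, 0, sizeof(zs));
		if (inflateInit(&zs) != Z_OK)
			return 1;
		while (ret == Z_OK) {
			if (!zs.avail_in) {
				zs.avail_in = fread(in, 1, sizeof(in), stdin);
				zs.next_in = in;
				if (!zs.avail_in)
					break; /* truncated input */
			}
			/* Reuse the same window; the bytes are discarded. */
			zs.next_out = out;
			zs.avail_out = sizeof(out);
			ret = inflate(&zs, Z_NO_FLUSH);
		}
		inflateEnd(&zs);
		return ret == Z_STREAM_END ? 0 : 1;
	}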

The "find [...]objects/?? -type f | wc -l" test idiom being used here
is adapted from the same "find" use added to another test in
d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c        | 34 ++++++++++++++++++---------
 t/t5351-unpack-large-objects.sh | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 11 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index dbeb0680a58..e3d30025979 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,26 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from stdin and return specific size of data.
+ * The caller is responsible for freeing the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +135,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +343,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +374,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +411,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
new file mode 100755
index 00000000000..8d84313221c
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs pack)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+	test_stdout_line_count = 0 find dest.git/objects -type f &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v12 2/8] object-file.c: do fsync() and close() before post-write die()
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                   ` [PATCH v12 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                   ` Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                   ` [PATCH v12 3/8] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
                                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

Change write_loose_object() to do an fsync() and close() before the
oideq() sanity check at the end. This change re-joins code that was
split up by the die() sanity check added in 748af44c63e (sha1_file: be
paranoid when creating loose objects, 2010-02-21).

I don't think that this change matters in itself, if we called die()
it was possible that our data wouldn't fully make it to disk, but in
any case we were writing data that we'd consider corrupted. It's
possible that a subsequent "git fsck" will be less confused now.

The real reason to make this change is that in a subsequent commit
we'll split this code in write_loose_object() into a utility function;
all its callers will want the preceding sanity checks, but not the
"oideq" check. By moving the close_loose_object() earlier it'll be
easier to reason about the introduction of the utility function.
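
In the abstract the ordering is (a standalone POSIX sketch with a
hypothetical file name, not git code): flush and close first, then run
the check that may abort:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char buf[] = "content\n";
		int fd = open("obj.tmp", O_WRONLY | O_CREAT | O_TRUNC, 0666);

		if (fd < 0 || write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf))
			return 1;
		/* Whatever we decided to write is fully on disk... */
		if (fsync(fd) < 0 || close(fd) < 0)
			return 1;
		/* ...before the sanity check that may abort runs. */
		if (strlen(buf) != 8) {
			fprintf(stderr, "fatal: unexpected object size\n");
			exit(128);
		}
		return 0;
	}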

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/object-file.c b/object-file.c
index 62ebe236c90..5da458eccbf 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1886,7 +1886,14 @@ void hash_object_file(const struct git_hash_algo *algo, const void *buf,
 	hash_object_file_literally(algo, buf, len, type_name(type), oid);
 }
 
-/* Finalize a file on disk, and close it. */
+/*
+ * We already did a write_buffer() to the "fd", let's fsync()
+ * and close().
+ *
+ * Finalize a file on disk, and close it. We might still die() on a
+ * subsequent sanity check, but let's not add to that confusion by not
+ * flushing any outstanding writes to disk first.
+ */
 static void close_loose_object(int fd)
 {
 	if (the_repository->objects->odb->will_destroy)
@@ -2006,12 +2013,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	close_loose_object(fd);
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
-	close_loose_object(fd);
-
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v12 3/8] object-file.c: refactor write_loose_object() to several steps
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                   ` [PATCH v12 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                   ` [PATCH v12 2/8] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                   ` Ævar Arnfjörð Bjarmason
  2022-03-30  7:13                     ` Han Xin
  2022-03-29 13:56                   ` [PATCH v12 4/8] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
                                     ` (5 subsequent siblings)
  8 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a stream
version function ("stream_loose_object()") in later commit to resolve
this issue.

Before introducing that streaming function, do some refactoring on
"write_loose_object()" to reuse code for both versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.
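
Sketched standalone against plain zlib (not git's git_zstream API),
the resulting shape is roughly this; steps 2 and 4 become helpers that
a buffer writer and a stream writer can share:

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	/* Step 2: shared setup; also feeds the object header first. */
	static int start_deflate(z_stream *zs, unsigned char *out,
				 size_t outlen, char *hdr, size_t hdrlen)
	{
		memset(zs, 0, sizeof(*zs));
		if (deflateInit(zs, Z_DEFAULT_COMPRESSION) != Z_OK)
			return -1;
		zs->next_out = out;
		zs->avail_out = outlen;
		zs->next_in = (unsigned char *)hdr;
		zs->avail_in = hdrlen;
		while (deflate(zs, Z_NO_FLUSH) == Z_OK && zs->avail_in)
			; /* the tiny header fits the output buffer */
		return 0;
	}

	/* Step 4: shared teardown. */
	static int end_deflate(z_stream *zs)
	{
		return deflateEnd(zs) == Z_OK ? 0 : -1;
	}

	int main(void)
	{
		unsigned char out[4096];
		/* sizeof(hdr) includes the NUL, like "<type> <len>\0" */
		char hdr[] = "blob 5", body[] = "hello";
		z_stream zs;

		if (start_deflate(&zs, out, sizeof(out), hdr, sizeof(hdr)))
			return 1;
		/* Step 3: compress the data itself (all in one go here). */
		zs.next_in = (unsigned char *)body;
		zs.avail_in = sizeof(body) - 1;
		while (deflate(&zs, Z_FINISH) == Z_OK)
			; /* drains into "out"; real code would write() it */
		if (end_deflate(&zs))
			return 1;
		printf("compressed to %lu bytes\n", zs.total_out);
		return 0;
	}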

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 102 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 26 deletions(-)

diff --git a/object-file.c b/object-file.c
index 5da458eccbf..7f160929e00 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1955,6 +1955,75 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * close_loose_object().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+				   struct object_id *oid)
+{
+	int ret;
+
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		return ret;
+	the_hash_algo->final_oid_fn(oid, c);
+
+	return Z_OK;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1969,28 +2038,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -2008,11 +2060,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
 		    ret);
-	ret = git_deflate_end_gently(&stream);
+	ret = end_loose_object_common(&c, &stream, &parano_oid);
 	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret);
 	close_loose_object(fd);
 
 	if (!oideq(oid, &parano_oid))
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v12 4/8] object-file.c: factor out deflate part of write_loose_object()
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                     ` (2 preceding siblings ...)
  2022-03-29 13:56                   ` [PATCH v12 3/8] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                   ` Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                   ` [PATCH v12 5/8] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
                                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

Split out the part of write_loose_object() that deals with calling
git_deflate() into a utility function, a subsequent commit will
introduce another function that'll make use of it.
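
The key trick in the factored-out helper is hashing exactly the bytes
git_deflate() consumed, by diffing next_in against its value before
the call. A standalone sketch against plain zlib, with crc32()
standing in for git's SHA-1/SHA-256 hashing:

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	int main(void)
	{
		char data[] = "some object payload";
		unsigned char out[4096];
		unsigned long crc = crc32(0L, Z_NULL, 0);
		z_stream zs;
		int ret;

		memset(&zs, 0, sizeof(zs));
		if (deflateInit(&zs, Z_DEFAULT_COMPRESSION) != Z_OK)
			return 1;
		zs.next_in = (unsigned char *)data;
		zs.avail_in = sizeof(data) - 1;
		do {
			unsigned char *in0 = zs.next_in;

			zs.next_out = out;
			zs.avail_out = sizeof(out);
			ret = deflate(&zs, Z_FINISH);
			/* Hash exactly what deflate() consumed this round. */
			crc = crc32(crc, in0, (uInt)(zs.next_in - in0));
			/* Real code would write() next_out - out bytes here. */
		} while (ret == Z_OK);
		deflateEnd(&zs);
		if (ret != Z_STREAM_END)
			return 1;
		printf("input crc: %08lx\n", crc);
		return 0;
	}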

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index 7f160929e00..6e2f2264f8c 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2004,6 +2004,28 @@ static int start_loose_object_common(struct strbuf *tmp_file,
 	return fd;
 }
 
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+				     git_zstream *stream, const int flush,
+				     unsigned char *in0, const int fd,
+				     unsigned char *compressed,
+				     const size_t compressed_len)
+{
+	int ret;
+
+	ret = git_deflate(stream, flush ? Z_FINISH : 0);
+	the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+	if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+		die(_("unable to write loose object file"));
+	stream->next_out = compressed;
+	stream->avail_out = compressed_len;
+
+	return ret;
+}
+
 /**
  * Common steps for loose object writers to end writing loose objects:
  *
@@ -2049,12 +2071,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
+
+		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+						compressed, sizeof(compressed));
 	} while (ret == Z_OK);
 
 	if (ret != Z_STREAM_END)
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v12 5/8] object-file.c: add "stream_loose_object()" to handle large object
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                     ` (3 preceding siblings ...)
  2022-03-29 13:56                   ` [PATCH v12 4/8] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                   ` Ævar Arnfjörð Bjarmason
  2022-03-31 19:54                     ` Neeraj Singh
  2022-03-29 13:56                   ` [PATCH v12 6/8] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
                                     ` (3 subsequent siblings)
  8 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using "write_loose_object",
we have to feed it with a buffer with the same size of the object, which
will consume lots of memory and may cause OOM. This can be improved by
feeding data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", which is a stream version of
"write_loose_object()" but with a low memory footprint. We will use this
function to unpack large blob objects in a later commit.

Another difference from "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object.

Still, we need to save the temporary file we're preparing
somewhere. We'll do that in the top-level ".git/objects/"
directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
streamed it we'll know the OID, and will move it to its canonical
path.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c  | 100 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   8 ++++
 2 files changed, 108 insertions(+)

diff --git a/object-file.c b/object-file.c
index 6e2f2264f8c..2be2bae9afa 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2118,6 +2118,106 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose objects:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = 1;
+		}
+		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+						compressed, sizeof(compressed));
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %"PRIuMAX" != %"PRIuMAX),
+		    (uintmax_t)stream.total_in, (uintmax_t)len + hdrlen);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	if (ret != Z_STREAM_END)
+		die(_("unable to stream deflate new object (%d)"), ret);
+	ret = end_loose_object_common(&c, &stream, oid);
+	if (ret != Z_OK)
+		die(_("deflateEnd on stream object failed (%d)"), ret);
+	close_loose_object(fd);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index bd2322ed8ce..1099455bc2e 100644
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -261,6 +267,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v12 6/8] core doc: modernize core.bigFileThreshold documentation
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                     ` (4 preceding siblings ...)
  2022-03-29 13:56                   ` [PATCH v12 5/8] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                   ` Ævar Arnfjörð Bjarmason
  2022-03-29 13:56                   ` [PATCH v12 7/8] unpack-objects: refactor away unpack_non_delta_entry() Ævar Arnfjörð Bjarmason
                                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

The core.bigFileThreshold documentation has been largely unchanged
since 5eef828bc03 (fast-import: Stream very large blobs directly to
pack, 2010-02-01).

But since then this setting has been expanded to affect a lot more
than that description indicated. Most notably in how "git diff" treats
them, see 6bf3b813486 (diff --stat: mark any file larger than
core.bigfilethreshold binary, 2014-08-16).

In addition to that, numerous commands and APIs make use of a
streaming mode for files above this threshold.

So let's attempt to summarize 12 years of changes in behavior, which
can be seen with:

    git log --oneline -Gbig_file_thre 5eef828bc03.. -- '*.c'

To do that, turn this into a bullet-point list. The summary Han Xin
produced in [1] helped a lot, but is a bit too detailed for
documentation aimed at users. Let's instead summarize how
user-observable behavior differs, and generally describe how we tend
to stream these files in various commands.

1. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
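
As a sketch of the pattern those streaming users share (the helper
names here are hypothetical; the size check mirrors the one added to
unpack-objects later in this series):

	if (type == OBJ_BLOB && size > big_file_threshold)
		stream_to_disk(size);	/* hypothetical helper: bounded memory */
	else
		load_in_core(size);	/* hypothetical helper: whole object in a buffer */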

Helped-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 9da3e5d88f6..5fccbd56995 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -412,17 +412,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression.  Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms.  This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+  linkgit:gitattributes[5]). This means that e.g. linkgit:git-log[1]
+  and linkgit:git-diff[1] will not show diffs for files above this limit.
++
+* Will generally be streamed when written, which avoids excessive
+memory usage, at the cost of some fixed overhead. Commands that make
+use of this include linkgit:git-archive[1],
+linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
+linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v12 7/8] unpack-objects: refactor away unpack_non_delta_entry()
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                     ` (5 preceding siblings ...)
  2022-03-29 13:56                   ` [PATCH v12 6/8] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                   ` Ævar Arnfjörð Bjarmason
  2022-03-30 19:40                     ` René Scharfe
  2022-03-29 13:56                   ` [PATCH v12 8/8] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  8 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

The unpack_one() function will call either a non-trivial
unpack_delta_entry() or a trivial unpack_non_delta_entry(). Let's
inline the latter in the only caller.

Since 21666f1aae4 (convert object type handling from a string to a
number, 2007-02-26) the unpack_non_delta_entry() function has been
rather trivial, and in a preceding commit the "dry_run" condition it
was handling went away.

This is not done as an optimization; the compiler will easily
discover that it can do the same. Rather, this makes a subsequent
commit easier to reason about. As that commit will be handling
"OBJ_BLOB" in a special manner, let's re-arrange that "case" in
preparation for the change.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index e3d30025979..d374599d544 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -338,15 +338,6 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
-static void unpack_non_delta_entry(enum object_type type, unsigned long size,
-				   unsigned nr)
-{
-	void *buf = get_data(size);
-
-	if (buf)
-		write_object(nr, type, buf, size);
-}
-
 static int resolve_against_held(unsigned nr, const struct object_id *base,
 				void *delta_data, unsigned long delta_size)
 {
@@ -479,12 +470,17 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
-		unpack_non_delta_entry(type, size, nr);
+	{
+		void *buf = get_data(size);
+
+		if (buf)
+			write_object(nr, type, buf, size);
 		return;
+	}
 	case OBJ_REF_DELTA:
 	case OBJ_OFS_DELTA:
 		unpack_delta_entry(type, size, nr);
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v12 8/8] unpack-objects: use stream_loose_object() to unpack large objects
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                     ` (6 preceding siblings ...)
  2022-03-29 13:56                   ` [PATCH v12 7/8] unpack-objects: refactor away unpack_non_delta_entry() Ævar Arnfjörð Bjarmason
@ 2022-03-29 13:56                   ` Ævar Arnfjörð Bjarmason
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  8 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-29 13:56 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Make use of the stream_loose_object() function introduced in the
preceding commit to unpack large objects. Before this we'd need to
malloc() the size of the blob before unpacking it, which could cause
OOM with very large blobs.

We could use the new streaming interface to unpack all blobs, but
doing so would be much slower, as demonstrated e.g. with this
benchmark using git-hyperfine[0]:

	rm -rf /tmp/scalar.git &&
	git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git &&
	mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack &&
	git hyperfine \
		-r 2 --warmup 1 \
		-L rev origin/master,HEAD -L v "10,512,1k,1m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack'

Here we'll perform worse with lower core.bigFileThreshold settings
with this change in terms of speed, but we're getting lower memory use
in return:

	Summary
	  './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'

A better benchmark for demonstrating the benefits is this one, which
creates an artificial repo with 1, 25, 50, 75 and 100MB blobs:

	rm -rf /tmp/repo &&
	git init /tmp/repo &&
	(
		cd /tmp/repo &&
		for i in 1 25 50 75 100
		do
			dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024
		done &&
		git add blob.* &&
		git commit -mblobs &&
		git gc &&
		PACK=$(echo .git/objects/pack/pack-*.pack) &&
		cp "$PACK" my.pack
	) &&
	git hyperfine \
		--show-output \
		-L rev origin/master,HEAD -L v "512,50m,100m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum'

Using this test we'll always use >100MB of memory on
origin/master (around ~105MB), but max out at e.g. ~55MB if we set
core.bigFileThreshold=50m.

The relevant "Maximum resident set size" lines were manually added
below the relevant benchmark:

  '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran
        Maximum resident set size (kbytes): 107080
    1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 106968
    1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 107032
    1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 107072
    1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 55704
    2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 4564

This shows that, if you have enough memory, this new streaming method
is slower the lower you set the streaming threshold, but the benefit
is more bounded memory use.

An earlier version of this patch introduced a new
"core.bigFileStreamingThreshold" instead of re-using the existing
"core.bigFileThreshold" variable[1]. As noted in a detailed overview
of its users in [2], that existing variable already carries several
different meanings.

Still, we consider it good enough to simply re-use it. While it's
possible that someone might want to e.g. consider objects "small" for
the purposes of diffing but "big" for the purposes of writing them,
such use-cases are probably too obscure to worry about. We can always
split up "core.bigFileThreshold" in the future if there's a need for
that.

0. https://github.com/avar/git-hyperfine/
1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt   |  4 +-
 builtin/unpack-objects.c        | 67 +++++++++++++++++++++++++++++++++
 t/t5351-unpack-large-objects.sh | 26 +++++++++++--
 3 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 5fccbd56995..716259b6762 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -436,8 +436,8 @@ usage, at the slight expense of increased disk usage.
 * Will generally be streamed when written, which avoids excessive
 memory usage, at the cost of some fixed overhead. Commands that make
 use of this include linkgit:git-archive[1],
-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-linkgit:git-fsck[1].
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index d374599d544..9d7b325c23b 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -338,6 +338,68 @@ static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
+}
+
 static int resolve_against_held(unsigned nr, const struct object_id *base,
 				void *delta_data, unsigned long delta_size)
 {
@@ -471,6 +533,11 @@ static void unpack_one(unsigned nr)
 
 	switch (type) {
 	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
 	case OBJ_TAG:
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
index 8d84313221c..461ca060b2b 100755
--- a/t/t5351-unpack-large-objects.sh
+++ b/t/t5351-unpack-large-objects.sh
@@ -9,7 +9,8 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold "$1"
 }
 
 test_expect_success "create large objects (1.5 MB) and PACK" '
@@ -26,16 +27,35 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
 	test_stdout_line_count = 0 find dest.git/objects -type f &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+
+	# The destination came up with the exact same pack...
+	DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
+	test_cmp pack-$PACK.pack $DEST_PACK &&
+
+	# ...and wrote no loose objects
+	test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
+'
+
 test_done
-- 
2.35.1.1548.g36973b18e52


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v12 3/8] object-file.c: refactor write_loose_object() to several steps
  2022-03-29 13:56                   ` [PATCH v12 3/8] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
@ 2022-03-30  7:13                     ` Han Xin
  2022-03-30 17:34                       ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-03-30  7:13 UTC (permalink / raw)
  To: avarab
  Cc: chiyutianyi, git, gitster, hanxin.hx, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, zhiyou.jx

On Tue, Mar 29, 2022 at 3:56 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
> 
> +/**
> + * Common steps for loose object writers to end writing loose objects:
> + *
> + * - End the compression of zlib stream.
> + * - Get the calculated oid to "oid".
> + * - fsync() and close() the "fd"

Since we removed close_loose_object() from end_loose_object_common(), I
think this comment should also be removed.
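
I.e. with the stale item dropped it would read something like this
(sketch):

	/**
	 * Common steps for loose object writers to end writing loose
	 * objects:
	 *
	 * - End the compression of zlib stream.
	 * - Get the calculated oid to "oid".
	 */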

Thanks.
-Han Xin

> + */
> +static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
> +				   struct object_id *oid)
> +{
> +	int ret;
> +
> +	ret = git_deflate_end_gently(stream);
> +	if (ret != Z_OK)
> +		return ret;
> +	the_hash_algo->final_oid_fn(oid, c);
> +
> +	return Z_OK;
> +}
> +

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v12 3/8] object-file.c: refactor write_loose_object() to several steps
  2022-03-30  7:13                     ` Han Xin
@ 2022-03-30 17:34                       ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-30 17:34 UTC (permalink / raw)
  To: Han Xin
  Cc: git, gitster, hanxin.hx, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, zhiyou.jx


On Wed, Mar 30 2022, Han Xin wrote:

> On Tue, Mar 29, 2022 at 3:56 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>> 
>> +/**
>> + * Common steps for loose object writers to end writing loose objects:
>> + *
>> + * - End the compression of zlib stream.
>> + * - Get the calculated oid to "oid".
>> + * - fsync() and close() the "fd"
>
> Since we removed close_loose_object() from end_loose_object_common() , I
> think this comment should also be removed.

You're right. I adjusted it for the "parano_oid" in this v12, but
managed to miss that somehow.

Will submit a re-roll with those changes, but will wait a bit more to
see if there's any other comments on this v12 first. Thanks!


^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v12 7/8] unpack-objects: refactor away unpack_non_delta_entry()
  2022-03-29 13:56                   ` [PATCH v12 7/8] unpack-objects: refactor away unpack_non_delta_entry() Ævar Arnfjörð Bjarmason
@ 2022-03-30 19:40                     ` René Scharfe
  2022-03-31 12:42                       ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 211+ messages in thread
From: René Scharfe @ 2022-03-30 19:40 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason, git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, Derrick Stolee, Philip Oakley,
	Neeraj Singh, Elijah Newren

Am 29.03.22 um 15:56 schrieb Ævar Arnfjörð Bjarmason:
> The unpack_one() function will call either a non-trivial
> unpack_delta_entry() or a trivial unpack_non_delta_entry(). Let's
> inline the latter in the only caller.
>
> Since 21666f1aae4 (convert object type handling from a string to a
> number, 2007-02-26) the unpack_non_delta_entry() function has been
> rather trivial, and in a preceding commit the "dry_run" condition it
> was handling went away.
>
> This is not done as an optimization; the compiler will easily
> discover that it can do the same. Rather, this makes a subsequent
> commit easier to reason about.

How exactly does inlining the function make the next patch easier to
understand or discuss?  Plugging in the stream_blob() call to handle the
big blobs looks the same with or without the unpack_non_delta_entry()
call to me.

> As that commit will be handling "OBJ_BLOB" in a
> special manner, let's re-arrange that "case" in preparation for the
> change.
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>  builtin/unpack-objects.c | 18 +++++++-----------
>  1 file changed, 7 insertions(+), 11 deletions(-)

Reducing the number of lines can be an advantage. *shrug*

>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index e3d30025979..d374599d544 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -338,15 +338,6 @@ static void added_object(unsigned nr, enum object_type type,
>  	}
>  }
>
> -static void unpack_non_delta_entry(enum object_type type, unsigned long size,
> -				   unsigned nr)
> -{
> -	void *buf = get_data(size);
> -
> -	if (buf)
> -		write_object(nr, type, buf, size);
> -}
> -
>  static int resolve_against_held(unsigned nr, const struct object_id *base,
>  				void *delta_data, unsigned long delta_size)
>  {
> @@ -479,12 +470,17 @@ static void unpack_one(unsigned nr)
>  	}
>
>  	switch (type) {
> +	case OBJ_BLOB:
>  	case OBJ_COMMIT:
>  	case OBJ_TREE:
> -	case OBJ_BLOB:
>  	case OBJ_TAG:
> -		unpack_non_delta_entry(type, size, nr);
> +	{
> +		void *buf = get_data(size);
> +
> +		if (buf)
> +			write_object(nr, type, buf, size);
>  		return;
> +	}
>  	case OBJ_REF_DELTA:
>  	case OBJ_OFS_DELTA:
>  		unpack_delta_entry(type, size, nr);

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v12 7/8] unpack-objects: refactor away unpack_non_delta_entry()
  2022-03-30 19:40                     ` René Scharfe
@ 2022-03-31 12:42                       ` Ævar Arnfjörð Bjarmason
  2022-03-31 16:38                         ` René Scharfe
  0 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-03-31 12:42 UTC (permalink / raw)
  To: René Scharfe
  Cc: git, Junio C Hamano, Han Xin, Jiang Xin, Derrick Stolee,
	Philip Oakley, Neeraj Singh, Elijah Newren


On Wed, Mar 30 2022, René Scharfe wrote:

> Am 29.03.22 um 15:56 schrieb Ævar Arnfjörð Bjarmason:
>> The unpack_one() function will call either a non-trivial
>> unpack_delta_entry() or a trivial unpack_non_delta_entry(). Let's
>> inline the latter in the only caller.
>>
>> Since 21666f1aae4 (convert object type handling from a string to a
>> number, 2007-02-26) the unpack_non_delta_entry() function has been
>> rather trivial, and in a preceding commit the "dry_run" condition it
>> was handling went away.
>>
>> This is not done as an optimization; the compiler will easily
>> discover that it can do the same. Rather, this makes a subsequent
>> commit easier to reason about.
>
> How exactly does inlining the function make the next patch easier to
> understand or discuss?  Plugging in the stream_blob() call to handle the
> big blobs looks the same with or without the unpack_non_delta_entry()
> call to me.

The earlier version of it without this prep cleanup can be seen at
https://lore.kernel.org/git/patch-v10-6.6-6a70e49a346-20220204T135538Z-avarab@gmail.com/

So yes, this could be skipped, but I thought with this step it was easier
to understand.

We previously had to change "void *buf = get_data(size);" in the
function to just "void *buf", and do the assignment after the condition
that's being checked here.

I think it's also more obvious in terms of control flow if we check
OBJ_BLOB here, rather than calling a function which has a special case
just for OBJ_BLOB; we can just do that here instead.

>> As that commit will be handling "OBJ_BLOB" in a
>> special manner, let's re-arrange that "case" in preparation for the
>> change.
>>
>> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>> ---
>>  builtin/unpack-objects.c | 18 +++++++-----------
>>  1 file changed, 7 insertions(+), 11 deletions(-)
>
> Reducing the number of lines can be an advantage. *shrug*

There was also the (admittedly rather small) knock-on effect on
8/8. Before this it was 8 lines added / 1 removed when it came to the
code impacted by this change; now it's 5 added / 0 removed in the
"switch" below.

So I think it's worth keeping.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v12 7/8] unpack-objects: refactor away unpack_non_delta_entry()
  2022-03-31 12:42                       ` Ævar Arnfjörð Bjarmason
@ 2022-03-31 16:38                         ` René Scharfe
  0 siblings, 0 replies; 211+ messages in thread
From: René Scharfe @ 2022-03-31 16:38 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Junio C Hamano, Han Xin, Jiang Xin, Derrick Stolee,
	Philip Oakley, Neeraj Singh, Elijah Newren

Am 31.03.22 um 14:42 schrieb Ævar Arnfjörð Bjarmason:
>
> On Wed, Mar 30 2022, René Scharfe wrote:
>
>> Am 29.03.22 um 15:56 schrieb Ævar Arnfjörð Bjarmason:
>>> The unpack_one() function will call either a non-trivial
>>> unpack_delta_entry() or a trivial unpack_non_delta_entry(). Let's
>>> inline the latter in the only caller.
>>>
>>> Since 21666f1aae4 (convert object type handling from a string to a
>>> number, 2007-02-26) the unpack_non_delta_entry() function has been
>>> rather trivial, and in a preceding commit the "dry_run" condition it
>>> was handling went away.
>>>
>>> This is not done as an optimization; the compiler will easily
>>> discover that it can do the same. Rather, this makes a subsequent
>>> commit easier to reason about.
>>
>> How exactly does inlining the function make the next patch easier to
>> understand or discuss?  Plugging in the stream_blob() call to handle the
>> big blobs looks the same with or without the unpack_non_delta_entry()
>> call to me.
>
> The earlier version of it without this prep cleanup can be seen at
> https://lore.kernel.org/git/patch-v10-6.6-6a70e49a346-20220204T135538Z-avarab@gmail.com/

This plugged the special case into unpack_non_delta_entry().  The
alternative I had in mind was to plug it into the switch statement as
the current patch does, just without inlining unpack_non_delta_entry().

> So yes, this could be skipped, but I tought with this step it was easier
> to understand.
>
> We previously had to change "void *buf = get_data(size);" in the
> function to just "void *buf", and do the assignment after the condition
> that's being checked here.
>
> I think it's also more obvious in terms of control flow if we're
> checking OBJ_BLOB here to not call a function which has a special-case
> just for OBJ_BLOB, we can just do that here instead.
>
>>> As that commit will be handling "OBJ_BLOB" in a
>>> special manner, let's re-arrange that "case" in preparation for the
>>> change.
>>>
>>> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>>> ---
>>>  builtin/unpack-objects.c | 18 +++++++-----------
>>>  1 file changed, 7 insertions(+), 11 deletions(-)
>>
>> Reducing the number of lines can be an advantage. *shrug*
>
> There was also the (admittedly rather small) knock-on-effect on
> 8/8. Before this it was 8 lines added / 1 removed when it came to the
> code impacted by this change, now it's a 5 added/0 removed in the below
> "switch".
>
> So I think it's worth keeping.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v12 5/8] object-file.c: add "stream_loose_object()" to handle large object
  2022-03-29 13:56                   ` [PATCH v12 5/8] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
@ 2022-03-31 19:54                     ` Neeraj Singh
  0 siblings, 0 replies; 211+ messages in thread
From: Neeraj Singh @ 2022-03-31 19:54 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Jiang Xin

On Tue, Mar 29, 2022 at 03:56:10PM +0200, Ævar Arnfjörð Bjarmason wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> If we want to unpack and write a loose object using "write_loose_object()",
> we have to feed it a buffer of the same size as the object, which
> will consume lots of memory and may cause OOM. This can be improved by
> feeding data to "stream_loose_object()" in a stream.
> 
> Add a new function "stream_loose_object()", which is a stream version of
> "write_loose_object()" but with a low memory footprint. We will use this
> function to unpack large blob objects in a later commit.
> 

Just a thought for optimization which you might want to try on top of this
series:
try using mmap on both the source and target files of your stream. Use a
big 'window' for the mmap (multiple MB) to reduce the TLB flush costs. TLB
flush costs should be minimal anyway if Git is single-threaded.

If you can set the source and target buffers of zlib to the source and
dest mappings respectively, you'd eliminate two copies of data into
Git's stack buffers.  You might need to over-allocate the dst file if
you don't know the size up front, but doing an over-allocate and truncate
should be pretty cheap if you're working with a big file.
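
For illustration, a minimal standalone sketch of that idea (plain zlib
and POSIX mmap() rather than Git's wrappers; the 8 MiB window and all
names here are illustrative, most error handling is omitted, and it
assumes in_size > 0):

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/mman.h>
	#include <zlib.h>

	static int deflate_mmap(int in_fd, off_t in_size, FILE *out)
	{
		z_stream z = { 0 };
		unsigned char obuf[1 << 15];
		const off_t window = 8 << 20;	/* multi-MB windows; offsets stay page-aligned */
		off_t off = 0;

		if (deflateInit(&z, Z_DEFAULT_COMPRESSION) != Z_OK)
			return -1;
		while (off < in_size) {
			size_t n = (size_t)(in_size - off < window ? in_size - off : window);
			void *map = mmap(NULL, n, PROT_READ, MAP_PRIVATE, in_fd, off);
			int flush = off + (off_t)n == in_size ? Z_FINISH : Z_NO_FLUSH;

			if (map == MAP_FAILED)
				return -1;
			/* the mapping itself is zlib's input; no copy into a stack buffer */
			z.next_in = map;
			z.avail_in = n;
			do {	/* standard zlib drain loop, as in zlib's zpipe.c */
				z.next_out = obuf;
				z.avail_out = sizeof(obuf);
				deflate(&z, flush);
				fwrite(obuf, 1, sizeof(obuf) - z.avail_out, out);
			} while (z.avail_out == 0);
			munmap(map, n);
			off += (off_t)n;
		}
		deflateEnd(&z);
		return 0;	/* the output side still copies; mapping the dst too would drop that as well */
	}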

Thanks,
Neeraj

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH 0/1] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                   ` (8 preceding siblings ...)
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
@ 2022-05-20  3:05                 ` Han Xin
  2022-05-20  3:05                   ` [PATCH 1/1] " Han Xin
  9 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-05-20  3:05 UTC (permalink / raw)
  To: git
  Cc: Han Xin, Junio C Hamano, Jiang Xin,
	Ævar Arnfjörð Bjarmason, René Scharfe

This patch teaches "git unpack-objects" to use a lower memory footprint
for "get_data()" in dry-run mode since the returned data is not used.

This patch is separated from "[PATCH v12 0/8] unpack-objects: support
streaming blobs to disk"[1] because it has less impact on existing code
and is less controversial than the rest of that series.

1. https://lore.kernel.org/git/cover-v12-0.8-00000000000-20220329T135446Z-avarab@gmail.com/

Han Xin (1):
  unpack-objects: low memory footprint for get_data() in dry_run mode

 builtin/unpack-objects.c        | 34 ++++++++++++++++++---------
 t/t5351-unpack-large-objects.sh | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 11 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

-- 
2.36.1


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH 1/1] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-05-20  3:05                 ` [PATCH 0/1] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
@ 2022-05-20  3:05                   ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-05-20  3:05 UTC (permalink / raw)
  To: git
  Cc: Han Xin, Junio C Hamano, Jiang Xin,
	Ævar Arnfjörð Bjarmason, René Scharfe,
	Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
amount of memory. Allocating memory for a large blob object may cause the
system to run out of memory. In preparation for replacing calls to
"get_data()" when unpacking large blob objects in later commits, refactor
"get_data()" to reduce its memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.

The "find [...]objects/?? -type f | wc -l" test idiom being used here
is adapted from the same "find" use added to another test in
d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c        | 34 ++++++++++++++++++---------
 t/t5351-unpack-large-objects.sh | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 11 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index dbeb0680a5..e3d3002597 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,26 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from stdin and return specific size of data.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +135,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +343,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +374,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +411,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
new file mode 100755
index 0000000000..8d84313221
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs pack)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+	test_stdout_line_count = 0 find dest.git/objects -type f &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v13 0/7] unpack-objects: support streaming blobs to disk
  2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                     ` (7 preceding siblings ...)
  2022-03-29 13:56                   ` [PATCH v12 8/8] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
@ 2022-06-04 10:10                   ` Ævar Arnfjörð Bjarmason
  2022-06-04 10:10                     ` [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
                                       ` (7 more replies)
  8 siblings, 8 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-04 10:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

This series makes "unpack-objects" capable of streaming large objects
to disk.

As 7/7 shows, streaming e.g. a 100MB blob now uses ~5MB of memory
instead of ~105MB. This streaming method is slower if you've got
enough memory to handle the blobs in-core, but if you don't it allows
you to unpack the objects at all, where you might otherwise OOM.

This series by Han Xin was originally waiting on some in-flight
patches that landed in 430883a70c7 (Merge branch
'ab/object-file-api-updates', 2022-03-16), and until yesterday with
83937e95928 (Merge branch 'ns/batch-fsync', 2022-06-03) had a textual
and semantic conflict with "master".

Changes since v12:

 * Since v12 Han Xin submitted 1/1 here as
   https://lore.kernel.org/git/cover.1653015534.git.chiyutianyi@gmail.com/;
   I think this is better off reviewed as a whole, and hopefully will
   be picked up as such.

 * Dropped the previous 7/8, which was a refactoring to make 8/8
   slightly smaller. Per discussion with René it's better to leave it
   out.

 * The rest (especially 2/8) is due to rebasing on ns/batch-fsync.

Han Xin (4):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: use stream_loose_object() to unpack large objects

Ævar Arnfjörð Bjarmason (3):
  object-file.c: do fsync() and close() before post-write die()
  object-file.c: factor out deflate part of write_loose_object()
  core doc: modernize core.bigFileThreshold documentation

 Documentation/config/core.txt   |  33 +++--
 builtin/unpack-objects.c        | 103 ++++++++++++--
 object-file.c                   | 237 +++++++++++++++++++++++++++-----
 object-store.h                  |   8 ++
 t/t5351-unpack-large-objects.sh |  61 ++++++++
 5 files changed, 387 insertions(+), 55 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

Range-diff against v12:
1:  e95f6a1cfb6 = 1:  12873fc9915 unpack-objects: low memory footprint for get_data() in dry_run mode
2:  54060eb8c6b ! 2:  b3568f0c5c0 object-file.c: do fsync() and close() before post-write die()
    @@ Commit message
         Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## object-file.c ##
    -@@ object-file.c: void hash_object_file(const struct git_hash_algo *algo, const void *buf,
    - 	hash_object_file_literally(algo, buf, len, type_name(type), oid);
    - }
    - 
    --/* Finalize a file on disk, and close it. */
    -+/*
    -+ * We already did a write_buffer() to the "fd", let's fsync()
    -+ * and close().
    -+ *
    -+ * Finalize a file on disk, and close it. We might still die() on a
    -+ * subsequent sanity check, but let's not add to that confusion by not
    -+ * flushing any outstanding writes to disk first.
    -+ */
    - static void close_loose_object(int fd)
    - {
    - 	if (the_repository->objects->odb->will_destroy)
     @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *hdr,
      		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
      		    ret);
      	the_hash_algo->final_oid_fn(&parano_oid, &c);
    -+	close_loose_object(fd);
    ++	close_loose_object(fd, tmp_file.buf);
     +
      	if (!oideq(oid, &parano_oid))
      		die(_("confused by unstable object source data for %s"),
      		    oid_to_hex(oid));
      
    --	close_loose_object(fd);
    +-	close_loose_object(fd, tmp_file.buf);
     -
      	if (mtime) {
      		struct utimbuf utb;
3:  3dcaa5d6589 ! 3:  9dc0f56878a object-file.c: refactor write_loose_object() to several steps
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -		    ret);
     -	the_hash_algo->final_oid_fn(&parano_oid, &c);
     +		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret);
    - 	close_loose_object(fd);
    + 	close_loose_object(fd, tmp_file.buf);
      
      	if (!oideq(oid, &parano_oid))
4:  03f4e91ac89 = 4:  a0434835fe7 object-file.c: factor out deflate part of write_loose_object()
5:  3d64cf1cf33 ! 5:  0b07b29836b object-file.c: add "stream_loose_object()" to handle large object
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	ret = end_loose_object_common(&c, &stream, oid);
     +	if (ret != Z_OK)
     +		die(_("deflateEnd on stream object failed (%d)"), ret);
    -+	close_loose_object(fd);
    ++	close_loose_object(fd, tmp_file.buf);
     +
     +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
     +		unlink_or_warn(tmp_file.buf);
6:  33ffcbbc1f0 = 6:  5ed79c58b18 core doc: modernize core.bigFileThreshold documentation
7:  11f7aa026b4 < -:  ----------- unpack-objects: refactor away unpack_non_delta_entry()
8:  34ee6a28a54 ! 7:  5bc8fa9bc8d unpack-objects: use stream_loose_object() to unpack large objects
    @@ Documentation/config/core.txt: usage, at the slight expense of increased disk us
      	Specifies the pathname to the file that contains patterns to
     
      ## builtin/unpack-objects.c ##
    -@@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type type,
    - 	}
    +@@ builtin/unpack-objects.c: static void unpack_non_delta_entry(enum object_type type, unsigned long size,
    + 		write_object(nr, type, buf, size);
      }
      
     +struct input_zstream_data {
    @@ builtin/unpack-objects.c: static void added_object(unsigned nr, enum object_type
      				void *delta_data, unsigned long delta_size)
      {
     @@ builtin/unpack-objects.c: static void unpack_one(unsigned nr)
    + 	}
      
      	switch (type) {
    - 	case OBJ_BLOB:
    ++	case OBJ_BLOB:
     +		if (!dry_run && size > big_file_threshold) {
     +			stream_blob(size, nr);
     +			return;
    @@ builtin/unpack-objects.c: static void unpack_one(unsigned nr)
     +		/* fallthrough */
      	case OBJ_COMMIT:
      	case OBJ_TREE:
    +-	case OBJ_BLOB:
      	case OBJ_TAG:
    + 		unpack_non_delta_entry(type, size, nr);
    + 		return;
     
      ## t/t5351-unpack-large-objects.sh ##
     @@ t/t5351-unpack-large-objects.sh: test_description='git unpack-objects with large objects'
-- 
2.36.1.1124.g52838f02905


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
@ 2022-06-04 10:10                     ` Ævar Arnfjörð Bjarmason
  2022-06-06 18:35                       ` Junio C Hamano
  2022-06-04 10:10                     ` [PATCH v13 2/7] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
                                       ` (6 subsequent siblings)
  7 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-04 10:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Jiang Xin, Ævar Arnfjörð Bjarmason

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
amount of memory. Allocating memory for a large blob object may cause the
system to run out of memory. In preparation for replacing calls to
"get_data()" when unpacking large blob objects in later commits, refactor
"get_data()" to reduce its memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.

The "find [...]objects/?? -type f | wc -l" test idiom being used here
is adapted from the same "find" use added to another test in
d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c        | 34 ++++++++++++++++++---------
 t/t5351-unpack-large-objects.sh | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 11 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 56d05e2725d..64abba8dbac 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -97,15 +97,26 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from stdin and return specific size of data.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -125,8 +136,15 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -326,10 +344,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +375,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +412,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
new file mode 100755
index 00000000000..8d84313221c
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs pack)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+	test_stdout_line_count = 0 find dest.git/objects -type f &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.36.1.1124.g52838f02905


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v13 2/7] object-file.c: do fsync() and close() before post-write die()
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  2022-06-04 10:10                     ` [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
@ 2022-06-04 10:10                     ` Ævar Arnfjörð Bjarmason
  2022-06-06 18:45                       ` Junio C Hamano
  2022-06-04 10:10                     ` [PATCH v13 3/7] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
                                       ` (5 subsequent siblings)
  7 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-04 10:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

Change write_loose_object() to do an fsync() and close() before the
oideq() sanity check at the end. This change re-joins code that was
split up by the die() sanity check added in 748af44c63e (sha1_file: be
paranoid when creating loose objects, 2010-02-21).

I don't think that this change matters in itself: if we called die(),
it was possible that our data wouldn't fully make it to disk, but in
any case we were writing data that we'd consider corrupted. It's
possible that a subsequent "git fsck" will be less confused now.

The real reason to make this change is that in a subsequent commit
we'll split this code in write_loose_object() into a utility function;
all its callers will want the preceding sanity checks, but not the
"oideq" check. By moving the close_loose_object() call earlier it'll
be easier to reason about the introduction of the utility function.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 79eb8339b60..e4a83012ba4 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2012,12 +2012,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	close_loose_object(fd, tmp_file.buf);
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
-	close_loose_object(fd, tmp_file.buf);
-
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.36.1.1124.g52838f02905


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v13 3/7] object-file.c: refactor write_loose_object() to several steps
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
  2022-06-04 10:10                     ` [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
  2022-06-04 10:10                     ` [PATCH v13 2/7] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
@ 2022-06-04 10:10                     ` Ævar Arnfjörð Bjarmason
  2022-06-04 10:10                     ` [PATCH v13 4/7] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
                                       ` (4 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-04 10:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a stream
version function ("stream_loose_object()") in a later commit to resolve
this issue.

Before introducing that streaming function, do some refactoring on
"write_loose_object()" to reuse code for both versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.
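
In condensed form the result looks like this (a sketch of the control
flow only, not the literal code; see the diff below):

	loose_object_path(the_repository, &filename, oid);	 /* step 1 */
	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
				       &stream, compressed,
				       sizeof(compressed),
				       &c, hdr, hdrlen);	 /* step 2 */
	/* ... deflate and hash "buf" into "fd" ... */		 /* step 3 */
	ret = end_loose_object_common(&c, &stream, &parano_oid); /* step 4 */
	close_loose_object(fd, tmp_file.buf);			 /* step 5 */
	/* ... oideq() sanity check, mtime, finalize_object_file() ... */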

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 102 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 26 deletions(-)

diff --git a/object-file.c b/object-file.c
index e4a83012ba4..ce8b52a8dc3 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1951,6 +1951,75 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ * - fsync() and close() the "fd"
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+				   struct object_id *oid)
+{
+	int ret;
+
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		return ret;
+	the_hash_algo->final_oid_fn(oid, c);
+
+	return Z_OK;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1968,28 +2037,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -2007,11 +2059,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
 		    ret);
-	ret = git_deflate_end_gently(&stream);
+	ret = end_loose_object_common(&c, &stream, &parano_oid);
 	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret);
 	close_loose_object(fd, tmp_file.buf);
 
 	if (!oideq(oid, &parano_oid))
-- 
2.36.1.1124.g52838f02905


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v13 4/7] object-file.c: factor out deflate part of write_loose_object()
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                       ` (2 preceding siblings ...)
  2022-06-04 10:10                     ` [PATCH v13 3/7] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
@ 2022-06-04 10:10                     ` Ævar Arnfjörð Bjarmason
  2022-06-04 10:10                     ` [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
                                       ` (3 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-04 10:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

Split out the part of write_loose_object() that deals with calling
git_deflate() into a utility function, a subsequent commit will
introduce another function that'll make use of it.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index ce8b52a8dc3..7946fa5e088 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2000,6 +2000,28 @@ static int start_loose_object_common(struct strbuf *tmp_file,
 	return fd;
 }
 
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+				     git_zstream *stream, const int flush,
+				     unsigned char *in0, const int fd,
+				     unsigned char *compressed,
+				     const size_t compressed_len)
+{
+	int ret;
+
+	ret = git_deflate(stream, flush ? Z_FINISH : 0);
+	the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+	if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+		die(_("unable to write loose object file"));
+	stream->next_out = compressed;
+	stream->avail_out = compressed_len;
+
+	return ret;
+}
+
 /**
  * Common steps for loose object writers to end writing loose objects:
  *
@@ -2048,12 +2070,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
+
+		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+						compressed, sizeof(compressed));
 	} while (ret == Z_OK);
 
 	if (ret != Z_STREAM_END)
-- 
2.36.1.1124.g52838f02905


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                       ` (3 preceding siblings ...)
  2022-06-04 10:10                     ` [PATCH v13 4/7] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
@ 2022-06-04 10:10                     ` Ævar Arnfjörð Bjarmason
  2022-06-06 19:44                       ` Junio C Hamano
  2022-06-07 19:53                       ` Neeraj Singh
  2022-06-04 10:10                     ` [PATCH v13 6/7] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
                                       ` (2 subsequent siblings)
  7 siblings, 2 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-04 10:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using "write_loose_object",
we have to feed it a buffer the same size as the object, which will
consume lots of memory and may cause OOM. This can be improved by
feeding data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", which is a stream version of
"write_loose_object()" but with a low memory footprint. We will use this
function to unpack large blob objects in a later commit.

Another difference with "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object.

Still, we need to save the temporary file we're preparing
somewhere. We'll do that in the top-level ".git/objects/"
directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
streamed it we'll know the OID, and will move it to its canonical
path.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid".

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c  | 100 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   8 ++++
 2 files changed, 108 insertions(+)

diff --git a/object-file.c b/object-file.c
index 7946fa5e088..9fd449693c4 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2119,6 +2119,106 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose objects:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = 1;
+		}
+		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+						compressed, sizeof(compressed));
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/* Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	if (ret != Z_STREAM_END)
+		die(_("unable to stream deflate new object (%d)"), ret);
+	ret = end_loose_object_common(&c, &stream, oid);
+	if (ret != Z_OK)
+		die(_("deflateEnd on stream object failed (%d)"), ret);
+	close_loose_object(fd, tmp_file.buf);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 539ea439046..5222ee54600 100644
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -269,6 +275,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it
-- 
2.36.1.1124.g52838f02905


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v13 6/7] core doc: modernize core.bigFileThreshold documentation
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                       ` (4 preceding siblings ...)
  2022-06-04 10:10                     ` [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
@ 2022-06-04 10:10                     ` Ævar Arnfjörð Bjarmason
  2022-06-06 19:50                       ` Junio C Hamano
  2022-06-04 10:10                     ` [PATCH v13 7/7] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
  7 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-04 10:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Ævar Arnfjörð Bjarmason

The core.bigFileThreshold documentation has been largely unchanged
since 5eef828bc03 (fast-import: Stream very large blobs directly to
pack, 2010-02-01).

But since then this setting has been expanded to affect a lot more
than that description indicated, most notably in how "git diff" treats
such files; see 6bf3b813486 (diff --stat: mark any file larger than
core.bigfilethreshold binary, 2014-08-16).

In addition to that, numerous commands and APIs make use of a
streaming mode for files above this threshold.

So let's attempt to summarize 12 years of changes in behavior, which
can be seen with:

    git log --oneline -Gbig_file_thre 5eef828bc03.. -- '*.c'

To do that, turn this into a bullet-point list. The summary Han Xin
produced in [1] helped a lot, but is a bit too detailed for
documentation aimed at users. Let's instead summarize how
user-observable behavior differs, and generally describe how we tend
to stream these files in various commands.

1. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
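
(As a usage aside, not something this patch changes: the threshold can
be adjusted per repository with e.g. "git config core.bigFileThreshold
50m", using the unit suffixes described in the text below.)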

Helped-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 41e330f3069..ff6ae6bb647 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression.  Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms.  This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as though they were labeled "binary" (see
+  linkgit:gitattributes[5]). This means that e.g. linkgit:git-log[1]
+  and linkgit:git-diff[1] will not show diffs for files above this limit.
++
+* Will generally be streamed when written, which avoids excessive
+memory usage, at the cost of some fixed overhead. Commands that make
+use of this include linkgit:git-archive[1],
+linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
+linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
-- 
2.36.1.1124.g52838f02905


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v13 7/7] unpack-objects: use stream_loose_object() to unpack large objects
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                       ` (5 preceding siblings ...)
  2022-06-04 10:10                     ` [PATCH v13 6/7] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
@ 2022-06-04 10:10                     ` Ævar Arnfjörð Bjarmason
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
  7 siblings, 0 replies; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-04 10:10 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Ævar Arnfjörð Bjarmason, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Make use of the stream_loose_object() function introduced in the
preceding commit to unpack large objects. Before this we'd need to
malloc() the size of the blob before unpacking it, which could cause
OOM with very large blobs.

We could use the new streaming interface to unpack all blobs, but
doing so would be much slower, as demonstrated e.g. with this
benchmark using git-hyperfine[0]:

	rm -rf /tmp/scalar.git &&
	git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git &&
	mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack &&
	git hyperfine \
		-r 2 --warmup 1 \
		-L rev origin/master,HEAD -L v "10,512,1k,1m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack'

With this change we'll perform worse in terms of speed at lower
core.bigFileThreshold settings, but we're getting lower memory use in
return:

	Summary
	  './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'

A better benchmark to demonstrate the benefits of this change is this
one, which creates an artificial repo with 1, 25, 50, 75 and 100MB blobs:

	rm -rf /tmp/repo &&
	git init /tmp/repo &&
	(
		cd /tmp/repo &&
		for i in 1 25 50 75 100
		do
			dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024
		done &&
		git add blob.* &&
		git commit -mblobs &&
		git gc &&
		PACK=$(echo .git/objects/pack/pack-*.pack) &&
		cp "$PACK" my.pack
	) &&
	git hyperfine \
		--show-output \
		-L rev origin/master,HEAD -L v "512,50m,100m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum'

Using this test we'll always use >100MB of memory on
origin/master (around ~105MB), but max out at e.g. ~55MB if we set
core.bigFileThreshold=50m.

The relevant "Maximum resident set size" lines were manually added
below each benchmark:

  '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran
        Maximum resident set size (kbytes): 107080
    1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 106968
    1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 107032
    1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 107072
    1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 55704
    2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 4564

This shows that if you have enough memory this new streaming method is
slower the lower you set the streaming threshold, but the benefit is
more bounded memory use.

An earlier version of this patch introduced a new
"core.bigFileStreamingThreshold" instead of re-using the existing
"core.bigFileThreshold" variable[1]. As noted in a detailed overview
of its users in [2], using it has several different meanings.

Still, we consider it good enough to simply re-use it. While it's
possible that someone might want to e.g. consider objects "small" for
the purposes of diffing but "big" for the purposes of writing them,
such use-cases are probably too obscure to worry about. We can always
split up "core.bigFileThreshold" in the future if there's a need for
that.

0. https://github.com/avar/git-hyperfine/
1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt   |  4 +-
 builtin/unpack-objects.c        | 69 ++++++++++++++++++++++++++++++++-
 t/t5351-unpack-large-objects.sh | 26 +++++++++++--
 3 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index ff6ae6bb647..b97bc7e3e55 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -468,8 +468,8 @@ usage, at the slight expense of increased disk usage.
 * Will generally be streamed when written, which avoids excessive
 memory usage, at the cost of some fixed overhead. Commands that make
 use of this include linkgit:git-archive[1],
-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-linkgit:git-fsck[1].
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 64abba8dbac..d3124202f54 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -348,6 +348,68 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 		write_object(nr, type, buf, size);
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
+}
+
 static int resolve_against_held(unsigned nr, const struct object_id *base,
 				void *delta_data, unsigned long delta_size)
 {
@@ -480,9 +542,14 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
 		unpack_non_delta_entry(type, size, nr);
 		return;
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
index 8d84313221c..461ca060b2b 100755
--- a/t/t5351-unpack-large-objects.sh
+++ b/t/t5351-unpack-large-objects.sh
@@ -9,7 +9,8 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold "$1"
 }
 
 test_expect_success "create large objects (1.5 MB) and PACK" '
@@ -26,16 +27,35 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
 	test_stdout_line_count = 0 find dest.git/objects -type f &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+
+	# The destination came up with the exact same pack...
+	DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
+	test_cmp pack-$PACK.pack $DEST_PACK &&
+
+	# ...and wrote no loose objects
+	test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
+'
+
 test_done
-- 
2.36.1.1124.g52838f02905


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-04 10:10                     ` [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
@ 2022-06-06 18:35                       ` Junio C Hamano
  2022-06-09  4:10                         ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Junio C Hamano @ 2022-06-06 18:35 UTC (permalink / raw)
  To: Han Xin, Ævar Arnfjörð Bjarmason
  Cc: git, Jiang Xin, René Scharfe, Derrick Stolee, Philip Oakley,
	Neeraj Singh, Elijah Newren, Han Xin, Jiang Xin

Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> As the name implies, "get_data(size)" will allocate and return a given
> amount of memory. Allocating memory for a large blob object may cause the
> system to run out of memory. Before preparing to replace calling of
> "get_data()" to unpack large blob objects in latter commits, refactor
> "get_data()" to reduce memory footprint for dry_run mode.
>
> Because in dry_run mode, "get_data()" is only used to check the
> integrity of data, and the returned buffer is not used at all, we can
> allocate a smaller buffer and reuse it as zstream output. Therefore,

"reuse" -> "use"

> in dry_run mode, "get_data()" will release the allocated buffer and
> return NULL instead of returning garbage data.

It makes it sound as if we used to return garbage data, but I do not
think that is what happened in reality.  Perhaps rewrite the last
sentence like

	Make the function return NULL in the dry-run mode, as no
	callers use the returned buffer.

or something?

The overall logic sounds quite sensible.

> The "find [...]objects/?? -type f | wc -l" test idiom being used here
> is adapted from the same "find" use added to another test in
> d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).


> +/*
> + * Decompress zstream from stdin and return specific size of data.

"specific size"?  The caller specifies the size of data (because it
knows a-priori how many bytes the zstream should inflate to), so

    Decompress zstream from the standard input into a newly
    allocated buffer of specified size and return the buffer.

or something, perhaps.  In any case, it needs to say that the caller
is responsible for giving the "right" size.

> + * The caller is responsible to free the returned buffer.
> + *
> + * But for dry_run mode, "get_data()" is only used to check the
> + * integrity of data, and the returned buffer is not used at all.
> + * Therefore, in dry_run mode, "get_data()" will release the small
> + * allocated buffer which is reused to hold temporary zstream output
> + * and return NULL instead of returning garbage data.
> + */
>  static void *get_data(unsigned long size)
>  {
>  	git_zstream stream;
> -	void *buf = xmallocz(size);
> +	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
> +	void *buf = xmallocz(bufsize);

OK.

>  	memset(&stream, 0, sizeof(stream));
>  
>  	stream.next_out = buf;
> -	stream.avail_out = size;
> +	stream.avail_out = bufsize;
>  	stream.next_in = fill(1);
>  	stream.avail_in = len;
>  	git_inflate_init(&stream);
> @@ -125,8 +136,15 @@ static void *get_data(unsigned long size)

What's hidden in the pre-context is this bit:

		int ret = git_inflate(&stream, 0);
		use(len - stream.avail_in);
		if (stream.total_out == size && ret == Z_STREAM_END)
			break;
		if (ret != Z_OK) {
			error("inflate returned %d", ret);
			FREE_AND_NULL(buf);
			if (!recover)
				exit(1);
			has_errors = 1;
			break;
		}

and it is correct to use "size", not "bufsize", for this check.
Unless we receive exactly the caller-specified "size" bytes from the
inflated zstream with Z_STREAM_END, we want to detect an error and
bail out.

I am not sure if this is not loosening the error checking in the
dry-run case, though.  In the original code, we set the avail_out
to the total expected size so

 (1) if the caller gives too small a size, git_inflate() would stop
     at stream.total_out with ret that is not STREAM_END nor OK,
     bypassing the "break", and we catch the error.

 (2) if the caller gives too large a size, git_inflate() would stop
     at the true size of inflated zstream, with STREAM_END and would
     not hit this "break", and we catch the error.

With the new code, since we keep refreshing avail_out (see below),
git_inflate() does not even learn how many bytes we are _expecting_
to see.  Does the error checking in the loop, with the updated code,
catch the mismatch between expected and actual size (plausibly
caused by a corrupted zstream) the same way as we do in the
non-dry-run code path?

>  		}
>  		stream.next_in = fill(1);
>  		stream.avail_in = len;
> +		if (dry_run) {
> +			/* reuse the buffer in dry_run mode */
> +			stream.next_out = buf;
> +			stream.avail_out = bufsize;
> +		}
>  	}
>  	git_inflate_end(&stream);
> +	if (dry_run)
> +		FREE_AND_NULL(buf);
>  	return buf;
>  }

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 2/7] object-file.c: do fsync() and close() before post-write die()
  2022-06-04 10:10                     ` [PATCH v13 2/7] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
@ 2022-06-06 18:45                       ` Junio C Hamano
  0 siblings, 0 replies; 211+ messages in thread
From: Junio C Hamano @ 2022-06-06 18:45 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Han Xin, Jiang Xin, René Scharfe, Derrick Stolee,
	Philip Oakley, Neeraj Singh, Elijah Newren

Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:

> Change write_loose_object() to do an fsync() and close() before the
> oideq() sanity check at the end. This change re-joins code that was
> split up by the die() sanity check added in 748af44c63e (sha1_file: be
> paranoid when creating loose objects, 2010-02-21).
>
> I don't think that this change matters in itself: if we called die(),
> it was possible that our data wouldn't fully make it to disk, but in
> any case we were writing data that we'd consider corrupted. It's
> possible that a subsequent "git fsck" will be less confused now.

write_loose_object() 

 - prepares a temporary file
 - deflates into the temporary file
 - closes and syncs it
 - moves the temporary file to the final location

And any die() inserted in between any of these steps will cause the
corrupt temporary file not to become the final loose object file.

So, "git fsck" does not need this change at all.

> The real reason to make this change is that in a subsequent commit
> we'll split this code in write_loose_object() into a utility function;
> all its callers will want the preceding sanity checks, but not the
> "oideq" check. By moving the close_loose_object() call earlier it'll
> be easier to reason about the introduction of the utility function.

If a "split" relies on the "close and sync" step being in any
particular place, that smells really fishy.  Is the series loosening
the object integrity check?  Are we adding some exploitable hole
into our codebase without people knowing, or something?  I am not
sure if I am following the above logic.



> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>  object-file.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 79eb8339b60..e4a83012ba4 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -2012,12 +2012,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +	close_loose_object(fd, tmp_file.buf);
> +
>  	if (!oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>  
> -	close_loose_object(fd, tmp_file.buf);
> -
>  	if (mtime) {
>  		struct utimbuf utb;
>  		utb.actime = mtime;

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-04 10:10                     ` [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
@ 2022-06-06 19:44                       ` Junio C Hamano
  2022-06-06 20:02                         ` Junio C Hamano
  2022-06-09  6:14                         ` Han Xin
  2022-06-07 19:53                       ` Neeraj Singh
  1 sibling, 2 replies; 211+ messages in thread
From: Junio C Hamano @ 2022-06-06 19:44 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Han Xin, Jiang Xin, René Scharfe, Derrick Stolee,
	Philip Oakley, Neeraj Singh, Elijah Newren, Han Xin, Jiang Xin

Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> If we want to unpack and write a loose object using "write_loose_object",
> we have to feed it a buffer the same size as the object, which will
> consume lots of memory and may cause OOM. This can be improved by
> feeding data to "stream_loose_object()" in a stream.
>
> Add a new function "stream_loose_object()", which is a stream version of
> "write_loose_object()" but with a low memory footprint. We will use this
> function to unpack large blob objects in a later commit.

Yay.

> Another difference with "write_loose_object()" is that we have no chance
> to run "write_object_file_prepare()" to calculate the oid in advance.

That is somewhat curious.  Is it fundamentally impossible, or is it
just that this patch was written in such a way that conflates the
two and it is cumbersome to split the "we repeat the sequence of
reading and deflating just a bit until we process all" and the "we
compute the hash over the data first and then we write out for
real"?

> In "write_loose_object()", we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object.
>
> Still, we need to save the temporary file we're preparing
> somewhere. We'll do that in the top-level ".git/objects/"
> directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
> streamed it we'll know the OID, and will move it to its canonical
> path.

This may have negative implications for some filesystems where
cross-directory links do not work atomically, but it is a small
price to pay.

I am very tempted to ask why we do not do this to _all_ loose object
files.  Instead of running the machinery twice over the data (once to
compute the object name, then to compute the contents and write out),
if we can produce loose object files of any size with a single pass,
wouldn't that be an overall win?

Is the fixed overhead, i.e. the cost of setting up the streaming
interface, large enough to make it not worth doing for smaller objects?

> "freshen_packed_object()" or "freshen_loose_object()" will be called
> inside "stream_loose_object()" after obtaining the "oid".

That much we can read from the patch text.  Saying just "we do X"
without explaining "why we do so" in the proposed log message leaves
readers more confused than otherwise.  Why is it worth pointing out
in the proposed log message?  Does the reason why we need to do so
involve something tricky?

> +int stream_loose_object(struct input_stream *in_stream, size_t len,
> +			struct object_id *oid)
> +{
> +	int fd, ret, err = 0, flush = 0;
> +	unsigned char compressed[4096];
> +	git_zstream stream;
> +	git_hash_ctx c;
> +	struct strbuf tmp_file = STRBUF_INIT;
> +	struct strbuf filename = STRBUF_INIT;
> +	int dirlen;
> +	char hdr[MAX_HEADER_LEN];
> +	int hdrlen;
> +
> +	/* Since oid is not determined, save tmp file to odb path. */
> +	strbuf_addf(&filename, "%s/", get_object_directory());
> +	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
> +
> +	/*
> +	 * Common steps for write_loose_object and stream_loose_object to
> +	 * start writing loose objects:
> +	 *
> +	 *  - Create tmpfile for the loose object.
> +	 *  - Setup zlib stream for compression.
> +	 *  - Start to feed header to zlib stream.
> +	 */
> +	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
> +				       &stream, compressed, sizeof(compressed),
> +				       &c, hdr, hdrlen);
> +	if (fd < 0) {
> +		err = -1;
> +		goto cleanup;
> +	}
> +
> +	/* Then the data itself.. */
> +	do {
> +		unsigned char *in0 = stream.next_in;
> +
> +		if (!stream.avail_in && !in_stream->is_finished) {
> +			const void *in = in_stream->read(in_stream, &stream.avail_in);
> +			stream.next_in = (void *)in;
> +			in0 = (unsigned char *)in;
> +			/* All data has been read. */
> +			if (in_stream->is_finished)
> +				flush = 1;
> +		}
> +		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
> +						compressed, sizeof(compressed));
> +		/*
> +		 * Unlike write_loose_object(), we do not have the entire
> +		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
> +		 * then we'll replenish them in the next input_stream->read()
> +		 * call when we loop.
> +		 */
> +	} while (ret == Z_OK || ret == Z_BUF_ERROR);
>
> +	if (stream.total_in != len + hdrlen)
> +		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
> +		    (uintmax_t)len + hdrlen);

> +	/* Common steps for write_loose_object and stream_loose_object to

Style.

> +	 * end writing loose object:
> +	 *
> +	 *  - End the compression of zlib stream.
> +	 *  - Get the calculated oid.
> +	 */
> +	if (ret != Z_STREAM_END)
> +		die(_("unable to stream deflate new object (%d)"), ret);

Good to check this, after the loop exits above.  I was expecting to
see it immediately after the loop, but here is also OK.

> +	ret = end_loose_object_common(&c, &stream, oid);
> +	if (ret != Z_OK)
> +		die(_("deflateEnd on stream object failed (%d)"), ret);
> +	close_loose_object(fd, tmp_file.buf);
> +
> +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
> +		unlink_or_warn(tmp_file.buf);
> +		goto cleanup;

So, we were told to write an object, we wrote to a temporary file,
and when we wanted to mark the object as recent we found that the
object indeed already exists.  We remove the temporary file and do
not leave a new copy of the object, and the value of err at this
point is 0 (success), which is what is returned from the cleanup:
label.

Good.

> +	}
> +
> +	loose_object_path(the_repository, &filename, oid);
> +
> +	/* We finally know the object path, and create the missing dir. */
> +	dirlen = directory_size(filename.buf);
> +	if (dirlen) {
> +		struct strbuf dir = STRBUF_INIT;
> +		strbuf_add(&dir, filename.buf, dirlen);
> +
> +		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
> +			err = error_errno(_("unable to create directory %s"), dir.buf);
> +			strbuf_release(&dir);
> +			goto cleanup;
> +		}
> +		strbuf_release(&dir);
> +	}
> +
> +	err = finalize_object_file(tmp_file.buf, filename.buf);
> +cleanup:
> +	strbuf_release(&tmp_file);
> +	strbuf_release(&filename);
> +	return err;
> +}
> +


^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 6/7] core doc: modernize core.bigFileThreshold documentation
  2022-06-04 10:10                     ` [PATCH v13 6/7] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
@ 2022-06-06 19:50                       ` Junio C Hamano
  0 siblings, 0 replies; 211+ messages in thread
From: Junio C Hamano @ 2022-06-06 19:50 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Han Xin, Jiang Xin, René Scharfe, Derrick Stolee,
	Philip Oakley, Neeraj Singh, Elijah Newren

Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:

> So let's attempt to summarize 12 years of changes in behavior, which
> can be seen with:
>
>     git log --oneline -Gbig_file_thre 5eef828bc03.. -- '*.c'
>
> To do that, turn this into a bullet-point list. The summary Han Xin
> produced in [1] helped a lot, but is a bit too detailed for
> documentation aimed at users. Let's instead summarize how
> user-observable behavior differs, and generally describe how we tend
> to stream these files in various commands.

Nicely studied.  Very much appreciated.

>  core.bigFileThreshold::
> -	Files larger than this size are stored deflated, without
> -	attempting delta compression.  Storing large files without
> -	delta compression avoids excessive memory usage, at the
> -	slight expense of increased disk usage. Additionally files
> -	larger than this size are always treated as binary.
> +	The size of files considered "big", which as discussed below
> +	changes the behavior of numerous git commands, as well as how
> +	such files are stored within the repository. The default is
> +	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
> +	supported.
>  +
> -Default is 512 MiB on all platforms.  This should be reasonable
> -for most projects as source code and other text files can still
> -be delta compressed, but larger binary media files won't be.
> +Files above the configured limit will be:
>  +
> -Common unit suffixes of 'k', 'm', or 'g' are supported.
> +* Stored deflated, without attempting delta compression.

"even in packfiles" (with or without "even") is better be there in
the sentence---loose objects are always stored deflated anyway.

> +The default limit is primarily set with this use-case in mind. With it
> +most projects will have their source code and other text files delta
> +compressed, but not larger binary media files.
> ++
> +Storing large files without delta compression avoids excessive memory
> +usage, at the slight expense of increased disk usage.

> +* Will be treated as though they were labeled "binary" (see
> +  linkgit:gitattributes[5]). This means that e.g. linkgit:git-log[1]
> +  and linkgit:git-diff[1] will not show diffs for files above this limit.

Good.  You can lose three words "This means that" and the sentence
means the same thing, so lose them (I always recommend that people
reread the sentence when they write "This means that", with an eye to
rewriting it better---it often is a sign that the previous sentence
is insufficiently clear, in which case it can be discarded and the
description after the three words enhanced to a better result).

> +* Will generally be streamed when written, which avoids excessive
> +memory usage, at the cost of some fixed overhead. Commands that make
> +use of this include linkgit:git-archive[1],
> +linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
> +linkgit:git-fsck[1].

Nice.  And this series adds unpack-objects to the mix.

>  core.excludesFile::
>  	Specifies the pathname to the file that contains patterns to

Excellent.

Thanks.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-06 19:44                       ` Junio C Hamano
@ 2022-06-06 20:02                         ` Junio C Hamano
  2022-06-09  6:04                           ` Han Xin
  2022-06-09  6:14                         ` Han Xin
  1 sibling, 1 reply; 211+ messages in thread
From: Junio C Hamano @ 2022-06-06 20:02 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Han Xin, Jiang Xin, René Scharfe, Derrick Stolee,
	Philip Oakley, Neeraj Singh, Elijah Newren, Jiang Xin

Junio C Hamano <gitster@pobox.com> writes:

>> Another difference with "write_loose_object()" is that we have no chance
>> to run "write_object_file_prepare()" to calculate the oid in advance.
>
> That is somewhat curious.  Is it fundamentally impossible, or is it
> just that this patch was written in such a way that conflates the
> two and it is cumbersome to split the "we repeat the sequence of
> reading and deflating just a bit until we process all" and the "we
> compute the hash over the data first and then we write out for
> real"?

OK, the answer lies somewhere in between.

The initial user of this streaming interface reads from an incoming
packfile and feeds the inflated bytestream to the interface, which
means we cannot seek.  That makes it "fundamentally impossible" for
that codepath (i.e. unpack-objects to read from packstream and write
to on-disk loose objects).

But if the input source is seekable (e.g. a file in the working
tree), there is no fundamental reason why the new interface has "no
chance to run prepare to calculate the oid in advance".  It's just
that such a caller is not added by the series, and we
chose not to allow the "prepare and then write" two-step process,
because we currently do not need it when this series lands.

> I am very tempted to ask why we do not do this to _all_ loose object
> files.  Instead of running the machinery twice over the data (once to
> compute the object name, then to compute the contents and write out),
> if we can produce loose object files of any size with a single pass,
> wouldn't that be an overall win?

There is a patch later in the series whose proposed log message has
benchmarks to show that it is slower in general.  It still is
curious where the slowness comes from and if it is something we can
tune, though.

Thanks.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-04 10:10                     ` [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
  2022-06-06 19:44                       ` Junio C Hamano
@ 2022-06-07 19:53                       ` Neeraj Singh
  2022-06-08 15:34                         ` Junio C Hamano
  2022-06-09  3:05                         ` [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object() Han Xin
  1 sibling, 2 replies; 211+ messages in thread
From: Neeraj Singh @ 2022-06-07 19:53 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason, git
  Cc: Junio C Hamano, Han Xin, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Jiang Xin

On 6/4/2022 3:10 AM, Ævar Arnfjörð Bjarmason wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
> 
> If we want to unpack and write a loose object using "write_loose_object",
> we have to feed it with a buffer of the same size as the object, which
> will consume lots of memory and may cause OOM. This can be improved by
> feeding data to "stream_loose_object()" in a stream.
> 
> Add a new function "stream_loose_object()", which is a stream version of
> "write_loose_object()" but with a low memory footprint. We will use this
> function to unpack large blob objects in a later commit.
> 
> Another difference with "write_loose_object()" is that we have no chance
> to run "write_object_file_prepare()" to calculate the oid in advance.
> In "write_loose_object()", we know the oid and we can write the
> temporary file in the same directory as the final object, but for an
> object with an undetermined oid, we don't know the exact directory for
> the object.
> 
> Still, we need to save the temporary file we're preparing
> somewhere. We'll do that in the top-level ".git/objects/"
> directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
> streamed it we'll know the OID, and will move it to its canonical
> path.
> 

I think this new logic doesn't play well with batched-fsync. Even 
though we don't know the final OID, we should still call
prepare_loose_object_bulk_checkin to potentially create the bulk checkin 
objdir.


> diff --git a/object-file.c b/object-file.c
> index 7946fa5e088..9fd449693c4 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -2119,6 +2119,106 @@ static int freshen_packed_object(const struct object_id *oid)
>   	return 1;
>   }
>   
> +int stream_loose_object(struct input_stream *in_stream, size_t len,
> +			struct object_id *oid)
> +{
> +	int fd, ret, err = 0, flush = 0;
> +	unsigned char compressed[4096];
> +	git_zstream stream;
> +	git_hash_ctx c;
> +	struct strbuf tmp_file = STRBUF_INIT;
> +	struct strbuf filename = STRBUF_INIT;
> +	int dirlen;
> +	char hdr[MAX_HEADER_LEN];
> +	int hdrlen;
> +
> +	/* Since oid is not determined, save tmp file to odb path. */
> +	strbuf_addf(&filename, "%s/", get_object_directory());
> +	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
> +
> +	/*
> +	 * Common steps for write_loose_object and stream_loose_object to
> +	 * start writing loose objects:
> +	 *
> +	 *  - Create tmpfile for the loose object.
> +	 *  - Setup zlib stream for compression.
> +	 *  - Start to feed header to zlib stream.
> +	 */
> +	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
> +				       &stream, compressed, sizeof(compressed),
> +				       &c, hdr, hdrlen);
> +	if (fd < 0) {
> +		err = -1;
> +		goto cleanup;
> +	}
> +
> +	/* Then the data itself.. */
> +	do {
> +		unsigned char *in0 = stream.next_in;
> +
> +		if (!stream.avail_in && !in_stream->is_finished) {
> +			const void *in = in_stream->read(in_stream, &stream.avail_in);
> +			stream.next_in = (void *)in;
> +			in0 = (unsigned char *)in;
> +			/* All data has been read. */
> +			if (in_stream->is_finished)
> +				flush = 1;
> +		}
> +		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
> +						compressed, sizeof(compressed));
> +		/*
> +		 * Unlike write_loose_object(), we do not have the entire
> +		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
> +		 * then we'll replenish them in the next input_stream->read()
> +		 * call when we loop.
> +		 */
> +	} while (ret == Z_OK || ret == Z_BUF_ERROR);
> +
> +	if (stream.total_in != len + hdrlen)
> +		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
> +		    (uintmax_t)len + hdrlen);
> +
> +	/* Common steps for write_loose_object and stream_loose_object to
> +	 * end writing loose object:
> +	 *
> +	 *  - End the compression of zlib stream.
> +	 *  - Get the calculated oid.
> +	 */
> +	if (ret != Z_STREAM_END)
> +		die(_("unable to stream deflate new object (%d)"), ret);
> +	ret = end_loose_object_common(&c, &stream, oid);
> +	if (ret != Z_OK)
> +		die(_("deflateEnd on stream object failed (%d)"), ret);
> +	close_loose_object(fd, tmp_file.buf);
> +

If batch fsync is enabled, the close_loose_object call will refrain from 
syncing the tmp file.

> +	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
> +		unlink_or_warn(tmp_file.buf);
> +		goto cleanup;
> +	}
> +
> +	loose_object_path(the_repository, &filename, oid);
> +

We expect this loose_object_path call to return a path in the bulk fsync 
object directory. It might not do so if we don't call 
prepare_loose_object_bulk_checkin.
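
For intuition, here is a toy of the overall shape being relied on
(Linux-only, using syncfs(); git's actual bulk-checkin flush is
implemented differently, but the point is the same): objects written
into the staging directory become durable via one flush at the end,
before being renamed into place.  A temp file written outside that
directory misses the batched flush.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	/* Toy batched-flush shape; not git's bulk-checkin code. */
	int main(void)
	{
		const char *tmp[] = { "staging/a", "staging/b" };
		const char *dst[] = { "objects/a", "objects/b" };
		int i, fd;

		mkdir("staging", 0755);	/* the "bulk checkin objdir" role */
		mkdir("objects", 0755);

		for (i = 0; i < 2; i++) {
			fd = open(tmp[i], O_WRONLY | O_CREAT, 0444);
			if (fd < 0 || write(fd, "data", 4) != 4)
				return 1;
			close(fd);	/* note: no per-file fsync() */
		}

		fd = open(".", O_RDONLY);
		if (fd < 0 || syncfs(fd))	/* one flush for the batch */
			return 1;
		close(fd);

		for (i = 0; i < 2; i++)		/* publish after the flush */
			if (rename(tmp[i], dst[i]))
				return 1;
		return 0;
	}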

In the new test case introduced in (7/7), we seem to be getting lucky
in that there are some small objects (commits) earlier in the packfile,
so we go through write_loose_object first.

Thanks for including me on the review!

-Neeraj

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-07 19:53                       ` Neeraj Singh
@ 2022-06-08 15:34                         ` Junio C Hamano
  2022-06-09  3:05                         ` [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object() Han Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Junio C Hamano @ 2022-06-08 15:34 UTC (permalink / raw)
  To: Neeraj Singh
  Cc: Ævar Arnfjörð Bjarmason, git, Han Xin, Jiang Xin,
	René Scharfe, Derrick Stolee, Philip Oakley, Neeraj Singh,
	Elijah Newren, Han Xin, Jiang Xin

Neeraj Singh <nksingh85@gmail.com> writes:

>> Still, we need to save the temporary file we're preparing
>> somewhere. We'll do that in the top-level ".git/objects/"
>> directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
>> streamed it we'll know the OID, and will move it to its canonical
>> path.
>> 
>
> I think this new logic doesn't play well with batched-fsync. Even
> though we don't know the final OID, we should still call
> prepare_loose_object_bulk_checkin to potentially create the bulk
> checkin objdir.

Good point.  Careful sanity checks like this are very much
appreciated.

> Thanks for including me on the review!

Yes, indeed.

Thanks, both.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object()
  2022-06-07 19:53                       ` Neeraj Singh
  2022-06-08 15:34                         ` Junio C Hamano
@ 2022-06-09  3:05                         ` Han Xin
  2022-06-09  7:35                           ` Neeraj Singh
  2022-06-09  9:30                           ` Johannes Schindelin
  1 sibling, 2 replies; 211+ messages in thread
From: Han Xin @ 2022-06-09  3:05 UTC (permalink / raw)
  To: nksingh85
  Cc: Han Xin, avarab, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, zhiyou.jx

Neeraj Singh[1] pointed out that if batch fsync is enabled, we should still
call prepare_loose_object_bulk_checkin() to potentially create the bulk checkin
objdir.

1. https://lore.kernel.org/git/7ba4858a-d1cc-a4eb-b6d6-4c04a5dd6ce7@gmail.com/

Signed-off-by: Han Xin <chiyutianyi@gmail.com>
---
 object-file.c                   |  3 +++
 t/t5351-unpack-large-objects.sh | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/object-file.c b/object-file.c
index 2dd828b45b..3a1be74775 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2131,6 +2131,9 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 
+	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
+		prepare_loose_object_bulk_checkin();
+
 	/* Since oid is not determined, save tmp file to odb path. */
 	strbuf_addf(&filename, "%s/", get_object_directory());
 	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
index 461ca060b2..a66a51f7df 100755
--- a/t/t5351-unpack-large-objects.sh
+++ b/t/t5351-unpack-large-objects.sh
@@ -18,7 +18,10 @@ test_expect_success "create large objects (1.5 MB) and PACK" '
 	test_commit --append foo big-blob &&
 	test-tool genrandom bar 1500000 >big-blob &&
 	test_commit --append bar big-blob &&
-	PACK=$(echo HEAD | git pack-objects --revs pack)
+	PACK=$(echo HEAD | git pack-objects --revs pack) &&
+	git verify-pack -v pack-$PACK.pack |
+	    grep -E "commit|tree|blob" |
+		sed -n -e "s/^\([0-9a-f]*\).*/\1/p" >obj-list
 '
 
 test_expect_success 'set memory limitation to 1MB' '
@@ -45,6 +48,16 @@ test_expect_success 'unpack big object in stream' '
 	test_dir_is_empty dest.git/objects/pack
 '
 
+BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
+
+test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
+	prepare_dest 1m &&
+	git $BATCH_CONFIGURATION -C dest.git unpack-objects <pack-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack &&
+	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
+	cmp obj-list current
+'
+
 test_expect_success 'do not unpack existing large objects' '
 	prepare_dest 1m &&
 	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-06 18:35                       ` Junio C Hamano
@ 2022-06-09  4:10                         ` Han Xin
  2022-06-09 18:27                           ` Junio C Hamano
  0 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-06-09  4:10 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Git List, Jiang Xin,
	René Scharfe, Derrick Stolee, Philip Oakley, Neeraj Singh,
	Elijah Newren, Han Xin, Jiang Xin

On Tue, Jun 7, 2022 at 2:35 AM Junio C Hamano <gitster@pobox.com> wrote:
>
> Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > As the name implies, "get_data(size)" will allocate and return a given
> > amount of memory. Allocating memory for a large blob object may cause the
> > system to run out of memory. In preparation for replacing the call to
> > "get_data()" when unpacking large blob objects in later commits, refactor
> > "get_data()" to reduce memory footprint for dry_run mode.
> >
> > Because in dry_run mode, "get_data()" is only used to check the
> > integrity of data, and the returned buffer is not used at all, we can
> > allocate a smaller buffer and reuse it as zstream output. Therefore,
>
> "reuse" -> "use"
>
> > in dry_run mode, "get_data()" will release the allocated buffer and
> > return NULL instead of returning garbage data.
>
> It makes it sound as if we used to return garbage data, but I do not
> think that is what happened in reality.  Perhaps rewrite the last
> sentence like
>
>         Make the function return NULL in the dry-run mode, as no
>         callers use the returned buffer.
>
> or something?
>
> The overall logic sounds quite sensible.
>
> > The "find [...]objects/?? -type f | wc -l" test idiom being used here
> > is adapted from the same "find" use added to another test in
> > d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).
>
>
> > +/*
> > + * Decompress zstream from stdin and return specific size of data.
>
> "specific size"?  The caller specifies the size of data (because it
> knows a-priori how many bytes the zstream should inflate to), so
>
>     Decompress zstream from the standard input into a newly
>     allocated buffer of specified size and return the buffer.
>
> or something, perhaps.  In any case, it needs to say that the caller
> is responsible for giving the "right" size.
>
> > + * The caller is responsible to free the returned buffer.
> > + *
> > + * But for dry_run mode, "get_data()" is only used to check the
> > + * integrity of data, and the returned buffer is not used at all.
> > + * Therefore, in dry_run mode, "get_data()" will release the small
> > + * allocated buffer which is reused to hold temporary zstream output
> > + * and return NULL instead of returning garbage data.
> > + */
> >  static void *get_data(unsigned long size)
> >  {
> >       git_zstream stream;
> > -     void *buf = xmallocz(size);
> > +     unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
> > +     void *buf = xmallocz(bufsize);
>
> OK.
>
> >       memset(&stream, 0, sizeof(stream));
> >
> >       stream.next_out = buf;
> > -     stream.avail_out = size;
> > +     stream.avail_out = bufsize;
> >       stream.next_in = fill(1);
> >       stream.avail_in = len;
> >       git_inflate_init(&stream);
> > @@ -125,8 +136,15 @@ static void *get_data(unsigned long size)
>
> What's hidden in the pre-context is this bit:
>
>                 int ret = git_inflate(&stream, 0);
>                 use(len - stream.avail_in);
>                 if (stream.total_out == size && ret == Z_STREAM_END)
>                         break;
>                 if (ret != Z_OK) {
>                         error("inflate returned %d", ret);
>                         FREE_AND_NULL(buf);
>                         if (!recover)
>                                 exit(1);
>                         has_errors = 1;
>                         break;
>                 }
>
> and it is correct to use "size", not "bufsize", for this check.
> Unless we receive exactly the caller-specified "size" bytes from the
> inflated zstream with Z_STREAM_END, we want to detect an error and
> bail out.
>
> I am not sure if this is not loosening the error checking in the
> dry-run case, though.  In the original code, we set the avail_out
> to the total expected size so
>
>  (1) if the caller gives too small a size, git_inflate() would stop
>      at stream.total_out with ret that is not STREAM_END nor OK,
>      bypassing the "break", and we catch the error.
>
>  (2) if the caller gives too large a size, git_inflate() would stop
>      at the true size of inflated zstream, with STREAM_END and would
>      not hit this "break", and we catch the error.
>
> With the new code, since we keep refreshing avail_out (see below),
> git_inflate() does not even learn how many bytes we are _expecting_
> to see.  Is the error checking in the loop, with the updated code,
> catch the mismatch between expected and actual size (plausibly
> caused by a corrupted zstream) the same way as we do in the
> non dry-run code path?
>

Unlike the original implementation, if we get a corrupted zstream, we
won't break out at Z_BUFFER_ERROR, perhaps not until we've read all
the input. I think it can still catch the mismatch between expected
and actual size when "fill(1)" gets an EOF, if that is not too late.

Thanks.
-Han Xin

> >               }
> >               stream.next_in = fill(1);
> >               stream.avail_in = len;
> > +             if (dry_run) {
> > +                     /* reuse the buffer in dry_run mode */
> > +                     stream.next_out = buf;
> > +                     stream.avail_out = bufsize;
> > +             }
> >       }
> >       git_inflate_end(&stream);
> > +     if (dry_run)
> > +             FREE_AND_NULL(buf);
> >       return buf;
> >  }

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-06 20:02                         ` Junio C Hamano
@ 2022-06-09  6:04                           ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-09  6:04 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Git List, Jiang Xin,
	René Scharfe, Derrick Stolee, Philip Oakley, Neeraj Singh,
	Elijah Newren, Jiang Xin

On Tue, Jun 7, 2022 at 4:03 AM Junio C Hamano <gitster@pobox.com> wrote:
>
> Junio C Hamano <gitster@pobox.com> writes:
>
> > I am very tempted to ask why we do not do this to _all_ loose object
> > files.  Instead of running the machinery twice over the data (once to
> > compute the object name, then to compute the contents and write out),
> > if we can produce loose object files of any size with a single pass,
> > wouldn't that be an overall win?
>
> There is a patch later in the series whose proposed log message has
> benchmarks to show that it is slower in general.  It still is
> curious where the slowness comes from and if it is something we can
> tune, though.
>

Compared with getting the whole object buffer, stream_loose_object()
uses a limited avail_in buffer and never fills in new content until
the whole avail_in has been deflated. Due to the limited avail_out,
this produces small avail_in fragments, and I think it is precisely
these avail_in fragments that generate the additional git_deflate()
loops.

In "unpack-objects", we use a buffer size of 8192. Increasing the
buffer can alleviate this problem, but maybe it's not worth it?
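
A stand-alone toy shows the effect (not git code; it only counts raw
deflate() calls against zlib, with a 4096-byte output buffer like our
"compressed[4096]"):

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	/*
	 * Toy only: deflate the same input fed in fixed-size chunks
	 * vs. handed over whole, counting deflate() calls.  The small
	 * avail_in refills are what multiply the loop iterations.
	 */
	static long deflate_calls(const unsigned char *in, size_t len,
				  size_t chunk)
	{
		unsigned char out[4096];
		z_stream s;
		long calls = 0;
		int flush;

		memset(&s, 0, sizeof(s));
		deflateInit(&s, Z_DEFAULT_COMPRESSION);
		do {
			if (!s.avail_in && len) {
				s.next_in = (unsigned char *)in;
				s.avail_in = len < chunk ? len : chunk;
				in += s.avail_in;
				len -= s.avail_in;
			}
			flush = len ? Z_NO_FLUSH : Z_FINISH;
			s.next_out = out;
			s.avail_out = sizeof(out);
			calls++;
		} while (deflate(&s, flush) != Z_STREAM_END);
		deflateEnd(&s);
		return calls;
	}

	int main(void)
	{
		static unsigned char buf[16 << 20];	/* 16MB of zeros */

		printf("8K chunks: %ld calls\n",
		       deflate_calls(buf, sizeof(buf), 8192));
		printf("whole buf: %ld calls\n",
		       deflate_calls(buf, sizeof(buf), sizeof(buf)));
		return 0;
	}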

> Thanks.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-06 19:44                       ` Junio C Hamano
  2022-06-06 20:02                         ` Junio C Hamano
@ 2022-06-09  6:14                         ` Han Xin
  1 sibling, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-09  6:14 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Git List, Jiang Xin,
	René Scharfe, Derrick Stolee, Philip Oakley, Neeraj Singh,
	Elijah Newren, Han Xin, Jiang Xin

On Tue, Jun 7, 2022 at 3:44 AM Junio C Hamano <gitster@pobox.com> wrote:
>
>
> > "freshen_packed_object()" or "freshen_loose_object()" will be called
> > inside "stream_loose_object()" after obtaining the "oid".
>
> That much we can read from the patch text.  Saying just "we do X"
> without explaining "why we do so" in the proposed log message leaves
> readers more confused than otherwise.  Why is it worth pointing out
> in the proposed log message?  Is the reason why we need to do so
> involve something tricky?
>

Yes, it really should be made clear why this is done here.

Thanks.
-Han Xin

> > +     ret = end_loose_object_common(&c, &stream, oid);
> > +     if (ret != Z_OK)
> > +             die(_("deflateEnd on stream object failed (%d)"), ret);
> > +     close_loose_object(fd, tmp_file.buf);
> > +
> > +     if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
> > +             unlink_or_warn(tmp_file.buf);
> > +             goto cleanup;
>
> So, we were told to write an object, we wrote to a temporary file,
> and we wanted to mark the object to be recent and found that there
> indeed is already the object.  We remove the temporary and do not
> leave the new copy of the object, and the value of err at this point
> is 0 (success) which is what is returned from cleanup: label.
>
> Good.
>

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object()
  2022-06-09  3:05                         ` [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object() Han Xin
@ 2022-06-09  7:35                           ` Neeraj Singh
  2022-06-09  9:30                           ` Johannes Schindelin
  1 sibling, 0 replies; 211+ messages in thread
From: Neeraj Singh @ 2022-06-09  7:35 UTC (permalink / raw)
  To: Han Xin
  Cc: avarab, git, gitster, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, zhiyou.jx

On 6/8/2022 8:05 PM, Han Xin wrote:
> Neeraj Singh[1] pointed out that if batch fsync is enabled, we should still
> call prepare_loose_object_bulk_checkin() to potentially create the bulk checkin
> objdir.
> 
> 1. https://lore.kernel.org/git/7ba4858a-d1cc-a4eb-b6d6-4c04a5dd6ce7@gmail.com/
> 
> Signed-off-by: Han Xin <chiyutianyi@gmail.com>
> ---
>   object-file.c                   |  3 +++
>   t/t5351-unpack-large-objects.sh | 15 ++++++++++++++-
>   2 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/object-file.c b/object-file.c
> index 2dd828b45b..3a1be74775 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -2131,6 +2131,9 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
>   	char hdr[MAX_HEADER_LEN];
>   	int hdrlen;
>   
> +	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
> +		prepare_loose_object_bulk_checkin();
> +
>   	/* Since oid is not determined, save tmp file to odb path. */
>   	strbuf_addf(&filename, "%s/", get_object_directory());
>   	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
> diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
> index 461ca060b2..a66a51f7df 100755
> --- a/t/t5351-unpack-large-objects.sh
> +++ b/t/t5351-unpack-large-objects.sh
> @@ -18,7 +18,10 @@ test_expect_success "create large objects (1.5 MB) and PACK" '
>   	test_commit --append foo big-blob &&
>   	test-tool genrandom bar 1500000 >big-blob &&
>   	test_commit --append bar big-blob &&
> -	PACK=$(echo HEAD | git pack-objects --revs pack)
> +	PACK=$(echo HEAD | git pack-objects --revs pack) &&
> +	git verify-pack -v pack-$PACK.pack |
> +	    grep -E "commit|tree|blob" |
> +		sed -n -e "s/^\([0-9a-f]*\).*/\1/p" >obj-list
>   '
>   
>   test_expect_success 'set memory limitation to 1MB' '
> @@ -45,6 +48,16 @@ test_expect_success 'unpack big object in stream' '
>   	test_dir_is_empty dest.git/objects/pack
>   '
>   
> +BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
> +
> +test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
> +	prepare_dest 1m &&
> +	git $BATCH_CONFIGURATION -C dest.git unpack-objects <pack-$PACK.pack &&
> +	test_dir_is_empty dest.git/objects/pack &&
> +	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
> +	cmp obj-list current
> +'
> +
>   test_expect_success 'do not unpack existing large objects' '
>   	prepare_dest 1m &&
>   	git -C dest.git index-pack --stdin <pack-$PACK.pack &&

This fix looks good to me.

Thanks.

-Neeraj

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object()
  2022-06-09  3:05                         ` [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object() Han Xin
  2022-06-09  7:35                           ` Neeraj Singh
@ 2022-06-09  9:30                           ` Johannes Schindelin
  2022-06-10 12:55                             ` Han Xin
  1 sibling, 1 reply; 211+ messages in thread
From: Johannes Schindelin @ 2022-06-09  9:30 UTC (permalink / raw)
  To: Han Xin
  Cc: nksingh85, avarab, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, zhiyou.jx

Hi,

On Thu, 9 Jun 2022, Han Xin wrote:

> Neeraj Singh[1] pointed out that if batch fsync is enabled, we should still
> call prepare_loose_object_bulk_checkin() to potentially create the bulk checkin
> objdir.
>
> 1. https://lore.kernel.org/git/7ba4858a-d1cc-a4eb-b6d6-4c04a5dd6ce7@gmail.com/
>
> Signed-off-by: Han Xin <chiyutianyi@gmail.com>

I like a good commit message that is concise and yet has all the necessary
information. Well done!

> ---
>  object-file.c                   |  3 +++
>  t/t5351-unpack-large-objects.sh | 15 ++++++++++++++-
>  2 files changed, 17 insertions(+), 1 deletion(-)
>
> diff --git a/object-file.c b/object-file.c
> index 2dd828b45b..3a1be74775 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -2131,6 +2131,9 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen;
>
> +	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
> +		prepare_loose_object_bulk_checkin();
> +

Makes sense.

>  	/* Since oid is not determined, save tmp file to odb path. */
>  	strbuf_addf(&filename, "%s/", get_object_directory());
>  	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
> diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
> index 461ca060b2..a66a51f7df 100755
> --- a/t/t5351-unpack-large-objects.sh
> +++ b/t/t5351-unpack-large-objects.sh
> @@ -18,7 +18,10 @@ test_expect_success "create large objects (1.5 MB) and PACK" '
>  	test_commit --append foo big-blob &&
>  	test-tool genrandom bar 1500000 >big-blob &&
>  	test_commit --append bar big-blob &&
> -	PACK=$(echo HEAD | git pack-objects --revs pack)
> +	PACK=$(echo HEAD | git pack-objects --revs pack) &&
> +	git verify-pack -v pack-$PACK.pack |
> +	    grep -E "commit|tree|blob" |
> +		sed -n -e "s/^\([0-9a-f]*\).*/\1/p" >obj-list

Here, I would recommend avoiding the pipe, to ensure that we would catch
problems in the `verify-pack` invocation, and I think we can avoid the
`grep` altogether:

	git verify-pack -v pack-$PACK.pack >out &&
	sed -n 's/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\)/\1/p' \
		<out >obj-list

>  '
>
>  test_expect_success 'set memory limitation to 1MB' '
> @@ -45,6 +48,16 @@ test_expect_success 'unpack big object in stream' '
>  	test_dir_is_empty dest.git/objects/pack
>  '
>
> +BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
> +
> +test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
> +	prepare_dest 1m &&
> +	git $BATCH_CONFIGURATION -C dest.git unpack-objects <pack-$PACK.pack &&

I think the canonical way would be to use `test_config core.fsync ...`,
but the presented way works, too.

> +	test_dir_is_empty dest.git/objects/pack &&
> +	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&

Good. The `--batch-check="%(objectname)"` part forces `cat-file` to read
the actual object.

> +	cmp obj-list current
> +'

My main question about this test case is whether it _actually_ verifies
that the batch-mode `fsync()`ing took place.

I kind of had expected to see Trace2 enabled and a `grep` for
`fsync/hardware-flush`. Do you think that would still make sense to add?

Thank you for working on the `fsync()` aspects of Git!
Dscho

> +
>  test_expect_success 'do not unpack existing large objects' '
>  	prepare_dest 1m &&
>  	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
> --
> 2.36.1
>
>

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-09  4:10                         ` Han Xin
@ 2022-06-09 18:27                           ` Junio C Hamano
  2022-06-10  1:50                             ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Junio C Hamano @ 2022-06-09 18:27 UTC (permalink / raw)
  To: Han Xin
  Cc: Ævar Arnfjörð Bjarmason, Git List, Jiang Xin,
	René Scharfe, Derrick Stolee, Philip Oakley, Neeraj Singh,
	Elijah Newren, Han Xin, Jiang Xin

Han Xin <chiyutianyi@gmail.com> writes:

>> I am not sure if this is not loosening the error checking in the
>> dry-run case, though.  In the original code, we set the avail_out
>> to the total expected size so
>>
>>  (1) if the caller gives too small a size, git_inflate() would stop
>>      at stream.total_out with ret that is not STREAM_END nor OK,
>>      bypassing the "break", and we catch the error.
>>
>>  (2) if the caller gives too large a size, git_inflate() would stop
>>      at the true size of inflated zstream, with STREAM_END and would
>>      not hit this "break", and we catch the error.
>>
>> With the new code, since we keep refreshing avail_out (see below),
>> git_inflate() does not even learn how many bytes we are _expecting_
>> to see.  Is the error checking in the loop, with the updated code,
>> catch the mismatch between expected and actual size (plausibly
>> caused by a corrupted zstream) the same way as we do in the
>> non dry-run code path?
>>
>
> Unlike the original implementation, if we get a corrupted zstream, we
> won't break at Z_BUFFER_ERROR, maybe until we've read all the
> input. I think it can still catch the mismatch between expected and
> actual size when "fill(1)" gets an EOF, if it's not too late.

That is only one half of the two possible failure cases, i.e. input
is shorter than the expected size.  If the caller specified size is
smaller than what the stream inflates to, I do not see the new code
to be limiting the .avail_out near the end of the iteration, which
would be necessary to catch such an error, even if we are not
interested in using the inflated contents, no?


^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-09 18:27                           ` Junio C Hamano
@ 2022-06-10  1:50                             ` Han Xin
  2022-06-10  2:05                               ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-06-10  1:50 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Git List, Jiang Xin,
	René Scharfe, Derrick Stolee, Philip Oakley, Neeraj Singh,
	Elijah Newren, Han Xin, Jiang Xin

On Fri, Jun 10, 2022 at 2:27 AM Junio C Hamano <gitster@pobox.com> wrote:
>
> Han Xin <chiyutianyi@gmail.com> writes:
>
> >> I am not sure if this is not loosening the error checking in the
> >> dry-run case, though.  In the original code, we set the avail_out
> >> to the total expected size so
> >>
> >>  (1) if the caller gives too small a size, git_inflate() would stop
> >>      at stream.total_out with ret that is not STREAM_END nor OK,
> >>      bypassing the "break", and we catch the error.
> >>
> >>  (2) if the caller gives too large a size, git_inflate() would stop
> >>      at the true size of inflated zstream, with STREAM_END and would
> >>      not hit this "break", and we catch the error.
> >>
> >> With the new code, since we keep refreshing avail_out (see below),
> >> git_inflate() does not even learn how many bytes we are _expecting_
> >> to see.  Is the error checking in the loop, with the updated code,
> >> catch the mismatch between expected and actual size (plausibly
> >> caused by a corrupted zstream) the same way as we do in the
> >> non dry-run code path?
> >>
> >
> > Unlike the original implementation, if we get a corrupted zstream, we
> > won't break at Z_BUFFER_ERROR, maybe until we've read all the
> > input. I think it can still catch the mismatch between expected and
> > actual size when "fill(1)" gets an EOF, if it's not too late.
>
> That is only one half of the two possible failure cases, i.e. input
> is shorter than the expected size.  If the caller specified size is
> smaller than what the stream inflates to, I do not see the new code
> to be limiting the .avail_out near the end of the iteration, which
> would be necessary to catch such an error, even if we are not
> interested in using the inflated contents, no?
>

Yes, you are right.

Instead of always using a fixed "bufsize" even if there is not enough
expected output remaining, we can get a more accurate one by comparing
"total_out" to "size", so we can catch problems early by getting
Z_BUFFER_ERROR.

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 64abba8dba..5d59144883 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -139,7 +139,8 @@ static void *get_data(unsigned long size)
                if (dry_run) {
                        /* reuse the buffer in dry_run mode */
                        stream.next_out = buf;
-                       stream.avail_out = bufsize;
+                       stream.avail_out = bufsize > size - stream.total_out ?
+                               size - stream.total_out : bufsize;
                }
        }
        git_inflate_end(&stream);

Thanks
-Han Xin

^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-10  1:50                             ` Han Xin
@ 2022-06-10  2:05                               ` Ævar Arnfjörð Bjarmason
  2022-06-10 12:04                                 ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2022-06-10  2:05 UTC (permalink / raw)
  To: Han Xin
  Cc: Junio C Hamano, Git List, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Jiang Xin


On Fri, Jun 10 2022, Han Xin wrote:

> On Fri, Jun 10, 2022 at 2:27 AM Junio C Hamano <gitster@pobox.com> wrote:
>>
>> Han Xin <chiyutianyi@gmail.com> writes:
>>
>> >> I am not sure if this is not loosening the error checking in the
>> >> dry-run case, though.  In the original code, we set the avail_out
>> >> to the total expected size so
>> >>
>> >>  (1) if the caller gives too small a size, git_inflate() would stop
>> >>      at stream.total_out with ret that is not STREAM_END nor OK,
>> >>      bypassing the "break", and we catch the error.
>> >>
>> >>  (2) if the caller gives too large a size, git_inflate() would stop
>> >>      at the true size of inflated zstream, with STREAM_END and would
>> >>      not hit this "break", and we catch the error.
>> >>
>> >> With the new code, since we keep refreshing avail_out (see below),
>> >> git_inflate() does not even learn how many bytes we are _expecting_
>> >> to see.  Is the error checking in the loop, with the updated code,
>> >> catch the mismatch between expected and actual size (plausibly
>> >> caused by a corrupted zstream) the same way as we do in the
>> >> non dry-run code path?
>> >>
>> >
>> > Unlike the original implementation, if we get a corrupted zstream, we
>> > won't break at Z_BUFFER_ERROR, maybe until we've read all the
>> > input. I think it can still catch the mismatch between expected and
>> > actual size when "fill(1)" gets an EOF, if it's not too late.
>>
>> That is only one half of the two possible failure cases, i.e. input
>> is shorter than the expected size.  If the caller specified size is
>> smaller than what the stream inflates to, I do not see the new code
>> to be limiting the .avail_out near the end of the iteration, which
>> would be necessary to catch such an error, even if we are not
>> interested in using the inflated contents, no?
>>
>
> Yes, you are right.
>
> Instead of always using a fixed "bufsize" even if there is not enough
> expected output remaining, we can get a more accurate one by comparing
> "total_out" to "size", so we can catch problems early by getting
> Z_BUFFER_ERROR.
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 64abba8dba..5d59144883 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -139,7 +139,8 @@ static void *get_data(unsigned long size)
>                 if (dry_run) {
>                         /* reuse the buffer in dry_run mode */
>                         stream.next_out = buf;
> -                       stream.avail_out = bufsize;
> +                       stream.avail_out = bufsize > size - stream.total_out ?
> +                               size - stream.total_out : bufsize;
>                 }
>         }
>         git_inflate_end(&stream);
>
> Thanks
> -Han Xin

Han, do you want to pick this up again for a v14? It looks like you're
very on top of it already, and I re-sent your patches because I saw that
your
https://lore.kernel.org/git/cover.1653015534.git.chiyutianyi@gmail.com/
wasn't picked up in the interim & you hadn't been active on-list
otherwise.

But it looks like there's some interest now, and that you have more time
to test & follow up on this topic than I do at the moment, so if you
wanted to do the work of properly rebasing it onto the recent fsync
changes, that would be great. Thanks.

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-10  2:05                               ` Ævar Arnfjörð Bjarmason
@ 2022-06-10 12:04                                 ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-10 12:04 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Git List, Jiang Xin, René Scharfe,
	Derrick Stolee, Philip Oakley, Neeraj Singh, Elijah Newren,
	Han Xin, Jiang Xin

On Fri, Jun 10, 2022 at 10:07 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Fri, Jun 10 2022, Han Xin wrote:
>
> > On Fri, Jun 10, 2022 at 2:27 AM Junio C Hamano <gitster@pobox.com> wrote:
> >>
> >> Han Xin <chiyutianyi@gmail.com> writes:
> >>
> >> >> I am not sure if this is not loosening the error checking in the
> >> >> dry-run case, though.  In the original code, we set the avail_out
> >> >> to the total expected size so
> >> >>
> >> >>  (1) if the caller gives too small a size, git_inflate() would stop
> >> >>      at stream.total_out with ret that is not STREAM_END nor OK,
> >> >>      bypassing the "break", and we catch the error.
> >> >>
> >> >>  (2) if the caller gives too large a size, git_inflate() would stop
> >> >>      at the true size of inflated zstream, with STREAM_END and would
> >> >>      not hit this "break", and we catch the error.
> >> >>
> >> >> With the new code, since we keep refreshing avail_out (see below),
> >> >> git_inflate() does not even learn how many bytes we are _expecting_
> >> >> to see.  Is the error checking in the loop, with the updated code,
> >> >> catch the mismatch between expected and actual size (plausibly
> >> >> caused by a corrupted zstream) the same way as we do in the
> >> >> non dry-run code path?
> >> >>
> >> >
> >> > Unlike the original implementation, if we get a corrupted zstream, we
> >> > won't break at Z_BUFFER_ERROR, maybe until we've read all the
> >> > input. I think it can still catch the mismatch between expected and
> >> > actual size when "fill(1)" gets an EOF, if it's not too late.
> >>
> >> That is only one half of the two possible failure cases, i.e. input
> >> is shorter than the expected size.  If the caller specified size is
> >> smaller than what the stream inflates to, I do not see the new code
> >> to be limiting the .avail_out near the end of the iteration, which
> >> would be necessary to catch such an error, even if we are not
> >> interested in using the inflated contents, no?
> >>
> >
> > Yes, you are right.
> >
> > Instead of always using a fixed "bufsize" even if there is not enough
> > expected output remaining, we can get a more accurate one by comparing
> > "total_out" to "size", so we can catch problems early by getting
> > Z_BUFFER_ERROR.
> >
> > diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> > index 64abba8dba..5d59144883 100644
> > --- a/builtin/unpack-objects.c
> > +++ b/builtin/unpack-objects.c
> > @@ -139,7 +139,8 @@ static void *get_data(unsigned long size)
> >                 if (dry_run) {
> >                         /* reuse the buffer in dry_run mode */
> >                         stream.next_out = buf;
> > -                       stream.avail_out = bufsize;
> > +                       stream.avail_out = bufsize > size - stream.total_out ?
> > +                               size - stream.total_out : bufsize;
> >                 }
> >         }
> >         git_inflate_end(&stream);
> >
> > Thanks
> > -Han Xin
>
> Han, do you want to pick this up again for a v14? It looks like you're
> very on top of it already, and I re-sent your patches because I saw that
> your
> https://lore.kernel.org/git/cover.1653015534.git.chiyutianyi@gmail.com/
> wasn't picked up in the interim & you hadn't been active on-list
> otherwise.
>
> But it looks like there's some interest now, and that you have more time
> to test & follow-up on this topic than I do at the moment, so if you
> wanted to do the work of properly rebasing ot in tho recent fsync
> changes that would be great. Thanks.

OK, I am glad to do that.

Thank you very much.

-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object()
  2022-06-09  9:30                           ` Johannes Schindelin
@ 2022-06-10 12:55                             ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-10 12:55 UTC (permalink / raw)
  To: Johannes Schindelin
  Cc: nksingh85, Ævar Arnfjörð Bjarmason, Git List,
	Junio C Hamano, René Scharfe, Neeraj Singh, Elijah Newren,
	Philip Oakley, Derrick Stolee, Jiang Xin, Jiang Xin

On Thu, Jun 9, 2022 at 5:30 PM Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
>
> Hi,
>
> On Thu, 9 Jun 2022, Han Xin wrote:
>
> > Neeraj Singh[1] pointed out that if batch fsync is enabled, we should still
> > call prepare_loose_object_bulk_checkin() to potentially create the bulk checkin
> > objdir.
> >
> > 1. https://lore.kernel.org/git/7ba4858a-d1cc-a4eb-b6d6-4c04a5dd6ce7@gmail.com/
> >
> > Signed-off-by: Han Xin <chiyutianyi@gmail.com>
>
> I like a good commit message that is concise and yet has all the necessary
> information. Well done!
>
> > ---
> >  object-file.c                   |  3 +++
> >  t/t5351-unpack-large-objects.sh | 15 ++++++++++++++-
> >  2 files changed, 17 insertions(+), 1 deletion(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index 2dd828b45b..3a1be74775 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -2131,6 +2131,9 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >
> > +     if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
> > +             prepare_loose_object_bulk_checkin();
> > +
>
> Makes sense.
>
> >       /* Since oid is not determined, save tmp file to odb path. */
> >       strbuf_addf(&filename, "%s/", get_object_directory());
> >       hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
> > diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
> > index 461ca060b2..a66a51f7df 100755
> > --- a/t/t5351-unpack-large-objects.sh
> > +++ b/t/t5351-unpack-large-objects.sh
> > @@ -18,7 +18,10 @@ test_expect_success "create large objects (1.5 MB) and PACK" '
> >       test_commit --append foo big-blob &&
> >       test-tool genrandom bar 1500000 >big-blob &&
> >       test_commit --append bar big-blob &&
> > -     PACK=$(echo HEAD | git pack-objects --revs pack)
> > +     PACK=$(echo HEAD | git pack-objects --revs pack) &&
> > +     git verify-pack -v pack-$PACK.pack |
> > +         grep -E "commit|tree|blob" |
> > +             sed -n -e "s/^\([0-9a-f]*\).*/\1/p" >obj-list
>
> Here, I would recommend avoiding the pipe, to ensure that we would catch
> problems in the `verify-pack` invocation, and I think we can avoid the
> `grep` altogether:
>
>         git verify-pack -v pack-$PACK.pack >out &&
>         sed -n 's/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\)/\1/p' \
>                 <out >obj-list
>

Good suggestion. I will take it.

Thanks.
-Han Xin

> >  '
> >
> >  test_expect_success 'set memory limitation to 1MB' '
> > @@ -45,6 +48,16 @@ test_expect_success 'unpack big object in stream' '
> >       test_dir_is_empty dest.git/objects/pack
> >  '
> >
> > +BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
> > +
> > +test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
> > +     prepare_dest 1m &&
> > +     git $BATCH_CONFIGURATION -C dest.git unpack-objects <pack-$PACK.pack &&
>
> I think the canonical way would be to use `test_config core.fsync ...`,
> but the presented way works, too.
>
> > +     test_dir_is_empty dest.git/objects/pack &&
> > +     git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
>
> Good. The `--batch-check="%(objectname)"` part forces `cat-file` to read
> the actual object.
>
> > +     cmp obj-list current
> > +'
>
> My main question about this test case is whether it _actually_ verifies
> that the batch-mode `fsync()`ing took place.
>
> I kind of had expected to see Trace2 enabled and a `grep` for
> `fsync/hardware-flush`. Do you think that would still make sense to add?
>
> Thank you for working on the `fsync()` aspects of Git!
> Dscho
>

More rigorous inspection should be adopted.

Thanks.
-Han Xin

> > +
> >  test_expect_success 'do not unpack existing large objects' '
> >       prepare_dest 1m &&
> >       git -C dest.git index-pack --stdin <pack-$PACK.pack &&
> > --
> > 2.36.1
> >
> >

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v14 0/7] unpack-objects: support streaming blobs to disk
  2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
                                       ` (6 preceding siblings ...)
  2022-06-04 10:10                     ` [PATCH v13 7/7] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
@ 2022-06-10 14:46                     ` Han Xin
  2022-06-10 14:46                       ` [PATCH v14 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
                                         ` (7 more replies)
  7 siblings, 8 replies; 211+ messages in thread
From: Han Xin @ 2022-06-10 14:46 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, git, gitster, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, Neeraj Singh

This series makes "unpack-objects" capable of streaming large objects
to disk.

As 7/7 shows streaming e.g. a 100MB blob now uses ~5MB of memory
instead of ~105MB. This streaming method is slower if you've got
memory to handle the blobs in-core, but if you don't, it allows you to
unpack objects at all, as you might otherwise OOM.

Changes since v13:

* Make the error checking in the loop of get_data() work the same way
  as it does in the non dry-run mode.

* Add batched disk flushes for stream_loose_object(). This is pointed
  out by Neeraj Singh[1].

* Minor typo/grammar/comment etc. fixes throughout.

1. https://lore.kernel.org/git/7ba4858a-d1cc-a4eb-b6d6-4c04a5dd6ce7@gmail.com/

Han Xin (4):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: use stream_loose_object() to unpack large objects

Ævar Arnfjörð Bjarmason (3):
  object-file.c: do fsync() and close() before post-write die()
  object-file.c: factor out deflate part of write_loose_object()
  core doc: modernize core.bigFileThreshold documentation

 Documentation/config/core.txt   |  33 +++--
 builtin/unpack-objects.c        | 106 ++++++++++++--
 object-file.c                   | 240 +++++++++++++++++++++++++++-----
 object-store.h                  |   8 ++
 t/t5351-unpack-large-objects.sh |  76 ++++++++++
 5 files changed, 408 insertions(+), 55 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

Range-diff against v13:
1:  6703df6350 ! 1:  bf600a2fa8 unpack-objects: low memory footprint for get_data() in dry_run mode
    @@ Commit message
     
         Because in dry_run mode, "get_data()" is only used to check the
         integrity of data, and the returned buffer is not used at all, we can
    -    allocate a smaller buffer and reuse it as zstream output. Therefore,
    -    in dry_run mode, "get_data()" will release the allocated buffer and
    -    return NULL instead of returning garbage data.
    +    allocate a smaller buffer and use it as zstream output. Make the function
    +    return NULL in the dry-run mode, as no callers use the returned buffer.
     
         The "find [...]objects/?? -type f | wc -l" test idiom being used here
         is adapted from the same "find" use added to another test in
    @@ builtin/unpack-objects.c: static void use(int bytes)
      }
      
     +/*
    -+ * Decompress zstream from stdin and return specific size of data.
    ++ * Decompress zstream from the standard input into a newly
    ++ * allocated buffer of specified size and return the buffer.
     + * The caller is responsible to free the returned buffer.
     + *
     + * But for dry_run mode, "get_data()" is only used to check the
    @@ builtin/unpack-objects.c: static void *get_data(unsigned long size)
     +		if (dry_run) {
     +			/* reuse the buffer in dry_run mode */
     +			stream.next_out = buf;
    -+			stream.avail_out = bufsize;
    ++			stream.avail_out = bufsize > size - stream.total_out ?
    ++						   size - stream.total_out :
    ++						   bufsize;
     +		}
      	}
      	git_inflate_end(&stream);
2:  6e289d25c1 = 2:  a327f484f7 object-file.c: do fsync() and close() before post-write die()
3:  46f9def06c ! 3:  9bc8002282 object-file.c: refactor write_loose_object() to several steps
    @@ object-file.c: static int create_tmpfile(struct strbuf *tmp, const char *filenam
     + *
     + * - End the compression of zlib stream.
     + * - Get the calculated oid to "oid".
    -+ * - fsync() and close() the "fd"
     + */
     +static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
     +				   struct object_id *oid)
4:  5a95ebede6 = 4:  7c73815f18 object-file.c: factor out deflate part of write_loose_object()
5:  26847541aa ! 5:  28a9588f9c object-file.c: add "stream_loose_object()" to handle large object
    @@ Commit message
         path.
     
         "freshen_packed_object()" or "freshen_loose_object()" will be called
    -    inside "stream_loose_object()" after obtaining the "oid".
    +    inside "stream_loose_object()" after obtaining the "oid". After the
     +    temporary file is written, we want to mark the object as recent, and
     +    we may find that the object indeed already exists. In that case we
     +    should remove the temporary file and not leave a new copy of the object.
     
         Helped-by: René Scharfe <l.s.r@web.de>
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +	char hdr[MAX_HEADER_LEN];
     +	int hdrlen;
     +
    ++	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
    ++		prepare_loose_object_bulk_checkin();
    ++
     +	/* Since oid is not determined, save tmp file to odb path. */
     +	strbuf_addf(&filename, "%s/", get_object_directory());
     +	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
    @@ object-file.c: static int freshen_packed_object(const struct object_id *oid)
     +		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
     +		    (uintmax_t)len + hdrlen);
     +
    -+	/* Common steps for write_loose_object and stream_loose_object to
    ++	/*
    ++	 * Common steps for write_loose_object and stream_loose_object to
      +	 * end writing loose object:
     +	 *
     +	 *  - End the compression of zlib stream.
6:  eb962b60b9 ! 6:  dea5c4172b core doc: modernize core.bigFileThreshold documentation
    @@ Documentation/config/core.txt: You probably do not need to adjust this value.
     +Files above the configured limit will be:
      +
     -Common unit suffixes of 'k', 'm', or 'g' are supported.
    -+* Stored deflated, without attempting delta compression.
    ++* Stored deflated in packfiles, without attempting delta compression.
     ++
     +The default limit is primarily set with this use-case in mind. With it
     +most projects will have their source code and other text files delta
    @@ Documentation/config/core.txt: You probably do not need to adjust this value.
     +usage, at the slight expense of increased disk usage.
     ++
      +* Will be treated as if they were labeled "binary" (see
    -+  linkgit:gitattributes[5]). This means that e.g. linkgit:git-log[1]
    -+  and linkgit:git-diff[1] will not diffs for files above this limit.
    ++  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
     ++  linkgit:git-diff[1] will not show diffs for files above this limit.
     ++
      +* Will generally be streamed when written, which avoids excessive
     +memory usage, at the cost of some fixed overhead. Commands that make
7:  88a2754fcb ! 7:  d236230a4c unpack-objects: use stream_loose_object() to unpack large objects
    @@ t/t5351-unpack-large-objects.sh: test_description='git unpack-objects with large
      }
      
      test_expect_success "create large objects (1.5 MB) and PACK" '
    +@@ t/t5351-unpack-large-objects.sh: test_expect_success "create large objects (1.5 MB) and PACK" '
    + 	test_commit --append foo big-blob &&
    + 	test-tool genrandom bar 1500000 >big-blob &&
    + 	test_commit --append bar big-blob &&
    +-	PACK=$(echo HEAD | git pack-objects --revs pack)
    ++	PACK=$(echo HEAD | git pack-objects --revs pack) &&
    ++	git verify-pack -v pack-$PACK.pack >out &&
    ++	sed -n -e "s/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\).*/\1/p" \
    ++		<out >obj-list
    + '
    + 
    + test_expect_success 'set memory limitation to 1MB' '
     @@ t/t5351-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1MB' '
      '
      
    @@ t/t5351-unpack-large-objects.sh: test_expect_success 'set memory limitation to 1
     +	test_dir_is_empty dest.git/objects/pack
     +'
     +
    ++BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
    ++
    ++test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
    ++	prepare_dest 1m &&
    ++	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
    ++		git -C dest.git $BATCH_CONFIGURATION unpack-objects <pack-$PACK.pack &&
    ++	grep fsync/hardware-flush trace2.txt &&
    ++	test_dir_is_empty dest.git/objects/pack &&
    ++	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
    ++	cmp obj-list current
    ++'
    ++
     +test_expect_success 'do not unpack existing large objects' '
     +	prepare_dest 1m &&
     +	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
-- 
2.36.1


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v14 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
@ 2022-06-10 14:46                       ` Han Xin
  2022-06-10 14:46                       ` [PATCH v14 2/7] object-file.c: do fsync() and close() before post-write die() Han Xin
                                         ` (6 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-10 14:46 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, chiyutianyi, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, Neeraj Singh, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
amount of memory. Allocating memory for a large blob object may cause the
system to run out of memory. In preparation for replacing the call to
"get_data()" when unpacking large blob objects in later commits, refactor
"get_data()" to reduce the memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and use it as zstream output. Make the function
return NULL in the dry-run mode, as no callers use the returned buffer.

The "find [...]objects/?? -type f | wc -l" test idiom being used here
is adapted from the same "find" use added to another test in
d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c        | 37 ++++++++++++++++++++---------
 t/t5351-unpack-large-objects.sh | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 11 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 56d05e2725..32e8b47059 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -97,15 +97,27 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from the standard input into a newly
+ * allocated buffer of specified size and return the buffer.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -125,8 +137,17 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize > size - stream.total_out ?
+						   size - stream.total_out :
+						   bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -326,10 +347,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +378,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +415,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
new file mode 100755
index 0000000000..8d84313221
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs pack)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+	test_stdout_line_count = 0 find dest.git/objects -type f &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v14 2/7] object-file.c: do fsync() and close() before post-write die()
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
  2022-06-10 14:46                       ` [PATCH v14 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
@ 2022-06-10 14:46                       ` Han Xin
  2022-06-10 21:10                         ` René Scharfe
  2022-06-10 14:46                       ` [PATCH v14 3/7] object-file.c: refactor write_loose_object() to several steps Han Xin
                                         ` (5 subsequent siblings)
  7 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-06-10 14:46 UTC (permalink / raw)
  To: avarab
  Cc: chiyutianyi, git, gitster, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, Neeraj Singh

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Change write_loose_object() to do an fsync() and close() before the
oideq() sanity check at the end. This change re-joins code that was
split up by the die() sanity check added in 748af44c63e (sha1_file: be
paranoid when creating loose objects, 2010-02-21).

I don't think that this change matters in itself: if we called die(),
it was possible that our data wouldn't fully make it to disk, but in
any case we were writing data that we'd consider corrupted. It's
possible that a subsequent "git fsck" will be less confused now.

The real reason to make this change is that in a subsequent commit
we'll split this code in write_loose_object() into a utility function;
all its callers will want the preceding sanity checks, but not the
"oideq" check. By moving the close_loose_object() earlier it'll be
easier to reason about the introduction of the utility function.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/object-file.c b/object-file.c
index 79eb8339b6..e4a83012ba 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2012,12 +2012,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
 	the_hash_algo->final_oid_fn(&parano_oid, &c);
+	close_loose_object(fd, tmp_file.buf);
+
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
 
-	close_loose_object(fd, tmp_file.buf);
-
 	if (mtime) {
 		struct utimbuf utb;
 		utb.actime = mtime;
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v14 3/7] object-file.c: refactor write_loose_object() to several steps
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
  2022-06-10 14:46                       ` [PATCH v14 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
  2022-06-10 14:46                       ` [PATCH v14 2/7] object-file.c: do fsync() and close() before post-write die() Han Xin
@ 2022-06-10 14:46                       ` Han Xin
  2022-06-10 14:46                       ` [PATCH v14 4/7] object-file.c: factor out deflate part of write_loose_object() Han Xin
                                         ` (4 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-10 14:46 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, chiyutianyi, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, Neeraj Singh, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer with the whole content of the blob, and this behavior will
consume lots of memory and may cause OOM. We will introduce a streaming
version ("stream_loose_object()") in a later commit to resolve
this issue.

Before introducing that streaming function, do some refactoring on
"write_loose_object()" to reuse code for both versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.
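
For orientation, a condensed sketch of the resulting flow (helper names
as introduced in the diff below; error handling omitted):

	loose_object_path(the_repository, &filename, oid);        /* 1 */
	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
				       &stream, compressed,
				       sizeof(compressed),
				       &c, hdr, hdrlen);          /* 2 */
	do {                                                      /* 3 */
		ret = git_deflate(&stream, Z_FINISH);
		/* ...hash the consumed input, write out the output... */
	} while (ret == Z_OK);
	ret = end_loose_object_common(&c, &stream, &parano_oid);  /* 4 */
	close_loose_object(fd, tmp_file.buf);                     /* 5 */
	/* ...then finalize the object file */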

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 101 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 75 insertions(+), 26 deletions(-)

diff --git a/object-file.c b/object-file.c
index e4a83012ba..f4d7f8c109 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1951,6 +1951,74 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+				   struct object_id *oid)
+{
+	int ret;
+
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		return ret;
+	the_hash_algo->final_oid_fn(oid, c);
+
+	return Z_OK;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1968,28 +2036,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -2007,11 +2058,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
 		    ret);
-	ret = git_deflate_end_gently(&stream);
+	ret = end_loose_object_common(&c, &stream, &parano_oid);
 	if (ret != Z_OK)
-		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
-		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
+		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret);
 	close_loose_object(fd, tmp_file.buf);
 
 	if (!oideq(oid, &parano_oid))
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v14 4/7] object-file.c: factor out deflate part of write_loose_object()
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
                                         ` (2 preceding siblings ...)
  2022-06-10 14:46                       ` [PATCH v14 3/7] object-file.c: refactor write_loose_object() to several steps Han Xin
@ 2022-06-10 14:46                       ` Han Xin
  2022-06-10 14:46                       ` [PATCH v14 5/7] object-file.c: add "stream_loose_object()" to handle large object Han Xin
                                         ` (3 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-10 14:46 UTC (permalink / raw)
  To: avarab
  Cc: chiyutianyi, git, gitster, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, Neeraj Singh

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Split out the part of write_loose_object() that deals with calling
git_deflate() into a utility function, a subsequent commit will
introduce another function that'll make use of it.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index f4d7f8c109..cfae54762e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2000,6 +2000,28 @@ static int start_loose_object_common(struct strbuf *tmp_file,
 	return fd;
 }
 
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+				     git_zstream *stream, const int flush,
+				     unsigned char *in0, const int fd,
+				     unsigned char *compressed,
+				     const size_t compressed_len)
+{
+	int ret;
+
+	ret = git_deflate(stream, flush ? Z_FINISH : 0);
+	the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+	if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+		die(_("unable to write loose object file"));
+	stream->next_out = compressed;
+	stream->avail_out = compressed_len;
+
+	return ret;
+}
+
 /**
  * Common steps for loose object writers to end writing loose objects:
  *
@@ -2047,12 +2069,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
+
+		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+						compressed, sizeof(compressed));
 	} while (ret == Z_OK);
 
 	if (ret != Z_STREAM_END)
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v14 5/7] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
                                         ` (3 preceding siblings ...)
  2022-06-10 14:46                       ` [PATCH v14 4/7] object-file.c: factor out deflate part of write_loose_object() Han Xin
@ 2022-06-10 14:46                       ` Han Xin
  2022-06-10 14:46                       ` [PATCH v14 6/7] core doc: modernize core.bigFileThreshold documentation Han Xin
                                         ` (2 subsequent siblings)
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-10 14:46 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, chiyutianyi, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, Neeraj Singh, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using "write_loose_object()",
we have to feed it a buffer the same size as the object, which
will consume lots of memory and may cause OOM. This can be improved by
feeding data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", which is a streaming version
of "write_loose_object()" but with a low memory footprint. We will use
this function to unpack large blob objects in a later commit.

Another difference from "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object.

Still, we need to save the temporary file we're preparing
somewhere. We'll do that in the top-level ".git/objects/"
directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
streamed it we'll know the OID, and will move it to its canonical
path.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid". After the
temporary file is written, we want to mark the object as recent, and we
may find that the object already exists. In that case we should remove
the temporary file and not leave a new copy of the object.
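
To illustrate the interface, here is a minimal sketch (not part of this
series; "buf_stream" and "read_from_buf" are hypothetical names) of a
caller that streams an in-memory buffer through "stream_loose_object()":

	struct buf_stream {
		const unsigned char *buf;
		size_t len, pos;
	};

	static const void *read_from_buf(struct input_stream *in,
					 unsigned long *readlen)
	{
		struct buf_stream *b = in->data;
		size_t n = b->len - b->pos > 8192 ? 8192 : b->len - b->pos;
		const void *p = b->buf + b->pos;

		b->pos += n;
		in->is_finished = b->pos == b->len;
		*readlen = n;
		return p;
	}

	...
	struct buf_stream b = { .buf = buf, .len = size };
	struct input_stream in = { .read = read_from_buf, .data = &b };
	struct object_id oid;

	if (stream_loose_object(&in, size, &oid))
		die("failed to write object in stream");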

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c  | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   8 ++++
 2 files changed, 112 insertions(+)

diff --git a/object-file.c b/object-file.c
index cfae54762e..0b8383ad47 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2118,6 +2118,110 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
+		prepare_loose_object_bulk_checkin();
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose objects:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = 1;
+		}
+		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+						compressed, sizeof(compressed));
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	if (ret != Z_STREAM_END)
+		die(_("unable to stream deflate new object (%d)"), ret);
+	ret = end_loose_object_common(&c, &stream, oid);
+	if (ret != Z_OK)
+		die(_("deflateEnd on stream object failed (%d)"), ret);
+	close_loose_object(fd, tmp_file.buf);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 539ea43904..5222ee5460 100644
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -269,6 +275,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v14 6/7] core doc: modernize core.bigFileThreshold documentation
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
                                         ` (4 preceding siblings ...)
  2022-06-10 14:46                       ` [PATCH v14 5/7] object-file.c: add "stream_loose_object()" to handle large object Han Xin
@ 2022-06-10 14:46                       ` Han Xin
  2022-06-10 21:01                         ` Junio C Hamano
  2022-06-10 14:46                       ` [PATCH v14 7/7] unpack-objects: use stream_loose_object() to unpack large objects Han Xin
  2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
  7 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-06-10 14:46 UTC (permalink / raw)
  To: avarab
  Cc: chiyutianyi, git, gitster, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, Neeraj Singh

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

The core.bigFileThreshold documentation has been largely unchanged
since 5eef828bc03 (fast-import: Stream very large blobs directly to
pack, 2010-02-01).

But since then this setting has been expanded to affect a lot more
than that description indicated, most notably in how "git diff" treats
such files; see 6bf3b813486 (diff --stat: mark any file larger than
core.bigfilethreshold binary, 2014-08-16).

In addition to that, numerous commands and APIs make use of a
streaming mode for files above this threshold.

So let's attempt to summarize 12 years of changes in behavior, which
can be seen with:

    git log --oneline -Gbig_file_thre 5eef828bc03.. -- '*.c'

To do that, turn this into a bullet-point list. The summary Han Xin
produced in [1] helped a lot, but is a bit too detailed for
documentation aimed at users. Let's instead summarize how
user-observable behavior differs, and generally describe how we tend
to stream these files in various commands.

1. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/

Helped-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 41e330f306..f2e75dd824 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression.  Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms.  This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated in packfiles, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if though they were labeled "binary" (see
+  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
+  linkgit:git-diff[1] will not diffs for files above this limit.
++
+* Will be generally be streamed when written, which avoids excessive
+memory usage, at the cost of some fixed overhead. Commands that make
+use of this include linkgit:git-archive[1],
+linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
+linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* [PATCH v14 7/7] unpack-objects: use stream_loose_object() to unpack large objects
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
                                         ` (5 preceding siblings ...)
  2022-06-10 14:46                       ` [PATCH v14 6/7] core doc: modernize core.bigFileThreshold documentation Han Xin
@ 2022-06-10 14:46                       ` Han Xin
  2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
  7 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-10 14:46 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, chiyutianyi, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, Neeraj Singh, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Make use of the stream_loose_object() function introduced in the
preceding commit to unpack large objects. Before this we'd need to
malloc() the size of the blob before unpacking it, which could cause
OOM with very large blobs.

We could use the new streaming interface to unpack all blobs, but
doing so would be much slower, as demonstrated e.g. with this
benchmark using git-hyperfine[0]:

	rm -rf /tmp/scalar.git &&
	git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git &&
	mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack &&
	git hyperfine \
		-r 2 --warmup 1 \
		-L rev origin/master,HEAD -L v "10,512,1k,1m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack'

With this change we'll perform worse in terms of speed at lower
core.bigFileThreshold settings, but we're getting lower memory use in
return:

	Summary
	  './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'

A better benchmark to demonstrate the benefits is this one, which
creates an artificial repo with 1, 25, 50, 75 and 100MB blobs:

	rm -rf /tmp/repo &&
	git init /tmp/repo &&
	(
		cd /tmp/repo &&
		for i in 1 25 50 75 100
		do
			dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024
		done &&
		git add blob.* &&
		git commit -mblobs &&
		git gc &&
		PACK=$(echo .git/objects/pack/pack-*.pack) &&
		cp "$PACK" my.pack
	) &&
	git hyperfine \
		--show-output \
		-L rev origin/master,HEAD -L v "512,50m,100m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum'

Using this test we'll always use >100MB of memory on
origin/master (around ~105MB), but max out at e.g. ~55MB if we set
core.bigFileThreshold=50m.

The relevant "Maximum resident set size" lines were manually added
below the relevant benchmark:

  '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran
        Maximum resident set size (kbytes): 107080
    1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 106968
    1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 107032
    1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 107072
    1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 55704
    2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 4564

This shows that if you have enough memory, this new streaming method is
slower the lower you set the streaming threshold, but the benefit is
more bounded memory use.

An earlier version of this patch introduced a new
"core.bigFileStreamingThreshold" instead of re-using the existing
"core.bigFileThreshold" variable[1]. As noted in a detailed overview
of its users in [2], the existing variable has several different meanings.

Still, we consider it good enough to simply re-use it. While it's
possible that someone might want to e.g. consider objects "small" for
the purposes of diffing but "big" for the purposes of writing them,
such use-cases are probably too obscure to worry about. We can always
split up "core.bigFileThreshold" in the future if there's a need for
that.

0. https://github.com/avar/git-hyperfine/
1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt   |  4 +-
 builtin/unpack-objects.c        | 69 ++++++++++++++++++++++++++++++++-
 t/t5351-unpack-large-objects.sh | 43 ++++++++++++++++++--
 3 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index f2e75dd824..a599dcb96b 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -468,8 +468,8 @@ usage, at the slight expense of increased disk usage.
 * Will be generally be streamed when written, which avoids excessive
 memory usage, at the cost of some fixed overhead. Commands that make
 use of this include linkgit:git-archive[1],
-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-linkgit:git-fsck[1].
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 32e8b47059..43789b8ef2 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -351,6 +351,68 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 		write_object(nr, type, buf, size);
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
+}
+
 static int resolve_against_held(unsigned nr, const struct object_id *base,
 				void *delta_data, unsigned long delta_size)
 {
@@ -483,9 +545,14 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
 		unpack_non_delta_entry(type, size, nr);
 		return;
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
index 8d84313221..8ce8aa3b14 100755
--- a/t/t5351-unpack-large-objects.sh
+++ b/t/t5351-unpack-large-objects.sh
@@ -9,7 +9,8 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold "$1"
 }
 
 test_expect_success "create large objects (1.5 MB) and PACK" '
@@ -17,7 +18,10 @@ test_expect_success "create large objects (1.5 MB) and PACK" '
 	test_commit --append foo big-blob &&
 	test-tool genrandom bar 1500000 >big-blob &&
 	test_commit --append bar big-blob &&
-	PACK=$(echo HEAD | git pack-objects --revs pack)
+	PACK=$(echo HEAD | git pack-objects --revs pack) &&
+	git verify-pack -v pack-$PACK.pack >out &&
+	sed -n -e "s/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\).*/\1/p" \
+		<out >obj-list
 '
 
 test_expect_success 'set memory limitation to 1MB' '
@@ -26,16 +30,47 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
 	test_stdout_line_count = 0 find dest.git/objects -type f &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
+
+test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
+	prepare_dest 1m &&
+	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
+		git -C dest.git $BATCH_CONFIGURATION unpack-objects <pack-$PACK.pack &&
+	grep fsync/hardware-flush trace2.txt &&
+	test_dir_is_empty dest.git/objects/pack &&
+	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
+	cmp obj-list current
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+
+	# The destination came up with the exact same pack...
+	DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
+	test_cmp pack-$PACK.pack $DEST_PACK &&
+
+	# ...and wrote no loose objects
+	test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
+'
+
 test_done
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 211+ messages in thread

* Re: [PATCH v14 6/7] core doc: modernize core.bigFileThreshold documentation
  2022-06-10 14:46                       ` [PATCH v14 6/7] core doc: modernize core.bigFileThreshold documentation Han Xin
@ 2022-06-10 21:01                         ` Junio C Hamano
  0 siblings, 0 replies; 211+ messages in thread
From: Junio C Hamano @ 2022-06-10 21:01 UTC (permalink / raw)
  To: Han Xin
  Cc: avarab, git, l.s.r, neerajsi, newren, philipoakley, stolee,
	worldhello.net, Neeraj Singh

Han Xin <chiyutianyi@gmail.com> writes:

>  core.bigFileThreshold::

> +	The size of files considered "big", which as discussed below
> +	changes the behavior of numerous git commands, as well as how
> +	such files are stored within the repository. The default is
> +	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
> +	supported.
>  +
> +Files above the configured limit will be:
>  +
> +* Stored deflated in packfiles, without attempting delta compression.
> ++
> +The default limit is primarily set with this use-case in mind. With it

"With it" -> "With it,"

> +most projects will have their source code and other text files delta
> +compressed, but not larger binary media files.
> +
> +Storing large files without delta compression avoids excessive memory
> +usage, at the slight expense of increased disk usage.

Makes sense.

> +* Will be treated as if though they were labeled "binary" (see

"as if though" -> "as if"

> +  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
> +  linkgit:git-diff[1] will not diffs for files above this limit.

"will not diffs" -ECANNOTPARSE.  "will not compute diffs", probably?

> ++
> +* Will be generally be streamed when written, which avoids excessive

"be generally be" -> "generally be"

> +memory usage, at the cost of some fixed overhead. Commands that make
> +use of this include linkgit:git-archive[1],
> +linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
> +linkgit:git-fsck[1].
>  
>  core.excludesFile::
>  	Specifies the pathname to the file that contains patterns to

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v14 2/7] object-file.c: do fsync() and close() before post-write die()
  2022-06-10 14:46                       ` [PATCH v14 2/7] object-file.c: do fsync() and close() before post-write die() Han Xin
@ 2022-06-10 21:10                         ` René Scharfe
  2022-06-10 21:33                           ` Junio C Hamano
  0 siblings, 1 reply; 211+ messages in thread
From: René Scharfe @ 2022-06-10 21:10 UTC (permalink / raw)
  To: Han Xin, avarab
  Cc: git, gitster, neerajsi, newren, philipoakley, stolee,
	worldhello.net, Neeraj Singh

Am 10.06.22 um 16:46 schrieb Han Xin:
> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>
> Change write_loose_object() to do an fsync() and close() before the
> oideq() sanity check at the end. This change re-joins code that was
> split up by the die() sanity check added in 748af44c63e (sha1_file: be
> paranoid when creating loose objects, 2010-02-21).
>
> I don't think that this change matters in itself: if we called die(),
> it was possible that our data wouldn't fully make it to disk, but in
> any case we were writing data that we'd consider corrupted. It's
> possible that a subsequent "git fsck" will be less confused now.

This is done before renaming the file, so git fsck is going to see (at
most) a tmp_obj_?????? file, which it ignores in either case, right?

> The real reason to make this change is that in a subsequent commit
> we'll split this code in write_loose_object() into a utility function;
> all its callers will want the preceding sanity checks, but not the
> "oideq" check. By moving the close_loose_object() earlier it'll be
> easier to reason about the introduction of the utility function.

This sounds like the patch would move the close_loose_object() call to
some other place, but that's not the case.  The sequence below (starting
from the close_loose_object() call) is still present after applying the
whole series, so it seems this patch is not necessary.

>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>  object-file.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index 79eb8339b6..e4a83012ba 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -2012,12 +2012,12 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
>  		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
>  		    ret);
>  	the_hash_algo->final_oid_fn(&parano_oid, &c);
> +	close_loose_object(fd, tmp_file.buf);
> +
>  	if (!oideq(oid, &parano_oid))
>  		die(_("confused by unstable object source data for %s"),
>  		    oid_to_hex(oid));
>
> -	close_loose_object(fd, tmp_file.buf);
> -
>  	if (mtime) {
>  		struct utimbuf utb;
>  		utb.actime = mtime;

^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v14 2/7] object-file.c: do fsync() and close() before post-write die()
  2022-06-10 21:10                         ` René Scharfe
@ 2022-06-10 21:33                           ` Junio C Hamano
  2022-06-11  1:50                             ` Han Xin
  0 siblings, 1 reply; 211+ messages in thread
From: Junio C Hamano @ 2022-06-10 21:33 UTC (permalink / raw)
  To: René Scharfe
  Cc: Han Xin, avarab, git, neerajsi, newren, philipoakley, stolee,
	worldhello.net, Neeraj Singh

René Scharfe <l.s.r@web.de> writes:

> Am 10.06.22 um 16:46 schrieb Han Xin:
>> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>>
>> Change write_loose_object() to do an fsync() and close() before the
>> oideq() sanity check at the end. This change re-joins code that was
>> split up by the die() sanity check added in 748af44c63e (sha1_file: be
>> paranoid when creating loose objects, 2010-02-21).
>>
>> I don't think that this change matters in itself: if we called die(),
>> it was possible that our data wouldn't fully make it to disk, but in
>> any case we were writing data that we'd consider corrupted. It's
>> possible that a subsequent "git fsck" will be less confused now.
>
> This is done before renaming the file, so git fsck is going to see (at
> most) a tmp_obj_?????? file, which it ignores in either case, right?

Yes, I thought I pointed that out in my review on the previous
round, but I missed that it was still here in this round X-<.

Thanks for noticing.


^ permalink raw reply	[flat|nested] 211+ messages in thread

* Re: [PATCH v14 2/7] object-file.c: do fsync() and close() before post-write die()
  2022-06-10 21:33                           ` Junio C Hamano
@ 2022-06-11  1:50                             ` Han Xin
  0 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-11  1:50 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: René Scharfe, Ævar Arnfjörð Bjarmason,
	Git List, Neeraj Singh, Elijah Newren, Philip Oakley,
	Derrick Stolee, Jiang Xin, Neeraj Singh

On Sat, Jun 11, 2022 at 5:33 AM Junio C Hamano <gitster@pobox.com> wrote:
>
> René Scharfe <l.s.r@web.de> writes:
>
> > Am 10.06.22 um 16:46 schrieb Han Xin:
> >> From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> >>
> >> Change write_loose_object() to do an fsync() and close() before the
> >> oideq() sanity check at the end. This change re-joins code that was
> >> split up by the die() sanity check added in 748af44c63e (sha1_file: be
> >> paranoid when creating loose objects, 2010-02-21).
> >>
> >> I don't think that this change matters in itself: if we called die(),
> >> it was possible that our data wouldn't fully make it to disk, but in
> >> any case we were writing data that we'd consider corrupted. It's
> >> possible that a subsequent "git fsck" will be less confused now.
> >
> > This is done before renaming the file, so git fsck is going to see (at
> > most) a tmp_obj_?????? file, which it ignores in either case, right?
>
> Yes, I thought I pointed that out in my review on the previous
> round, but I missed that it was still here in this round X-<.
>
> Thanks for noticing.
>

Yes, I agree with both of you; I'll be removing this patch in the next series.

This patch was first introduced in v10[1], where close_loose_object() was
moved to end_loose_object_common(), but it was put back in v12[2]. It is
indeed no longer necessary now.

1. https://lore.kernel.org/git/patch-v10-3.6-0e33d2a6e35-20220204T135538Z-avarab@gmail.com/
2. https://lore.kernel.org/git/patch-v12-2.8-54060eb8c6b-20220329T135446Z-avarab@gmail.com/

Thank you both.
-Han Xin

^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v15 0/6] unpack-objects: support streaming blobs to disk
  2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
                                         ` (6 preceding siblings ...)
  2022-06-10 14:46                       ` [PATCH v14 7/7] unpack-objects: use stream_loose_object() to unpack large objects Han Xin
@ 2022-06-11  2:44                       ` Han Xin
  2022-06-11  2:44                         ` [PATCH v15 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
                                           ` (5 more replies)
  7 siblings, 6 replies; 211+ messages in thread
From: Han Xin @ 2022-06-11  2:44 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, git, gitster, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, Neeraj Singh

This series makes "unpack-objects" capable of streaming large objects
to disk.

As 6/6 shows, streaming e.g. a 100MB blob now uses ~5MB of memory
instead of ~105MB. This streaming method is slower if you've got
memory to handle the blobs in-core, but if you don't, it allows you to
unpack objects at all, as you might otherwise OOM.
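
One way to see the difference for yourself (adapted from the benchmark
in 6/6; the pack path is a placeholder):

	/usr/bin/time -v git -C dest.git -c core.bigFileThreshold=50m \
		unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum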

Changes since v14:

* Remove "object-file.c: do fsync() and close() before post-write die()"
  as it's not necessary anymore. It was first introduced in v10 and was
  no longer in the utility function end_loose_object_common() since v12.
  We can see the discussion[1].

* Minor grammar/comment etc. fixes throughout.

1. https://lore.kernel.org/git/0b9bc499-18c7-e8ab-5c89-f9e1a98685bc@web.de/

Han Xin (4):
  unpack-objects: low memory footprint for get_data() in dry_run mode
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: add "stream_loose_object()" to handle large object
  unpack-objects: use stream_loose_object() to unpack large objects

Ævar Arnfjörð Bjarmason (2):
  object-file.c: factor out deflate part of write_loose_object()
  core doc: modernize core.bigFileThreshold documentation

 Documentation/config/core.txt   |  33 +++--
 builtin/unpack-objects.c        | 106 +++++++++++++--
 object-file.c                   | 233 ++++++++++++++++++++++++++++----
 object-store.h                  |   8 ++
 t/t5351-unpack-large-objects.sh |  76 +++++++++++
 5 files changed, 405 insertions(+), 51 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

Range-diff against v14:
1:  bf600a2fa8 ! 1:  9a776f717d unpack-objects: low memory footprint for get_data() in dry_run mode
    @@ Commit message
         d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).
     
         Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
    -    Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Han Xin <chiyutianyi@gmail.com>
         Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## builtin/unpack-objects.c ##
2:  a327f484f7 < -:  ---------- object-file.c: do fsync() and close() before post-write die()
3:  9bc8002282 ! 2:  a1e090d338 object-file.c: refactor write_loose_object() to several steps
    @@ Commit message
     
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
    -    Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Han Xin <chiyutianyi@gmail.com>
         Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## object-file.c ##
    @@ object-file.c: static int write_loose_object(const struct object_id *oid, char *
     -	ret = git_deflate_end_gently(&stream);
     +	ret = end_loose_object_common(&c, &stream, &parano_oid);
      	if (ret != Z_OK)
    --		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
    --		    ret);
    + 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
    + 		    ret);
     -	the_hash_algo->final_oid_fn(&parano_oid, &c);
    -+		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret);
    - 	close_loose_object(fd, tmp_file.buf);
    - 
      	if (!oideq(oid, &parano_oid))
    + 		die(_("confused by unstable object source data for %s"),
    + 		    oid_to_hex(oid));
4:  7c73815f18 = 3:  0ddf912d47 object-file.c: factor out deflate part of write_loose_object()
5:  28a9588f9c ! 4:  f9e51d3c68 object-file.c: add "stream_loose_object()" to handle large object
    @@ Commit message
         Helped-by: René Scharfe <l.s.r@web.de>
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
    -    Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Han Xin <chiyutianyi@gmail.com>
         Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## object-file.c ##
6:  dea5c4172b ! 5:  61ae1c1632 core doc: modernize core.bigFileThreshold documentation
    @@ Documentation/config/core.txt: You probably do not need to adjust this value.
     -Common unit suffixes of 'k', 'm', or 'g' are supported.
     +* Stored deflated in packfiles, without attempting delta compression.
     ++
    -+The default limit is primarily set with this use-case in mind. With it
    ++The default limit is primarily set with this use-case in mind. With it,
     +most projects will have their source code and other text files delta
     +compressed, but not larger binary media files.
     ++
     +Storing large files without delta compression avoids excessive memory
     +usage, at the slight expense of increased disk usage.
     ++
    -+* Will be treated as if though they were labeled "binary" (see
    ++* Will be treated as if they were labeled "binary" (see
     +  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
    -+  linkgit:git-diff[1] will not diffs for files above this limit.
    ++  linkgit:git-diff[1] will not compute diffs for files above this limit.
     ++
    -+* Will be generally be streamed when written, which avoids excessive
    ++* Will generally be streamed when written, which avoids excessive
     +memory usage, at the cost of some fixed overhead. Commands that make
     +use of this include linkgit:git-archive[1],
     +linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
7:  d236230a4c ! 6:  5a4782d746 unpack-objects: use stream_loose_object() to unpack large objects
    @@ Commit message
         Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
         Helped-by: Derrick Stolee <stolee@gmail.com>
         Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
    -    Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
    +    Signed-off-by: Han Xin <chiyutianyi@gmail.com>
         Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
     
      ## Documentation/config/core.txt ##
     @@ Documentation/config/core.txt: usage, at the slight expense of increased disk usage.
    - * Will be generally be streamed when written, which avoids excessive
    + * Will generally be streamed when written, which avoids excessive
      memory usage, at the cost of some fixed overhead. Commands that make
      use of this include linkgit:git-archive[1],
     -linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-- 
2.36.1


^ permalink raw reply	[flat|nested] 211+ messages in thread

* [PATCH v15 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode
  2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
@ 2022-06-11  2:44                         ` Han Xin
  2022-06-11  2:44                         ` [PATCH v15 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
                                           ` (4 subsequent siblings)
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-11  2:44 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, chiyutianyi, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, Neeraj Singh, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a given
amount of memory. Allocating memory for a large blob object may cause the
system to run out of memory. In preparation for replacing calls to
"get_data()" to unpack large blob objects in later commits, refactor
"get_data()" to reduce its memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and use it as zstream output. Make the function
return NULL in dry_run mode, as no callers use the returned buffer.

The "find [...]objects/?? -type f | wc -l" test idiom being used here
is adapted from the same "find" use added to another test in
d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/unpack-objects.c        | 37 ++++++++++++++++++++---------
 t/t5351-unpack-large-objects.sh | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 11 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 56d05e2725..32e8b47059 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -97,15 +97,27 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress a zstream from the standard input into a newly
+ * allocated buffer of the specified size and return the buffer.
+ * The caller is responsible for freeing the returned buffer.
+ *
+ * But in dry_run mode, "get_data()" is only used to check the
+ * integrity of the data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * buffer, which is only reused to hold temporary zstream output,
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -125,8 +137,17 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize > size - stream.total_out ?
+						   size - stream.total_out :
+						   bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -326,10 +347,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +378,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +415,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
new file mode 100755
index 0000000000..8d84313221
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs pack)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+	test_stdout_line_count = 0 find dest.git/objects -type f &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done
-- 
2.36.1



* [PATCH v15 2/6] object-file.c: refactor write_loose_object() to several steps
  2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
  2022-06-11  2:44                         ` [PATCH v15 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
@ 2022-06-11  2:44                         ` Han Xin
  2022-06-11  2:44                         ` [PATCH v15 3/6] object-file.c: factor out deflate part of write_loose_object() Han Xin
                                           ` (3 subsequent siblings)
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-11  2:44 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, chiyutianyi, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, Neeraj Singh, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

When writing a large blob using "write_loose_object()", we have to pass
a buffer containing the whole content of the blob, which consumes lots
of memory and may cause OOM. We will introduce a streaming variant
("stream_loose_object()") in a later commit to resolve this issue.

Before introducing that streaming function, refactor
"write_loose_object()" so that its code can be reused by both versions.

Rewrite "write_loose_object()" as follows:

 1. Figure out a path for the (temp) object file. This step is only
    used in "write_loose_object()".

 2. Move common steps for starting to write loose objects into a new
    function "start_loose_object_common()".

 3. Compress data.

 4. Move common steps for ending zlib stream into a new function
    "end_loose_object_common()".

 5. Close fd and finalize the object file.
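
To see why the split helps, here is a minimal self-contained sketch of
the same start/end decomposition using plain zlib (not git's
git_zstream wrappers; hashing is omitted and all names are
illustrative). A whole-buffer writer and a chunked writer could then
share the two helpers:

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	/* Steps 1-2: set up the stream and feed the object header. */
	static int stream_start(z_stream *zs, unsigned char *out,
				size_t outlen, const char *hdr, size_t hdrlen)
	{
		memset(zs, 0, sizeof(*zs));
		if (deflateInit(zs, Z_DEFAULT_COMPRESSION) != Z_OK)
			return -1;
		zs->next_out = out;
		zs->avail_out = outlen;
		zs->next_in = (unsigned char *)hdr;
		zs->avail_in = hdrlen;
		while (zs->avail_in && deflate(zs, Z_NO_FLUSH) == Z_OK)
			; /* a small header is consumed in one call */
		return 0;
	}

	/* Steps 4-5: finish the stream and flush pending output. */
	static int stream_end(z_stream *zs, FILE *fp, unsigned char *out,
			      size_t outlen)
	{
		int ret;

		do {
			fwrite(out, 1, (size_t)(zs->next_out - out), fp);
			zs->next_out = out;
			zs->avail_out = outlen;
			ret = deflate(zs, Z_FINISH);
		} while (ret == Z_OK);
		fwrite(out, 1, (size_t)(zs->next_out - out), fp);
		deflateEnd(zs);
		return ret == Z_STREAM_END ? 0 : -1;
	}

	int main(void)
	{
		z_stream zs;
		unsigned char out[4096];

		/* "blob 0" plus its trailing NUL, like a loose header */
		if (stream_start(&zs, out, sizeof(out), "blob 0", 7))
			return 1;
		/* step 3, deflating the object data, would go here */
		return stream_end(&zs, stdout, out, sizeof(out)) ? 1 : 0;
	}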

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 98 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 24 deletions(-)

diff --git a/object-file.c b/object-file.c
index 79eb8339b6..b5bce03274 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1951,6 +1951,74 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/*  Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/*  Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+				   struct object_id *oid)
+{
+	int ret;
+
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		return ret;
+	the_hash_algo->final_oid_fn(oid, c);
+
+	return Z_OK;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1968,28 +2036,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -2007,11 +2058,10 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
 		    ret);
-	ret = git_deflate_end_gently(&stream);
+	ret = end_loose_object_common(&c, &stream, &parano_oid);
 	if (ret != Z_OK)
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
-- 
2.36.1



* [PATCH v15 3/6] object-file.c: factor out deflate part of write_loose_object()
  2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
  2022-06-11  2:44                         ` [PATCH v15 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
  2022-06-11  2:44                         ` [PATCH v15 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
@ 2022-06-11  2:44                         ` Han Xin
  2022-06-11  2:44                         ` [PATCH v15 4/6] object-file.c: add "stream_loose_object()" to handle large object Han Xin
                                           ` (2 subsequent siblings)
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-11  2:44 UTC (permalink / raw)
  To: avarab
  Cc: chiyutianyi, git, gitster, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, Neeraj Singh

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

Split out the part of write_loose_object() that deals with calling
git_deflate() into a utility function; a subsequent commit will
introduce another function that will make use of it.
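
The loop body being factored out is the standard zlib deflate pattern:
one deflate call, write out whatever was produced, rewind the output
buffer, and report what deflate() returned. A generic analogue in
plain zlib (a sketch, not the patch's code: the hash update is dropped
and a FILE * stands in for the tmpfile descriptor):

	#include <stdio.h>
	#include <zlib.h>

	/*
	 * One iteration of the deflate loop. The caller sets up
	 * next_out/avail_out once and loops while this returns Z_OK;
	 * with "flush" set, Z_STREAM_END terminates the loop.
	 */
	static int deflate_step(z_stream *zs, int flush, FILE *fp,
				unsigned char *out, size_t outlen)
	{
		int ret = deflate(zs, flush ? Z_FINISH : Z_NO_FLUSH);
		size_t produced = outlen - zs->avail_out;

		if (fwrite(out, 1, produced, fp) != produced)
			return Z_ERRNO;
		zs->next_out = out;	/* rewind for the next call */
		zs->avail_out = outlen;
		return ret;
	}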

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index b5bce03274..18dbf2a4e4 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2000,6 +2000,28 @@ static int start_loose_object_common(struct strbuf *tmp_file,
 	return fd;
 }
 
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+				     git_zstream *stream, const int flush,
+				     unsigned char *in0, const int fd,
+				     unsigned char *compressed,
+				     const size_t compressed_len)
+{
+	int ret;
+
+	ret = git_deflate(stream, flush ? Z_FINISH : 0);
+	the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+	if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+		die(_("unable to write loose object file"));
+	stream->next_out = compressed;
+	stream->avail_out = compressed_len;
+
+	return ret;
+}
+
 /**
  * Common steps for loose object writers to end writing loose objects:
  *
@@ -2047,12 +2069,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
+
+		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+						compressed, sizeof(compressed));
 	} while (ret == Z_OK);
 
 	if (ret != Z_STREAM_END)
-- 
2.36.1



* [PATCH v15 4/6] object-file.c: add "stream_loose_object()" to handle large object
  2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
                                           ` (2 preceding siblings ...)
  2022-06-11  2:44                         ` [PATCH v15 3/6] object-file.c: factor out deflate part of write_loose_object() Han Xin
@ 2022-06-11  2:44                         ` Han Xin
  2022-06-11  2:44                         ` [PATCH v15 5/6] core doc: modernize core.bigFileThreshold documentation Han Xin
  2022-06-11  2:44                         ` [PATCH v15 6/6] unpack-objects: use stream_loose_object() to unpack large objects Han Xin
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-11  2:44 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, chiyutianyi, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, Neeraj Singh, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

If we want to unpack and write a loose object using
"write_loose_object()", we have to feed it a buffer of the same size as
the object, which consumes lots of memory and may cause OOM. This can
be improved by feeding the data to "stream_loose_object()" in a stream.

Add a new function "stream_loose_object()", a streaming version of
"write_loose_object()" with a low memory footprint. We will use this
function to unpack large blob objects in a later commit.

Another difference from "write_loose_object()" is that we have no chance
to run "write_object_file_prepare()" to calculate the oid in advance.
In "write_loose_object()", we know the oid and we can write the
temporary file in the same directory as the final object, but for an
object with an undetermined oid, we don't know the exact directory for
the object.

Still, we need to save the temporary file we're preparing
somewhere. We'll do that in the top-level ".git/objects/"
directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
streamed it we'll know the OID, and will move it to its canonical
path.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid". After the
temporary file is written, we wants to mark the object to recent and we
may find that where indeed is already the object. We should remove the
temporary and do not leave a new copy of the object.
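
From a caller's perspective the new API amounts to filling in a
"struct input_stream" whose "read" callback hands back the next chunk
of data and sets "is_finished" once the source is drained. A minimal
hypothetical caller, assuming git's own tree and the declarations from
"object-store.h" ("empty_read" and "write_empty_blob" are illustrative
names; it streams a zero-length blob):

	static const void *empty_read(struct input_stream *in_stream,
				      unsigned long *readlen)
	{
		in_stream->is_finished = 1;	/* nothing left to feed */
		*readlen = 0;
		return NULL;
	}

	static int write_empty_blob(struct object_id *oid)
	{
		struct input_stream in_stream = {
			.read = empty_read,
		};

		return stream_loose_object(&in_stream, 0, oid);
	}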

Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 object-file.c  | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   8 ++++
 2 files changed, 112 insertions(+)

diff --git a/object-file.c b/object-file.c
index 18dbf2a4e4..2ca2576ab1 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2119,6 +2119,110 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
+		prepare_loose_object_bulk_checkin();
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose objects:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = 1;
+		}
+		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+						compressed, sizeof(compressed));
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	if (ret != Z_STREAM_END)
+		die(_("unable to stream deflate new object (%d)"), ret);
+	ret = end_loose_object_common(&c, &stream, oid);
+	if (ret != Z_OK)
+		die(_("deflateEnd on stream object failed (%d)"), ret);
+	close_loose_object(fd, tmp_file.buf);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 539ea43904..5222ee5460 100644
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -269,6 +275,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it
-- 
2.36.1



* [PATCH v15 5/6] core doc: modernize core.bigFileThreshold documentation
  2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
                                           ` (3 preceding siblings ...)
  2022-06-11  2:44                         ` [PATCH v15 4/6] object-file.c: add "stream_loose_object()" to handle large object Han Xin
@ 2022-06-11  2:44                         ` Han Xin
  2022-06-11  2:44                         ` [PATCH v15 6/6] unpack-objects: use stream_loose_object() to unpack large objects Han Xin
  5 siblings, 0 replies; 211+ messages in thread
From: Han Xin @ 2022-06-11  2:44 UTC (permalink / raw)
  To: avarab
  Cc: chiyutianyi, git, gitster, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, Neeraj Singh

From: Ævar Arnfjörð Bjarmason <avarab@gmail.com>

The core.bigFileThreshold documentation has been largely unchanged
since 5eef828bc03 (fast-import: Stream very large blobs directly to
pack, 2010-02-01).

But since then this setting has been expanded to affect a lot more
than that description indicated. Most notably in how "git diff" treats
them, see 6bf3b813486 (diff --stat: mark any file larger than
core.bigfilethreshold binary, 2014-08-16).

In addition to that, numerous commands and APIs make use of a
streaming mode for files above this threshold.

So let's attempt to summarize 12 years of changes in behavior, which
can be seen with:

    git log --oneline -Gbig_file_thre 5eef828bc03.. -- '*.c'

To do that turn this into a bullet-point list. The summary Han Xin
produced in [1] helped a lot, but is a bit too detailed for
documentation aimed at users. Let's instead summarize how
user-observable behavior differs, and generally describe how we tend
to stream these files in various commands.

1. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
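
As a usage example, a repository hosting large binary media could
lower the threshold so that more blobs take the streaming code paths
described above (any value with the documented 'k', 'm' or 'g'
suffixes works; 100m is just an illustration):

	git config core.bigFileThreshold 100m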

Helped-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 41e330f306..87e4c04836 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression.  Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms.  This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated in packfiles, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it,
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
+  linkgit:git-diff[1] will not compute diffs for files above this limit.
++
+* Will generally be streamed when written, which avoids excessive
+memory usage, at the cost of some fixed overhead. Commands that make
+use of this include linkgit:git-archive[1],
+linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
+linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
-- 
2.36.1



* [PATCH v15 6/6] unpack-objects: use stream_loose_object() to unpack large objects
  2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
                                           ` (4 preceding siblings ...)
  2022-06-11  2:44                         ` [PATCH v15 5/6] core doc: modernize core.bigFileThreshold documentation Han Xin
@ 2022-06-11  2:44                         ` Han Xin
  2022-07-01  2:01                           ` Junio C Hamano
  5 siblings, 1 reply; 211+ messages in thread
From: Han Xin @ 2022-06-11  2:44 UTC (permalink / raw)
  To: avarab
  Cc: Han Xin, chiyutianyi, git, gitster, l.s.r, neerajsi, newren,
	philipoakley, stolee, worldhello.net, Neeraj Singh, Jiang Xin

From: Han Xin <hanxin.hx@alibaba-inc.com>

Make use of the stream_loose_object() function introduced in the
preceding commit to unpack large objects. Before this we'd need to
malloc() the size of the blob before unpacking it, which could cause
OOM with very large blobs.

We could use the new streaming interface to unpack all blobs, but
doing so would be much slower, as demonstrated e.g. with this
benchmark using git-hyperfine[0]:

	rm -rf /tmp/scalar.git &&
	git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git &&
	mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack &&
	git hyperfine \
		-r 2 --warmup 1 \
		-L rev origin/master,HEAD -L v "10,512,1k,1m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack'

Here, with this change, we'll perform worse in terms of speed at lower
core.bigFileThreshold settings, but we get lower memory use in return:

	Summary
	  './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master'
	    1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'
	    1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD'

A better benchmark to demonstrate the benefits of this change is one
which creates an artificial repo with 1, 25, 50, 75 and 100MB blobs:

	rm -rf /tmp/repo &&
	git init /tmp/repo &&
	(
		cd /tmp/repo &&
		for i in 1 25 50 75 100
		do
			dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024
		done &&
		git add blob.* &&
		git commit -mblobs &&
		git gc &&
		PACK=$(echo .git/objects/pack/pack-*.pack) &&
		cp "$PACK" my.pack
	) &&
	git hyperfine \
		--show-output \
		-L rev origin/master,HEAD -L v "512,50m,100m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum'

Using this test we'll always use >100MB of memory on
origin/master (around ~105MB), but max out at e.g. ~55MB if we set
core.bigFileThreshold=50m.

The relevant "Maximum resident set size" lines were manually added
below the relevant benchmark:

  '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran
        Maximum resident set size (kbytes): 107080
    1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 106968
    1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master'
        Maximum resident set size (kbytes): 107032
    1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 107072
    1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 55704
    2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD'
        Maximum resident set size (kbytes): 4564

This shows that, if you have enough memory, the new streaming method
gets slower the lower you set the streaming threshold, but the benefit
is more bounded memory use.

An earlier version of this patch introduced a new
"core.bigFileStreamingThreshold" instead of re-using the existing
"core.bigFileThreshold" variable[1]. As noted in a detailed overview
of its users in [2] using it has several different meanings.

Still, we consider it good enough to simply re-use it. While it's
possible that someone might want to e.g. consider objects "small" for
the purposes of diffing but "big" for the purposes of writing them
such use-cases are probably too obscure to worry about. We can always
split up "core.bigFileThreshold" in the future if there's a need for
that.

0. https://github.com/avar/git-hyperfine/
1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 Documentation/config/core.txt   |  4 +-
 builtin/unpack-objects.c        | 69 ++++++++++++++++++++++++++++++++-
 t/t5351-unpack-large-objects.sh | 43 ++++++++++++++++++--
 3 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 87e4c04836..3ea3124f7f 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -468,8 +468,8 @@ usage, at the slight expense of increased disk usage.
 * Will generally be streamed when written, which avoids excessive
 memory usage, at the cost of some fixed overhead. Commands that make
 use of this include linkgit:git-archive[1],
-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-linkgit:git-fsck[1].
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 32e8b47059..43789b8ef2 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -351,6 +351,68 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 		write_object(nr, type, buf, size);
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
+}
+
 static int resolve_against_held(unsigned nr, const struct object_id *base,
 				void *delta_data, unsigned long delta_size)
 {
@@ -483,9 +545,14 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
 		unpack_non_delta_entry(type, size, nr);
 		return;
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
index 8d84313221..8ce8aa3b14 100755
--- a/t/t5351-unpack-large-objects.sh
+++ b/t/t5351-unpack-large-objects.sh
@@ -9,7 +9,8 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold "$1"
 }
 
 test_expect_success "create large objects (1.5 MB) and PACK" '
@@ -17,7 +18,10 @@ test_expect_success "create large objects (1.5 MB) and PACK" '
 	test_commit --append foo big-blob &&
 	test-tool genrandom bar 1500000 >big-blob &&
 	test_commit --append bar big-blob &&
-	PACK=$(echo HEAD | git pack-objects --revs pack)
+	PACK=$(echo HEAD | git pack-objects --revs pack) &&
+	git verify-pack -v pack-$PACK.pack >out &&
+	sed -n -e "s/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\).*/\1/p" \
+		<out >obj-list
 '
 
 test_expect_success 'set memory limitation to 1MB' '
@@ -26,16 +30,47 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
 test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
 	test_stdout_line_count = 0 find dest.git/objects -type f &&
 	test_dir_is_empty dest.git/objects/pack
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
+
+test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
+	prepare_dest 1m &&
+	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
+		git -C dest.git $BATCH_CONFIGURATION unpack-objects <pack-$PACK.pack &&
+	grep fsync/hardware-flush trace2.txt &&
+	test_dir_is_empty dest.git/objects/pack &&
+	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
+	cmp obj-list current
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+
+	# The destination came up with the exact same pack...
+	DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
+	test_cmp pack-$PACK.pack $DEST_PACK &&
+
+	# ...and wrote no loose objects
+	test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
+'
+
 test_done
-- 
2.36.1



* Re: [PATCH v15 6/6] unpack-objects: use stream_loose_object() to unpack large objects
  2022-06-11  2:44                         ` [PATCH v15 6/6] unpack-objects: use stream_loose_object() to unpack large objects Han Xin
@ 2022-07-01  2:01                           ` Junio C Hamano
  0 siblings, 0 replies; 211+ messages in thread
From: Junio C Hamano @ 2022-07-01  2:01 UTC (permalink / raw)
  To: Han Xin
  Cc: avarab, Han Xin, git, l.s.r, neerajsi, newren, philipoakley,
	stolee, worldhello.net, Neeraj Singh, Jiang Xin

Han Xin <chiyutianyi@gmail.com> writes:

> +BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
> +
> +test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
> +	prepare_dest 1m &&
> +	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
> +		git -C dest.git $BATCH_CONFIGURATION unpack-objects <pack-$PACK.pack &&
> +	grep fsync/hardware-flush trace2.txt &&
> +	test_dir_is_empty dest.git/objects/pack &&
> +	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
> +	cmp obj-list current
> +'

This test without any prerequisite expects that "hardware-flush"
will always appear in the trace, but is that reasonable?  Don't
we need either 

 (1) some sort of prerequisite to make sure this test piece runs
     only on platforms that will use hardware-flush, or

 (2) loosen grep pattern to look for just "fsync/", or

 (3) something else?
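
Concretely, (2) would amount to loosening the test to something like
(illustrative):

	grep "fsync/" trace2.txt

so that any fsync trace entry satisfies the check, not just a
hardware flush.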

It will become even worse when we queue Ævar's "trace2 squelch"
patch on top, as we will stop emitting trace entries for events that
did not trigger.


Thread overview: 211+ messages
2021-10-09  8:20 [PATCH] unpack-objects: unpack large object in stream Han Xin
2021-10-19  7:37 ` Han Xin
2021-10-20 14:42 ` Philip Oakley
2021-10-21  3:42   ` Han Xin
2021-10-21 22:47     ` Philip Oakley
2021-11-03  1:48 ` Han Xin
2021-11-03 10:07   ` Philip Oakley
2021-11-12  9:40 ` [PATCH v2 1/6] object-file: refactor write_loose_object() to support inputstream Han Xin
2021-11-18  4:59   ` Jiang Xin
2021-11-18  6:45     ` Junio C Hamano
2021-11-12  9:40 ` [PATCH v2 2/6] object-file.c: add dry_run mode for write_loose_object() Han Xin
2021-11-18  5:42   ` Jiang Xin
2021-11-12  9:40 ` [PATCH v2 3/6] object-file.c: handle nil oid in write_loose_object() Han Xin
2021-11-18  5:49   ` Jiang Xin
2021-11-12  9:40 ` [PATCH v2 4/6] object-file.c: read input stream repeatedly " Han Xin
2021-11-18  5:56   ` Jiang Xin
2021-11-12  9:40 ` [PATCH v2 5/6] object-store.h: add write_loose_object() Han Xin
2021-11-12  9:40 ` [PATCH v2 6/6] unpack-objects: unpack large object in stream Han Xin
2021-11-18  7:14   ` Jiang Xin
2021-11-22  3:32 ` [PATCH v3 0/5] unpack large objects " Han Xin
2021-11-29  7:01   ` Han Xin
2021-11-29 19:12     ` Jeff King
2021-11-30  2:57       ` Han Xin
2021-12-03  9:35   ` [PATCH v4 " Han Xin
2021-12-07 16:18     ` Derrick Stolee
2021-12-10 10:34     ` [PATCH v5 0/6] unpack large blobs " Han Xin
2021-12-17 11:26       ` Han Xin
2021-12-21 11:51         ` [PATCH v7 0/5] " Han Xin
2021-12-21 11:51         ` [PATCH v7 1/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
2021-12-21 14:09           ` Ævar Arnfjörð Bjarmason
2021-12-21 14:43             ` René Scharfe
2021-12-21 15:04               ` Ævar Arnfjörð Bjarmason
2021-12-22 11:15               ` Jiang Xin
2021-12-22 11:29             ` Jiang Xin
2021-12-31  3:06           ` Jiang Xin
2021-12-21 11:51         ` [PATCH v7 2/5] object-file API: add a format_object_header() function Han Xin
2021-12-21 14:30           ` René Scharfe
2022-02-01 14:28             ` C99 %z (was: [PATCH v7 2/5] object-file API: add a format_object_header() function) Ævar Arnfjörð Bjarmason
2021-12-31  3:12           ` [PATCH v7 2/5] object-file API: add a format_object_header() function Jiang Xin
2021-12-21 11:51         ` [PATCH v7 3/5] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
2021-12-21 14:16           ` Ævar Arnfjörð Bjarmason
2021-12-22 12:02             ` Jiang Xin
2021-12-21 11:52         ` [PATCH v7 4/5] object-file.c: add "write_stream_object_file()" to support read in stream Han Xin
2021-12-21 14:20           ` Ævar Arnfjörð Bjarmason
2021-12-21 15:05             ` Ævar Arnfjörð Bjarmason
2021-12-21 11:52         ` [PATCH v7 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-12-21 15:06           ` Ævar Arnfjörð Bjarmason
2021-12-31  3:19           ` Jiang Xin
2022-01-08  8:54         ` [PATCH v8 0/6] unpack large blobs in stream Han Xin
2022-01-20 11:21           ` [PATCH v9 0/5] " Han Xin
2022-02-01 21:24             ` Ævar Arnfjörð Bjarmason
2022-02-02  8:32               ` Han Xin
2022-02-02 10:59                 ` Ævar Arnfjörð Bjarmason
2022-02-04 14:07             ` [PATCH v10 0/6] unpack-objects: support streaming large objects to disk Ævar Arnfjörð Bjarmason
2022-02-04 14:07               ` [PATCH v10 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
2022-02-04 14:07               ` [PATCH v10 2/6] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
2022-02-04 14:07               ` [PATCH v10 3/6] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
2022-02-04 14:07               ` [PATCH v10 4/6] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
2022-02-04 14:07               ` [PATCH v10 5/6] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
2022-02-04 14:07               ` [PATCH v10 6/6] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
2022-03-19  0:23               ` [PATCH v11 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
2022-03-19  0:23                 ` [PATCH v11 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
2022-03-19  0:23                 ` [PATCH v11 2/8] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
2022-03-19  0:23                 ` [PATCH v11 3/8] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
2022-03-19 10:11                   ` René Scharfe
2022-03-19  0:23                 ` [PATCH v11 4/8] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
2022-03-19  0:23                 ` [PATCH v11 5/8] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
2022-03-19  0:23                 ` [PATCH v11 6/8] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
2022-03-19  0:23                 ` [PATCH v11 7/8] unpack-objects: refactor away unpack_non_delta_entry() Ævar Arnfjörð Bjarmason
2022-03-19  0:23                 ` [PATCH v11 8/8] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
2022-03-29 13:56                 ` [PATCH v12 0/8] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
2022-03-29 13:56                   ` [PATCH v12 1/8] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
2022-03-29 13:56                   ` [PATCH v12 2/8] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
2022-03-29 13:56                   ` [PATCH v12 3/8] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
2022-03-30  7:13                     ` Han Xin
2022-03-30 17:34                       ` Ævar Arnfjörð Bjarmason
2022-03-29 13:56                   ` [PATCH v12 4/8] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
2022-03-29 13:56                   ` [PATCH v12 5/8] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
2022-03-31 19:54                     ` Neeraj Singh
2022-03-29 13:56                   ` [PATCH v12 6/8] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
2022-03-29 13:56                   ` [PATCH v12 7/8] unpack-objects: refactor away unpack_non_delta_entry() Ævar Arnfjörð Bjarmason
2022-03-30 19:40                     ` René Scharfe
2022-03-31 12:42                       ` Ævar Arnfjörð Bjarmason
2022-03-31 16:38                         ` René Scharfe
2022-03-29 13:56                   ` [PATCH v12 8/8] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
2022-06-04 10:10                   ` [PATCH v13 0/7] unpack-objects: support streaming blobs to disk Ævar Arnfjörð Bjarmason
2022-06-04 10:10                     ` [PATCH v13 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Ævar Arnfjörð Bjarmason
2022-06-06 18:35                       ` Junio C Hamano
2022-06-09  4:10                         ` Han Xin
2022-06-09 18:27                           ` Junio C Hamano
2022-06-10  1:50                             ` Han Xin
2022-06-10  2:05                               ` Ævar Arnfjörð Bjarmason
2022-06-10 12:04                                 ` Han Xin
2022-06-04 10:10                     ` [PATCH v13 2/7] object-file.c: do fsync() and close() before post-write die() Ævar Arnfjörð Bjarmason
2022-06-06 18:45                       ` Junio C Hamano
2022-06-04 10:10                     ` [PATCH v13 3/7] object-file.c: refactor write_loose_object() to several steps Ævar Arnfjörð Bjarmason
2022-06-04 10:10                     ` [PATCH v13 4/7] object-file.c: factor out deflate part of write_loose_object() Ævar Arnfjörð Bjarmason
2022-06-04 10:10                     ` [PATCH v13 5/7] object-file.c: add "stream_loose_object()" to handle large object Ævar Arnfjörð Bjarmason
2022-06-06 19:44                       ` Junio C Hamano
2022-06-06 20:02                         ` Junio C Hamano
2022-06-09  6:04                           ` Han Xin
2022-06-09  6:14                         ` Han Xin
2022-06-07 19:53                       ` Neeraj Singh
2022-06-08 15:34                         ` Junio C Hamano
2022-06-09  3:05                         ` [RFC PATCH] object-file.c: batched disk flushes for stream_loose_object() Han Xin
2022-06-09  7:35                           ` Neeraj Singh
2022-06-09  9:30                           ` Johannes Schindelin
2022-06-10 12:55                             ` Han Xin
2022-06-04 10:10                     ` [PATCH v13 6/7] core doc: modernize core.bigFileThreshold documentation Ævar Arnfjörð Bjarmason
2022-06-06 19:50                       ` Junio C Hamano
2022-06-04 10:10                     ` [PATCH v13 7/7] unpack-objects: use stream_loose_object() to unpack large objects Ævar Arnfjörð Bjarmason
2022-06-10 14:46                     ` [PATCH v14 0/7] unpack-objects: support streaming blobs to disk Han Xin
2022-06-10 14:46                       ` [PATCH v14 1/7] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
2022-06-10 14:46                       ` [PATCH v14 2/7] object-file.c: do fsync() and close() before post-write die() Han Xin
2022-06-10 21:10                         ` René Scharfe
2022-06-10 21:33                           ` Junio C Hamano
2022-06-11  1:50                             ` Han Xin
2022-06-10 14:46                       ` [PATCH v14 3/7] object-file.c: refactor write_loose_object() to several steps Han Xin
2022-06-10 14:46                       ` [PATCH v14 4/7] object-file.c: factor out deflate part of write_loose_object() Han Xin
2022-06-10 14:46                       ` [PATCH v14 5/7] object-file.c: add "stream_loose_object()" to handle large object Han Xin
2022-06-10 14:46                       ` [PATCH v14 6/7] core doc: modernize core.bigFileThreshold documentation Han Xin
2022-06-10 21:01                         ` Junio C Hamano
2022-06-10 14:46                       ` [PATCH v14 7/7] unpack-objects: use stream_loose_object() to unpack large objects Han Xin
2022-06-11  2:44                       ` [PATCH v15 0/6] unpack-objects: support streaming blobs to disk Han Xin
2022-06-11  2:44                         ` [PATCH v15 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
2022-06-11  2:44                         ` [PATCH v15 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
2022-06-11  2:44                         ` [PATCH v15 3/6] object-file.c: factor out deflate part of write_loose_object() Han Xin
2022-06-11  2:44                         ` [PATCH v15 4/6] object-file.c: add "stream_loose_object()" to handle large object Han Xin
2022-06-11  2:44                         ` [PATCH v15 5/6] core doc: modernize core.bigFileThreshold documentation Han Xin
2022-06-11  2:44                         ` [PATCH v15 6/6] unpack-objects: use stream_loose_object() to unpack large objects Han Xin
2022-07-01  2:01                           ` Junio C Hamano
2022-05-20  3:05                 ` [PATCH 0/1] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
2022-05-20  3:05                   ` [PATCH 1/1] " Han Xin
2022-01-20 11:21           ` [PATCH v9 1/5] " Han Xin
2022-01-20 11:21           ` [PATCH v9 2/5] object-file.c: refactor write_loose_object() to several steps Han Xin
2022-01-20 11:21           ` [PATCH v9 3/5] object-file.c: add "stream_loose_object()" to handle large object Han Xin
2022-01-20 11:21           ` [PATCH v9 4/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2022-01-20 11:21           ` [PATCH v9 5/5] object-file API: add a format_object_header() function Han Xin
2022-01-08  8:54         ` [PATCH v8 1/6] unpack-objects: low memory footprint for get_data() in dry_run mode Han Xin
2022-01-08 12:28           ` René Scharfe
2022-01-11 10:41             ` Han Xin
2022-01-08  8:54         ` [PATCH v8 2/6] object-file.c: refactor write_loose_object() to several steps Han Xin
2022-01-08 12:28           ` René Scharfe
2022-01-11 10:33             ` Han Xin
2022-01-08  8:54         ` [PATCH v8 3/6] object-file.c: remove the slash for directory_size() Han Xin
2022-01-08 17:24           ` René Scharfe
2022-01-11 10:14             ` Han Xin
2022-01-08  8:54         ` [PATCH v8 4/6] object-file.c: add "stream_loose_object()" to handle large object Han Xin
2022-01-08  8:54         ` [PATCH v8 5/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2022-01-08  8:54         ` [PATCH v8 6/6] object-file API: add a format_object_header() function Han Xin
2021-12-17 11:26       ` [PATCH v6 1/6] object-file.c: release strbuf in write_loose_object() Han Xin
2021-12-17 19:28         ` René Scharfe
2021-12-18  0:09           ` Junio C Hamano
2021-12-17 11:26       ` [PATCH v6 2/6] object-file.c: refactor object header generation into a function Han Xin
2021-12-20 12:10         ` [RFC PATCH] object-file API: add a format_loose_header() function Ævar Arnfjörð Bjarmason
2021-12-20 12:48           ` Philip Oakley
2021-12-20 22:25           ` Junio C Hamano
2021-12-21  1:42             ` Ævar Arnfjörð Bjarmason
2021-12-21  2:11               ` Junio C Hamano
2021-12-21  2:27                 ` Ævar Arnfjörð Bjarmason
2021-12-21 11:43           ` Han Xin
2021-12-17 11:26       ` [PATCH v6 3/6] object-file.c: refactor write_loose_object() to reuse in stream version Han Xin
2021-12-17 11:26       ` [PATCH v6 4/6] object-file.c: make "write_object_file_flags()" to support read in stream Han Xin
2021-12-17 22:52         ` René Scharfe
2021-12-17 11:26       ` [PATCH v6 5/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
2021-12-17 21:22         ` René Scharfe
2021-12-17 11:26       ` [PATCH v6 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-12-10 10:34     ` [PATCH v5 1/6] object-file: refactor write_loose_object() to support read from stream Han Xin
2021-12-10 10:34     ` [PATCH v5 2/6] object-file.c: handle undetermined oid in write_loose_object() Han Xin
2021-12-13  7:32       ` Ævar Arnfjörð Bjarmason
2021-12-10 10:34     ` [PATCH v5 3/6] object-file.c: read stream in a loop " Han Xin
2021-12-10 10:34     ` [PATCH v5 4/6] unpack-objects.c: add dry_run mode for get_data() Han Xin
2021-12-10 10:34     ` [PATCH v5 5/6] object-file.c: make "write_object_file_flags()" to support "HASH_STREAM" Han Xin
2021-12-10 10:34     ` [PATCH v5 6/6] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-12-13  8:05       ` Ævar Arnfjörð Bjarmason
2021-12-03  9:35   ` [PATCH v4 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
2021-12-03 13:28     ` Ævar Arnfjörð Bjarmason
2021-12-06  2:07       ` Han Xin
2021-12-03  9:35   ` [PATCH v4 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
2021-12-03 13:21     ` Ævar Arnfjörð Bjarmason
2021-12-06  2:51       ` Han Xin
2021-12-03 13:41     ` Ævar Arnfjörð Bjarmason
2021-12-06  3:12       ` Han Xin
2021-12-03  9:35   ` [PATCH v4 3/5] object-file.c: read stream in a loop " Han Xin
2021-12-03  9:35   ` [PATCH v4 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
2021-12-03 13:59     ` Ævar Arnfjörð Bjarmason
2021-12-06  3:20       ` Han Xin
2021-12-03  9:35   ` [PATCH v4 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-12-03 13:07     ` Ævar Arnfjörð Bjarmason
2021-12-07  6:42       ` Han Xin
2021-12-03 13:54     ` Ævar Arnfjörð Bjarmason
2021-12-07  6:17       ` Han Xin
2021-12-03 14:05     ` Ævar Arnfjörð Bjarmason
2021-12-07  6:48       ` Han Xin
2021-11-22  3:32 ` [PATCH v3 1/5] object-file: refactor write_loose_object() to read buffer from stream Han Xin
2021-11-23 23:24   ` Junio C Hamano
2021-11-24  9:00     ` Han Xin
2021-11-22  3:32 ` [PATCH v3 2/5] object-file.c: handle undetermined oid in write_loose_object() Han Xin
2021-11-29 15:10   ` Derrick Stolee
2021-11-29 20:44     ` Junio C Hamano
2021-11-29 22:18       ` Derrick Stolee
2021-11-30  3:23         ` Han Xin
2021-11-22  3:32 ` [PATCH v3 3/5] object-file.c: read stream in a loop " Han Xin
2021-11-22  3:32 ` [PATCH v3 4/5] unpack-objects.c: add dry_run mode for get_data() Han Xin
2021-11-22  3:32 ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-11-29 17:37   ` Derrick Stolee
2021-11-30 13:49     ` Han Xin
2021-11-30 18:38       ` Derrick Stolee
2021-12-01 20:37         ` "git hyperfine" (was: [PATCH v3 5/5] unpack-objects[...]) Ævar Arnfjörð Bjarmason
2021-12-02  7:33         ` [PATCH v3 5/5] unpack-objects: unpack_non_delta_entry() read data in a stream Han Xin
2021-12-02 13:53           ` Derrick Stolee
