git@vger.kernel.org mailing list mirror (one of many)
* [PATCH WIP 0/4] Special code path for large blobs
@ 2009-05-28  5:29 Nguyễn Thái Ngọc Duy
  2009-05-28  5:29 ` [PATCH WIP 1/4] convert.c: refactor in order to skip conversion early without looking into file content Nguyễn Thái Ngọc Duy
  2009-05-28 18:03 ` [PATCH WIP 0/4] Special code path for large blobs Nicolas Pitre
  0 siblings, 2 replies; 9+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2009-05-28  5:29 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

Thread "Problem with large files on different OSes" reminds me this.
This series is in my repository for quite some time. It addresses
adding/checking out large blobs as long as:

 - no conversion will be done
 - blobs are loose (in the checkout case)

Together with a patch that prevents large blobs from being packed
(something like the one Dana How sent long ago), and a modification of
the "lazy clone/remote alternatives" patch to avoid packing large
blobs again when sending them over the network, I think it should make
git usable with large files.

Just something to play with.

Nguyễn Thái Ngọc Duy (4):
  convert.c: refactor in order to skip conversion early without looking
    into file content
  sha1_file.c: add streaming interface for reading blobs
  write_entry: use streaming interface for checkout large files
  index_fd: support indexing large files

 cache.h     |   10 +++
 convert.c   |   86 ++++++++++++++++++++---
 entry.c     |   68 +++++++++++++++++
 sha1_file.c |  233 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 388 insertions(+), 9 deletions(-)


* [PATCH WIP 1/4] convert.c: refactor in order to skip conversion early without looking into file content
  2009-05-28  5:29 [PATCH WIP 0/4] Special code path for large blobs Nguyễn Thái Ngọc Duy
@ 2009-05-28  5:29 ` Nguyễn Thái Ngọc Duy
  2009-05-28  5:29   ` [PATCH WIP 2/4] sha1_file.c: add streaming interface for reading blobs Nguyễn Thái Ngọc Duy
  2009-05-28 18:03 ` [PATCH WIP 0/4] Special code path for large blobs Nicolas Pitre
  1 sibling, 1 reply; 9+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2009-05-28  5:29 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

convert_to_{git,working_tree}() require the entire blob content to be
in memory. That is impossible for large files (especially files that
cannot be mapped into memory at all), and such files are unlikely to
need conversion anyway.

This patch factors out the condition checks that do not require file
content, so that large-file routines can check early whether conversion
can be skipped. If it cannot, they follow the common code path.
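
For illustration, a large-file code path would do the early check like
this (mirroring what index_fd() does in patch 4/4; streaming itself is
outside the scope of this patch):

	if (S_ISREG(st->st_mode) && path &&
	    !convert_to_git_needed(path, size)) {
		/* no conversion needed: the content can be streamed */
		...
	}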

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 cache.h   |    2 +
 convert.c |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/cache.h b/cache.h
index b8503ad..f3fc822 100644
--- a/cache.h
+++ b/cache.h
@@ -933,6 +933,8 @@ extern void trace_argv_printf(const char **argv, const char *format, ...);
 extern int convert_to_git(const char *path, const char *src, size_t len,
                           struct strbuf *dst, enum safe_crlf checksafe);
 extern int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst);
+extern int convert_to_git_needed(const char *path, size_t len);
+extern int convert_to_working_tree_needed(const char *path, size_t len);
 
 /* add */
 /*
diff --git a/convert.c b/convert.c
index 1816e97..809c3e8 100644
--- a/convert.c
+++ b/convert.c
@@ -120,13 +120,18 @@ static void check_safe_crlf(const char *path, int action,
 	}
 }
 
+static int crlf_to_git_noneed(const char *path, size_t len, int action)
+{
+	return (action == CRLF_BINARY) || !auto_crlf || !len;
+}
+
 static int crlf_to_git(const char *path, const char *src, size_t len,
                        struct strbuf *buf, int action, enum safe_crlf checksafe)
 {
 	struct text_stat stats;
 	char *dst;
 
-	if ((action == CRLF_BINARY) || !auto_crlf || !len)
+	if (crlf_to_git_noneed(path, len, action))
 		return 0;
 
 	gather_stats(src, len, &stats);
@@ -179,17 +184,19 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
 	return 1;
 }
 
+static int crlf_to_worktree_noneed(const char *path, size_t len, int action)
+{
+	return (action == CRLF_BINARY) || (action == CRLF_INPUT) ||
+		auto_crlf <= 0 || !len;
+}
+
 static int crlf_to_worktree(const char *path, const char *src, size_t len,
                             struct strbuf *buf, int action)
 {
 	char *to_free = NULL;
 	struct text_stat stats;
 
-	if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
-	    auto_crlf <= 0)
-		return 0;
-
-	if (!len)
+	if (crlf_to_worktree_noneed(path, len, action))
 		return 0;
 
 	gather_stats(src, len, &stats);
@@ -271,6 +278,11 @@ static int filter_buffer(int fd, void *data)
 	return (write_err || status);
 }
 
+static int apply_filter_noneed(const char *path, const char *cmd)
+{
+	return cmd == NULL;
+}
+
 static int apply_filter(const char *path, const char *src, size_t len,
                         struct strbuf *dst, const char *cmd)
 {
@@ -285,7 +297,7 @@ static int apply_filter(const char *path, const char *src, size_t len,
 	struct async async;
 	struct filter_params params;
 
-	if (!cmd)
+	if (apply_filter_noneed(path, cmd))
 		return 0;
 
 	memset(&async, 0, sizeof(async));
@@ -428,12 +440,20 @@ static int count_ident(const char *cp, unsigned long size)
 	return cnt;
 }
 
+static int ident_conversion_noneed(const char *path, int ident)
+{
+	return !ident;
+}
+
 static int ident_to_git(const char *path, const char *src, size_t len,
                         struct strbuf *buf, int ident)
 {
 	char *dst, *dollar;
 
-	if (!ident || !count_ident(src, len))
+	if (ident_conversion_noneed(path, ident))
+		return 0;
+
+	if (!count_ident(src, len))
 		return 0;
 
 	/* only grow if not in place */
@@ -471,7 +491,7 @@ static int ident_to_worktree(const char *path, const char *src, size_t len,
 	char *to_free = NULL, *dollar;
 	int cnt;
 
-	if (!ident)
+	if (ident_conversion_noneed(path, ident))
 		return 0;
 
 	cnt = count_ident(src, len);
@@ -597,6 +617,28 @@ int convert_to_git(const char *path, const char *src, size_t len,
 	return ret | ident_to_git(path, src, len, dst, ident);
 }
 
+int convert_to_git_needed(const char *path, size_t len)
+{
+	struct git_attr_check check[3];
+	int crlf = CRLF_GUESS;
+	int ident = 0;
+	const char *filter = NULL;
+
+	setup_convert_check(check);
+	if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
+		struct convert_driver *drv;
+		crlf = git_path_check_crlf(path, check + 0);
+		ident = git_path_check_ident(path, check + 1);
+		drv = git_path_check_convert(path, check + 2);
+		if (drv && drv->clean)
+			filter = drv->clean;
+	}
+
+	return !apply_filter_noneed(path, filter) ||
+		!crlf_to_git_noneed(path, len, crlf) ||
+		!ident_conversion_noneed(path, ident);
+}
+
 int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst)
 {
 	struct git_attr_check check[3];
@@ -626,3 +668,29 @@ int convert_to_working_tree(const char *path, const char *src, size_t len, struc
 	}
 	return ret | apply_filter(path, src, len, dst, filter);
 }
+
+int convert_to_working_tree_needed(const char *path, size_t len)
+{
+	struct git_attr_check check[3];
+	int crlf = CRLF_GUESS;
+	int ident = 0;
+	const char *filter = NULL;
+
+	/*
+	 * any additional conversion should be added to
+	 * convert_to_working_tree_needed() as well
+	 */
+	setup_convert_check(check);
+	if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
+		struct convert_driver *drv;
+		crlf = git_path_check_crlf(path, check + 0);
+		ident = git_path_check_ident(path, check + 1);
+		drv = git_path_check_convert(path, check + 2);
+		if (drv && drv->smudge)
+			filter = drv->smudge;
+	}
+
+	return !ident_conversion_noneed(path, ident) ||
+		!crlf_to_worktree_noneed(path, len, crlf) ||
+		!apply_filter_noneed(path, filter);
+}
-- 
1.6.3.1.257.gbd13


* [PATCH WIP 2/4] sha1_file.c: add streaming interface for reading blobs
  2009-05-28  5:29 ` [PATCH WIP 1/4] convert.c: refactor in order to skip conversion early without looking into file content Nguyễn Thái Ngọc Duy
@ 2009-05-28  5:29   ` Nguyễn Thái Ngọc Duy
  2009-05-28  5:29     ` [PATCH WIP 3/4] write_entry: use streaming interface for checkout large files Nguyễn Thái Ngọc Duy
  0 siblings, 1 reply; 9+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2009-05-28  5:29 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

The reason is quite obvious: large files should not be read entirely
into memory (and in some cases cannot be).

This patch deals with loose blobs only, for two reasons:

 1. large blobs are less likely to be put in packs (*)
 2. a streaming interface for blobs in packs is more complicated, and
    thus more troublesome

(*) With regard to the first point, the assumption is that large blobs
stay out of packs; otherwise you cannot make use of this interface.
This does not hold yet, but it was discussed and worked on in the past.
Hopefully a patch series that makes the assumption true will come soon.
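
A minimal usage sketch (this is the pattern patch 3/4 uses for
checkout; fd is assumed to be an already-open destination file):

	struct loose_object_handle *oh = open_loose_object(sha1);
	char buf[8192];
	long len;

	if (!oh)
		return -1;	/* e.g. the blob is in a pack: fall back */
	while ((len = read_loose_object(oh, buf, sizeof(buf))) > 0)
		write_in_full(fd, buf, len);	/* one chunk at a time */
	close_loose_object(oh);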

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 cache.h     |    8 ++++
 sha1_file.c |  113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 0 deletions(-)

diff --git a/cache.h b/cache.h
index f3fc822..f6f70ce 100644
--- a/cache.h
+++ b/cache.h
@@ -655,6 +655,14 @@ extern int write_sha1_file(void *buf, unsigned long len, const char *type, unsig
 extern int pretend_sha1_file(void *, unsigned long, enum object_type, unsigned char *);
 extern int force_object_loose(const unsigned char *sha1, time_t mtime);
 
+struct loose_object_handle;
+struct loose_object_handle *open_loose_object(const unsigned char *sha1);
+int read_loose_object(struct loose_object_handle *oh, void *buf, unsigned long len);
+int close_loose_object(struct loose_object_handle *oh);
+const unsigned char *loose_object_sha1(struct loose_object_handle *oh);
+unsigned long loose_object_size(struct loose_object_handle *oh);
+enum object_type loose_object_type(struct loose_object_handle *oh);
+
 /* global flag to enable extra checks when accessing packed objects */
 extern int do_check_packed_object_crc;
 
diff --git a/sha1_file.c b/sha1_file.c
index e73cd4f..2ed06a2 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1340,6 +1340,119 @@ static void *unpack_sha1_file(void *map, unsigned long mapsize, enum object_type
 	return unpack_sha1_rest(&stream, hdr, *size, sha1);
 }
 
+
+struct loose_object_handle {
+	unsigned char sha1[20];
+	enum object_type type;
+	unsigned long size;
+
+	int fd;
+	z_stream stream;
+	char *bufin, *bufout;
+	int bufin_length, bufout_length;
+	unsigned long start, end;
+};
+
+enum object_type loose_object_type(struct loose_object_handle *oh)
+{
+	return oh->type;
+}
+
+unsigned long loose_object_size(struct loose_object_handle *oh)
+{
+	return oh->size;
+}
+
+const unsigned char *loose_object_sha1(struct loose_object_handle *oh)
+{
+	return oh->sha1;
+}
+
+struct loose_object_handle *open_loose_object(const unsigned char *sha1)
+{
+	int ret, len;
+	struct loose_object_handle *oh = xmalloc(sizeof(*oh));
+
+	oh->fd = open_sha1_file(sha1);
+	if (oh->fd == -1) {
+		free(oh);
+		return NULL;
+	}
+
+	oh->bufin_length = 8192;
+	oh->bufin = xmalloc(oh->bufin_length);
+	len = xread(oh->fd, oh->bufin, oh->bufin_length);
+	if (len == -1) {
+		close(oh->fd);
+		free(oh->bufin);
+		free(oh);
+		return NULL;
+	}
+
+	oh->bufout_length = 8192;
+	oh->bufout = xmalloc(oh->bufout_length);
+
+	/*
+	 * Initialize the z_stream in its final location; zlib keeps
+	 * internal pointers, so the stream must not be moved afterwards.
+	 */
+	ret = unpack_sha1_header(&oh->stream, (unsigned char *)oh->bufin, len, oh->bufout, oh->bufout_length);
+	if (ret < Z_OK || (oh->type = parse_sha1_header(oh->bufout, &oh->size)) < 0) {
+		close(oh->fd);
+		free(oh->bufin);
+		free(oh->bufout);
+		free(oh);
+		return NULL;
+	}
+
+	hashcpy(oh->sha1, sha1);
+	oh->start = strlen(oh->bufout) + 1;
+	oh->end = oh->stream.total_out;
+	return oh;
+}
+
+int read_loose_object(struct loose_object_handle *oh, void *buf, unsigned long buflen)
+{
+	if (oh->end == oh->start) {
+		int status;
+
+		oh->start = 0;
+		oh->stream.next_out = (unsigned char*)oh->bufout;
+		oh->stream.avail_out = oh->bufout_length;
+		status = inflate(&oh->stream, Z_NO_FLUSH);
+		oh->end = oh->stream.next_out - (unsigned char*)oh->bufout;
+
+		if (oh->stream.avail_in == 0) {
+			oh->stream.avail_in = xread(oh->fd, oh->bufin, oh->bufin_length);
+			oh->stream.next_in = (unsigned char *)oh->bufin;
+		}
+
+		/* trying to get Z_STREAM_END */
+		if (oh->stream.total_out == oh->size && status == Z_OK) {
+			status = inflate(&oh->stream, Z_NO_FLUSH);
+
+			if (status < 0)
+				error("corrupt loose object '%s'", sha1_to_hex(oh->sha1));
+			else if (oh->stream.avail_in)
+				error("garbage at end of loose object '%s'",
+				      sha1_to_hex(oh->sha1));
+		}
+	}
+
+	if (oh->end > oh->start) {
+		unsigned long len = oh->end - oh->start;
+		if (len > buflen)
+			len = buflen;	/* do not overrun the caller's buffer */
+		memcpy(buf, (char *) oh->bufout + oh->start, len);
+		oh->start += len;
+		return len;
+	}
+
+	/* nothing could be inflated: corrupt or truncated object */
+	return -1;
+}
+
+int close_loose_object(struct loose_object_handle *oh)
+{
+	close(oh->fd);
+	free(oh->bufin);
+	free(oh->bufout);
+	inflateEnd(&oh->stream);
+	free(oh);
+	return 0;
+}
+
 unsigned long get_size_from_delta(struct packed_git *p,
 				  struct pack_window **w_curs,
 			          off_t curpos)
-- 
1.6.3.1.257.gbd13


* [PATCH WIP 3/4] write_entry: use streaming interface for checkout large files
  2009-05-28  5:29   ` [PATCH WIP 2/4] sha1_file.c: add streaming interface for reading blobs Nguyễn Thái Ngọc Duy
@ 2009-05-28  5:29     ` Nguyễn Thái Ngọc Duy
  2009-05-28  5:29       ` [PATCH WIP 4/4] index_fd: support indexing " Nguyễn Thái Ngọc Duy
  0 siblings, 1 reply; 9+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2009-05-28  5:29 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

With this patch, Git's memory consumption should stay pretty flat no
matter how large the input files are. So:

 - less memory will be used
 - even less memory on systems that do not have proper mmap() support
 - unmappable files can now be checked out

TODO: decide on the buffer size and on the file size limit that
triggers this code path
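
For the size limit, one possible shape (a hypothetical sketch only; the
variable name and value are made up, and this patch currently streams
every regular file):

	static unsigned long large_file_limit = 32 * 1024 * 1024;

	/* in write_large_entry(), once open_loose_object() succeeded */
	if (loose_object_size(oh) < large_file_limit) {
		close_loose_object(oh);
		return 1;	/* small enough: use the normal interface */
	}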

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 entry.c |   68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 68 insertions(+), 0 deletions(-)

diff --git a/entry.c b/entry.c
index cc841ed..2a49d7b 100644
--- a/entry.c
+++ b/entry.c
@@ -91,6 +91,65 @@ static void *read_blob_entry(struct cache_entry *ce, unsigned long *size)
 	return NULL;
 }
 
+/*
+ * Try to write the entry using the blob streaming interface.
+ * Return 1 if the normal interface should be used instead.
+ */
+static int write_large_entry(struct cache_entry *ce, char *path,
+			     const struct checkout *state, int to_tempfile)
+{
+	unsigned int ce_mode_s_ifmt = ce->ce_mode & S_IFMT;
+	struct loose_object_handle *oh = open_loose_object(ce->sha1);
+	char buf[8192];
+	long len;
+	int fd;
+	size_t wrote;
+
+	if (!oh)
+		return 1;
+
+	if (loose_object_type(oh) != OBJ_BLOB) {
+		close_loose_object(oh);
+		return error("git checkout-index: unable to read sha1 file of %s (%s)",
+			     path, sha1_to_hex(ce->sha1));
+	}
+
+	if (convert_to_working_tree_needed(ce->name, xsize_t(loose_object_size(oh)))) {
+		close_loose_object(oh);
+		return 1;
+	}
+
+	if (to_tempfile) {
+		if (ce_mode_s_ifmt == S_IFREG)
+			strcpy(path, ".merge_file_XXXXXX");
+		else
+			strcpy(path, ".merge_link_XXXXXX");
+		fd = mkstemp(path);
+	} else if (ce_mode_s_ifmt == S_IFREG) {
+		fd = create_file(path, ce->ce_mode);
+	} else {
+		fd = create_file(path, 0666);
+	}
+	if (fd < 0) {
+		close_loose_object(oh);
+		return error("git checkout-index: unable to create file %s (%s)",
+			     path, strerror(errno));
+	}
+
+	while ((len = read_loose_object(oh, buf, sizeof(buf))) > 0) {
+		wrote = write_in_full(fd, buf, len);
+		if (wrote != len) {
+			close(fd);
+			close_loose_object(oh);
+			return error("git checkout-index: unable to write file %s", path);
+		}
+	}
+
+	close(fd);
+	close_loose_object(oh);
+	return 0;
+}
+
 static int write_entry(struct cache_entry *ce, char *path, const struct checkout *state, int to_tempfile)
 {
 	unsigned int ce_mode_s_ifmt = ce->ce_mode & S_IFMT;
@@ -104,6 +163,15 @@ static int write_entry(struct cache_entry *ce, char *path, const struct checkout
 	switch (ce_mode_s_ifmt) {
 	case S_IFREG:
 	case S_IFLNK:
+		if (ce_mode_s_ifmt == S_IFREG) {
+			ret = write_large_entry(ce, path, state, to_tempfile);
+			if (ret < 0) /* failed */
+				return ret;
+			if (ret == 0) /* successful */
+				break;
+			/* otherwise fall through to the normal code path */
+		}
+
 		new = read_blob_entry(ce, &size);
 		if (!new)
 			return error("git checkout-index: unable to read sha1 file of %s (%s)",
-- 
1.6.3.1.257.gbd13


* [PATCH WIP 4/4] index_fd: support indexing large files
  2009-05-28  5:29     ` [PATCH WIP 3/4] write_entry: use streaming interface for checkout large files Nguyễn Thái Ngọc Duy
@ 2009-05-28  5:29       ` Nguyễn Thái Ngọc Duy
  0 siblings, 0 replies; 9+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2009-05-28  5:29 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

This patch is less impressive than the previous one, as memory usage
is usually lower on this path. But systems without proper mmap()
support will still love it.

TODO: again, the file size limit

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 sha1_file.c |  120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 120 insertions(+), 0 deletions(-)

diff --git a/sha1_file.c b/sha1_file.c
index 2ed06a2..f4f90ab 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -2609,12 +2609,132 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
 	return ret;
 }
 
+static int index_and_write_fd(unsigned char *sha1, int fd, struct stat *st,
+			      enum object_type type, const char *path)
+{
+	int fdo, ret;
+	ssize_t readlen;
+	z_stream stream;
+	char *filename;
+	static char tmpfile[PATH_MAX];
+	int hdrlen;
+	int bufin_length = 8192, bufout_length = 8192;
+	char bufin[8192], bufout[8192];
+	int written_out = 0;
+	git_SHA_CTX c;
+
+	fdo = create_tmpfile(tmpfile, sizeof(tmpfile), "foo");
+	if (fdo < 0) {
+		if (errno == EACCES)
+			return error("insufficient permission for adding an object to repository database %s\n", get_object_directory());
+		else
+			return error("unable to create temporary sha1 filename %s: %s\n", tmpfile, strerror(errno));
+	}
+
+	hdrlen = sprintf(bufin, "%s %lu", typename(type), (unsigned long)st->st_size)+1;
+	git_SHA1_Init(&c);
+	git_SHA1_Update(&c, bufin, hdrlen);
+
+	/* Set it up */
+	memset(&stream, 0, sizeof(stream));
+	deflateInit(&stream, zlib_compression_level);
+	stream.next_out = (unsigned char *)bufout;
+	stream.avail_out = bufout_length;
+
+	/* First header.. */
+	stream.next_in = (unsigned char *)bufin;
+	stream.avail_in = hdrlen;
+	while (deflate(&stream, 0) == Z_OK)
+		/* nothing */;
+
+	written_out = stream.total_out;
+	write_or_die(fdo, bufout, written_out);
+	stream.next_out = (unsigned char *)bufout;
+	stream.avail_out = bufout_length;
+
+	/* Then the data itself.. */
+	readlen = xread(fd, bufin, bufin_length);
+	if (readlen < 0)
+		die("read error while indexing %s: %s", path, strerror(errno));
+	stream.next_in = (unsigned char *)bufin;
+	stream.avail_in = readlen;
+	git_SHA1_Update(&c, stream.next_in, stream.avail_in);
+
+	while ((ret = deflate(&stream, Z_NO_FLUSH)) == Z_OK || ret == Z_BUF_ERROR) {
+		if (stream.total_out > written_out) {
+			write_or_die(fdo, bufout, stream.total_out - written_out);
+			written_out = stream.total_out;
+			stream.next_out = (unsigned char *)bufout;
+			stream.avail_out = bufout_length;
+		}
+		if (stream.avail_in == 0) {
+			readlen = xread(fd, bufin, bufin_length);
+			if (readlen < 0)
+				die("read error while indexing %s: %s", path, strerror(errno));
+			stream.next_in = (unsigned char *)bufin;
+			stream.avail_in = readlen;
+			if (!stream.avail_in)
+				break;
+			git_SHA1_Update(&c, stream.next_in, stream.avail_in);
+		}
+	}
+
+	/* Done computing SHA-1 */
+	git_SHA1_Final(sha1, &c);
+
+	/* Make sure everything is flushed out */
+	while ((ret = deflate(&stream, Z_FINISH)) == Z_OK) {
+		write_or_die(fdo, bufout, stream.total_out - written_out);
+		written_out = stream.total_out;
+		stream.next_out = (unsigned char *)bufout;
+		stream.avail_out = bufout_length;
+	}
+
+	if (ret != Z_STREAM_END)
+		die("unable to deflate new object %s (%d)", sha1_to_hex(sha1), ret);
+
+	/* Last piece */
+	if (written_out < stream.total_out)
+		write_or_die(fdo, bufout, stream.total_out - written_out);
+	close_sha1_file(fdo);
+
+	ret = deflateEnd(&stream);
+	if (ret != Z_OK)
+		die("deflateEnd on object %s failed (%d)", sha1_to_hex(sha1), ret);
+
+	/* Now generate proper path from SHA-1 */
+	filename = sha1_file_name(sha1);
+	safe_create_leading_directories_const(filename);
+	return move_temp_to_file(tmpfile, filename);
+}
+
+static int hash_fd(unsigned char *sha1, int fd, struct stat *st,
+		   enum object_type type, const char *path)
+{
+	git_SHA_CTX c;
+	char buf[8192];
+	int buflen;
+
+	/* Generate the header */
+	buflen = sprintf(buf, "%s %lu", typename(type), (unsigned long)st->st_size)+1;
+
+	/* Sha1.. */
+	git_SHA1_Init(&c);
+	git_SHA1_Update(&c, buf, buflen);
+	while ((buflen = xread(fd, buf, 8192)) > 0)
+		git_SHA1_Update(&c, buf, buflen);
+	if (buflen < 0)
+		return error("read error while hashing %s: %s", path, strerror(errno));
+	git_SHA1_Final(sha1, &c);
+	return 0;
+}
+
 int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object,
 	     enum object_type type, const char *path)
 {
 	int ret;
 	size_t size = xsize_t(st->st_size);
 
+	if (S_ISREG(st->st_mode) && path && !convert_to_git_needed(path, size)) {
+		if (write_object)
+			ret = index_and_write_fd(sha1, fd, st, type, path);
+		else
+			ret = hash_fd(sha1, fd, st, type, path);
+		close(fd);
+		return ret;
+	}
+
 	if (!S_ISREG(st->st_mode)) {
 		struct strbuf sbuf = STRBUF_INIT;
 		if (strbuf_read(&sbuf, fd, 4096) >= 0)
-- 
1.6.3.1.257.gbd13


* Re: [PATCH WIP 0/4] Special code path for large blobs
  2009-05-28  5:29 [PATCH WIP 0/4] Special code path for large blobs Nguyễn Thái Ngọc Duy
  2009-05-28  5:29 ` [PATCH WIP 1/4] convert.c: refactor in order to skip conversion early without looking into file content Nguyễn Thái Ngọc Duy
@ 2009-05-28 18:03 ` Nicolas Pitre
  2009-06-02  4:46   ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 9+ messages in thread
From: Nicolas Pitre @ 2009-05-28 18:03 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git


On Thu, 28 May 2009, Nguyễn Thái Ngọc Duy wrote:

> Thread "Problem with large files on different OSes" reminds me this.
> This series is in my repository for quite some time. It addresses
> adding/checking out large blobs as long as:
> 
>  - no conversion will be done
>  - blobs are loose (in checkout case)
> 
> Together with a patch that prevents large blobs from being packed
> (something like Dana How sent long ago), and a modification of "lazy
> clone/remote alternatives" patch to avoid packing large blobs again
> for sending over network, I think it should make git possible for
> large files.
> 
> Just something to play.

I think this is a good start.

However, like I said previously, I'd encapsulate large blobs in a pack 
right away instead of storing them as loose objects.  The reason is that 
you can effortlessly repack/fetch/push them afterwards by simply 
triggering the pack data reuse code path for them.  Extracting large and 
undeltified blobs from a pack is just as easy as from a loose object.

To accomplish that, you only need to copy write_pack_file() from 
builtin-pack-objects.c and strip it to the bone with only one object to 
write.
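
A rough sketch of what that stripped-down version could look like
(illustrative only: it borrows the header encoding from
encode_header() in builtin-pack-objects.c and the csum-file.h helpers,
takes already-deflated data, and leaves out the .idx file a real
implementation would also have to write):

	static void write_single_object_pack(int fd, enum object_type type,
					     void *deflated, unsigned long deflated_len,
					     unsigned long size)
	{
		struct sha1file *f = sha1fd(fd, "pack");
		struct pack_header hdr;
		unsigned char obj_hdr[10], pack_sha1[20];
		int n = 0, c;

		hdr.hdr_signature = htonl(PACK_SIGNATURE);
		hdr.hdr_version = htonl(PACK_VERSION);
		hdr.hdr_entries = htonl(1);
		sha1write(f, &hdr, sizeof(hdr));

		/* object type and uncompressed size, varint-encoded */
		c = (type << 4) | (size & 15);
		size >>= 4;
		while (size) {
			obj_hdr[n++] = c | 0x80;
			c = size & 0x7f;
			size >>= 7;
		}
		obj_hdr[n++] = c;
		sha1write(f, obj_hdr, n);

		/* the single undeltified object, already deflated */
		sha1write(f, deflated, deflated_len);

		/* sha1close() appends the SHA-1 trailer over the whole pack */
		sha1close(f, pack_sha1, 1);
	}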


Nicolas


* Re: [PATCH WIP 0/4] Special code path for large blobs
  2009-05-28 18:03 ` [PATCH WIP 0/4] Special code path for large blobs Nicolas Pitre
@ 2009-06-02  4:46   ` Nguyen Thai Ngoc Duy
  2009-06-02 14:45     ` Shawn O. Pearce
  0 siblings, 1 reply; 9+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2009-06-02  4:46 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

2009/5/29 Nicolas Pitre <nico@cam.org>:
> However, like I said previously, I'd encapsulate large blobs in a pack
> right away instead of storing them as loose objects.  The reason is that
> you can effortlessly repack/fetch/push them afterwards by simply
> triggering the pack data reuse code path for them.  Extracting large and
> undeltified blobs from a pack is just as easy as from a loose object.

Makes sense. And the code looks nice too.

> To accomplish that, you only need to copy write_pack_file() from
> builtin-pack-objects.c and strip it to the bone with only one object to
> write.

write_pack_file() is too scary for me, so I ripped code from
fast-import.c instead. BTW, how does git handle hundreds of
single-object packs? I don't know if prepare_packed_git() scales in
such cases.
-- 
Duy


* Re: [PATCH WIP 0/4] Special code path for large blobs
  2009-06-02  4:46   ` Nguyen Thai Ngoc Duy
@ 2009-06-02 14:45     ` Shawn O. Pearce
  2009-06-02 17:22       ` Nicolas Pitre
  0 siblings, 1 reply; 9+ messages in thread
From: Shawn O. Pearce @ 2009-06-02 14:45 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: Nicolas Pitre, git

Nguyen Thai Ngoc Duy <pclouds@gmail.com> wrote:
> 2009/5/29 Nicolas Pitre <nico@cam.org>:
> > However, like I said previously, I'd encapsulate large blobs in a pack
> > right away instead of storing them as loose objects.  The reason is that
> > you can effortlessly repack/fetch/push them afterwards by simply
> > triggering the pack data reuse code path for them.  Extracting large and
> > undeltified blobs from a pack is just as easy as from a loose object.
> 
> Makes sense. And the code looks nice too.
> 
> > To accomplish that, you only need to copy write_pack_file() from
> > builtin-pack-objects.c and strip it to the bone with only one object to
> > write.
> 
> write_pack_file() is too scary for me, so I ripped code from
> fast-import.c instead. BTW, how does git handle hundreds of
> single-object packs? I don't know if prepare_packed_git() scales in
> such cases.

Yeah, it's not going to do that great.

We may be able to improve that code path by sorting any pack whose
index is really small and whose pack file is really big to the end of
the list, where it's least likely to be matched, so we don't even
bother to load the index into memory during normal commit traversal.

But even with that sorting, it's still going to suck.  Lookup for
a large binary is O(N), where N is the number of large binary
*revisions*.  Yuck.

Really, objects in the 200MB+ range probably should just be in a lone
file named by their SHA-1... aka, a loose object.  Combining them into
a pack is going to be potentially expensive disk-IO-wise, and may
not gain you very much (it's over 200 MB compressed with deflate; it's
likely already-compressed binary content that may not delta well).

Way back we had that pack-style loose object format, for exactly
these sorts of files, and exactly to avoid having many packs of
just 1 object, but that didn't go anywhere... indeed, Nico deleted
the code that creates that format.

-- 
Shawn.


* Re: [PATCH WIP 0/4] Special code path for large blobs
  2009-06-02 14:45     ` Shawn O. Pearce
@ 2009-06-02 17:22       ` Nicolas Pitre
  0 siblings, 0 replies; 9+ messages in thread
From: Nicolas Pitre @ 2009-06-02 17:22 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Nguyen Thai Ngoc Duy, git

On Tue, 2 Jun 2009, Shawn O. Pearce wrote:

> Nguyen Thai Ngoc Duy <pclouds@gmail.com> wrote:
> > 2009/5/29 Nicolas Pitre <nico@cam.org>:
> > > However, like I said previously, I'd encapsulate large blobs in a pack
> > > right away instead of storing them as loose objects.  The reason is that
> > > you can effortlessly repack/fetch/push them afterwards by simply
> > > triggering the pack data reuse code path for them.  Extracting large and
> > > undeltified blobs from a pack is just as easy as from a loose object.
> > 
> > Makes sense. And the code looks nice too.
> > 
> > > To accomplish that, you only need to copy write_pack_file() from
> > > builtin-pack-objects.c and strip it to the bone with only one object to
> > > write.
> > 
> > write_pack_file() is too scary for me, so I ripped code from
> > fast-import.c instead. BTW, how does git handle hundreds of
> > single-object packs? I don't know if prepare_packed_git() scales in
> > such cases.
> 
> Yeah, it's not going to do that great.

Well... remember that we're talking about huge objects here.  You're not 
likely to have so many of them.  And if you do have that many huge 
objects, then any operation you do is likely to access one of them 
anyway and then the pack lookup overhead will be lost in the noise.

> We may be able to improve that code path by sorting any pack whose
> index is really small and whose pack file is really big to the end of
> the list, where it's least likely to be matched, so we don't even
> bother to load the index into memory during normal commit traversal.
> 
> But even with that sorting, it's still going to suck.

I don't agree with that statement.  Supposing the sorting you propose
above (which I consider a pretty good idea) doesn't provide any
improvement, we already do pretty well with many packs since commit
f7c22cc6.

> Lookup for a large binary is O(N), where N is the number of large 
> binary *revisions*.  Yuck.

It is almost like loose object lookups, except that you don't have a 256 
entry fanout.  But since the IO corresponding to such objects is 
expected to be much more significant than 256 times the IO for a tree or 
commit object, then again this shouldn't matter much in practice.

> Really, objects in the 200MB+ range probably should just be in a lone
> file named by their SHA-1... aka, a loose object.  Combining them into
> a pack is going to be potentially expensive disk-IO-wise, and may
> not gain you very much (it's over 200 MB compressed with deflate; it's
> likely already-compressed binary content that may not delta well).

I must disagree again.  The above notwithstanding, the idea is _not_ to
keep those large blobs in their own packs.  The idea is mainly to make
a future repack much much easier with the current repack-objects code.
And yes, we should repack even those large blobs eventually, just like
we do for loose objects.  Really there is no point keeping even large
blobs in their own files.  If you do care about object lookup
efficiency, having large objects in their own pack or loose does not
make such a huge difference.  Sure, a factor of 256 might be considered
significant, but that is only a linear improvement, while object lookup
within a pack is a binary search with logarithmic cost.

There are ways already to enforce size policies on pack files if you
don't want to keep all your objects in a single pack, but even a single
huge pack should work just fine, and you may .keep it afterward so it
is not repacked anymore, or only during a monthly or quarterly
repository maintenance.  So in practice you shouldn't be accumulating
_that_ many single-object packs anyway.

> Way back we had that pack-style loose object format, for exactly
> these sorts of files, and exactly to avoid having many packs of
> just 1 object, but that didn't go anywhere... indeed, Nico deleted
> the code that creates that format.

And the rationale was exactly the one above: large blobs have no
reason whatsoever to _remain_ alone in a file, be it a loose object or
a single-object pack.  And just like for loose objects, you don't have
to repack large blobs right away, so the IO cost can be postponed until
it doesn't matter.


Nicolas


Thread overview: 9+ messages
2009-05-28  5:29 [PATCH WIP 0/4] Special code path for large blobs Nguyễn Thái Ngọc Duy
2009-05-28  5:29 ` [PATCH WIP 1/4] convert.c: refactor in order to skip conversion early without looking into file content Nguyễn Thái Ngọc Duy
2009-05-28  5:29   ` [PATCH WIP 2/4] sha1_file.c: add streaming interface for reading blobs Nguyễn Thái Ngọc Duy
2009-05-28  5:29     ` [PATCH WIP 3/4] write_entry: use streaming interface for checkout large files Nguyễn Thái Ngọc Duy
2009-05-28  5:29       ` [PATCH WIP 4/4] index_fd: support indexing " Nguyễn Thái Ngọc Duy
2009-05-28 18:03 ` [PATCH WIP 0/4] Special code path for large blobs Nicolas Pitre
2009-06-02  4:46   ` Nguyen Thai Ngoc Duy
2009-06-02 14:45     ` Shawn O. Pearce
2009-06-02 17:22       ` Nicolas Pitre
