git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH] fast-import: implement --min-pack-size parameter
@ 2016-04-23  2:42 Eric Wong
  2016-04-23  3:13 ` Jeff King
  0 siblings, 1 reply; 6+ messages in thread
From: Eric Wong @ 2016-04-23  2:42 UTC (permalink / raw)
  To: git; +Cc: Jonathan Nieder, Jeff King, Junio C Hamano, Shawn Pearce

With many incremental imports, small packs become highly
inefficient due to the need to readdir scan and load many
indices to locate even a single object.  Frequent repacking and
consolidation may be prohibitively expensive in terms of disk
I/O, especially in large repositories where the initial packs
were aggressively optimized and marked with .keep files.

In those cases, users may be better served by writing loose
objects and relying on "git gc --auto".

Signed-off-by: Eric Wong <normalperson@yhbt.net>
---
  There should be a matching config file directive, but I'm
  not sure how/if it should affect other commands.  So I'm
  not sure if it should be "pack.packSizeMin" or
  "fastimport.packSizeMin" or something else.

  To further reduce disk I/O, the fsync_or_die call in
  fixup_pack_header_footer could probably be moved out of that
  function and become the responsibility of its callers.

 Documentation/git-fast-import.txt   |  9 ++++++++
 fast-import.c                       | 30 ++++++++++++++++++++++++++
 t/t9302-fast-import-min-packsize.sh | 42 +++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+)
 create mode 100755 t/t9302-fast-import-min-packsize.sh

diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt
index 66910aa..8c0ac94 100644
--- a/Documentation/git-fast-import.txt
+++ b/Documentation/git-fast-import.txt
@@ -136,6 +136,15 @@ Performance and Compression Tuning
 	Maximum size of each output packfile.
 	The default is unlimited.
 
+--min-pack-size=<n>::
+	Minimum size of an output packfile; packfiles smaller
+	than this threshold are unpacked into loose objects and
+	the pack is discarded.  This is useful for small,
+	incremental imports, where writing loose objects and
+	relying on `git gc --auto` may be more efficient than
+	generating many tiny packs.
+	The default is to always preserve the pack and never
+	generate loose objects.
 
 Performance
 -----------
diff --git a/fast-import.c b/fast-import.c
index 9fc7093..a00bee5 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -166,6 +166,7 @@ Format of STDIN stream:
 #include "quote.h"
 #include "exec_cmd.h"
 #include "dir.h"
+#include "run-command.h"
 
 #define PACK_ID_BITS 16
 #define MAX_PACK_ID ((1<<PACK_ID_BITS)-1)
@@ -282,6 +283,7 @@ struct recent_command {
 /* Configured limits on output */
 static unsigned long max_depth = 10;
 static off_t max_packsize;
+static off_t min_packsize;
 static int force_update;
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
 static int pack_compression_seen;
@@ -950,6 +952,22 @@ static void unkeep_all_packs(void)
 	}
 }
 
+static int loosen_small_pack(const struct packed_git *p)
+{
+	struct child_process unpack = CHILD_PROCESS_INIT;
+
+	if (lseek(p->pack_fd, 0, SEEK_SET) < 0)
+		die_errno("Failed seeking to start of '%s'", p->pack_name);
+
+	unpack.in = p->pack_fd;
+	unpack.git_cmd = 1;
+	unpack.stdout_to_stderr = 1;
+	argv_array_push(&unpack.args, "unpack-objects");
+	argv_array_push(&unpack.args, "-q");
+
+	return run_command(&unpack);
+}
+
 static void end_packfile(void)
 {
 	static int running;
@@ -972,6 +990,12 @@ static void end_packfile(void)
 		fixup_pack_header_footer(pack_data->pack_fd, pack_data->sha1,
 				    pack_data->pack_name, object_count,
 				    cur_pack_sha1, pack_size);
+
+		if (pack_size < min_packsize) {
+			if (loosen_small_pack(pack_data) == 0)
+				goto discard_pack;
+		}
+
 		close(pack_data->pack_fd);
 		idx_name = keep_pack(create_index());
 
@@ -1002,6 +1026,7 @@ static void end_packfile(void)
 		pack_id++;
 	}
 	else {
+discard_pack:
 		close(pack_data->pack_fd);
 		unlink_or_warn(pack_data->pack_name);
 	}
@@ -3237,6 +3262,11 @@ static int parse_one_option(const char *option)
 			v = 1024 * 1024;
 		}
 		max_packsize = v;
+	} else if (skip_prefix(option, "min-pack-size=", &option)) {
+		unsigned long v;
+		if (!git_parse_ulong(option, &v))
+			return 0;
+		min_packsize = v;
 	} else if (skip_prefix(option, "big-file-threshold=", &option)) {
 		unsigned long v;
 		if (!git_parse_ulong(option, &v))
diff --git a/t/t9302-fast-import-min-packsize.sh b/t/t9302-fast-import-min-packsize.sh
new file mode 100755
index 0000000..7dcdccc
--- /dev/null
+++ b/t/t9302-fast-import-min-packsize.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+test_description='test git fast-import min-packsize'
+. ./test-lib.sh
+
+test_expect_success 'create loose objects on import' '
+	test_tick &&
+	cat >input <<-INPUT_END &&
+	commit refs/heads/master
+	committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
+	data <<COMMIT
+	initial
+	COMMIT
+
+	done
+	INPUT_END
+
+	git fast-import --done --min-pack-size=1g <input &&
+	git fsck --no-progress &&
+	test $(find .git/objects/?? -type f | wc -l) -eq 2 &&
+	test $(find .git/objects/pack -type f | wc -l) -eq 0
+'
+
+test_expect_success 'bigger packs are preserved' '
+	test_tick &&
+	cat >input <<-INPUT_END &&
+	commit refs/heads/master
+	committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
+	data <<COMMIT
+	incremental should create a pack
+	COMMIT
+	from refs/heads/master^0
+
+	done
+	INPUT_END
+
+	git fast-import --done --min-pack-size=10 <input &&
+	git fsck --no-progress &&
+	test $(find .git/objects/?? -type f | wc -l) -eq 2 &&
+	test $(find .git/objects/pack -type f | wc -l) -eq 2
+'
+
+test_done
-- 
EW

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2016-04-25 21:17 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-23  2:42 [PATCH] fast-import: implement --min-pack-size parameter Eric Wong
2016-04-23  3:13 ` Jeff King
2016-04-24  4:32   ` [PATCH v2] fast-import: implement unpack limit Eric Wong
2016-04-24 19:18     ` Junio C Hamano
2016-04-24 20:36       ` Eric Wong
2016-04-25 21:17         ` [PATCH v3] " Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).