git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
  2018-03-01  9:20 ` [PATCH/RFC 0/1] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:20   ` Nguyễn Thái Ngọc Duy
  2018-03-01 18:14     ` Junio C Hamano
  2018-03-05 14:00     ` Ævar Arnfjörð Bjarmason
  0 siblings, 2 replies; 5+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:20 UTC (permalink / raw
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

pack-objects could be a big memory hog especially on large repos,
everybody knows that. The suggestion to stick a .keep file on the
largest pack to avoid this problem is also known for a long time.

Let's do the suggestion automatically instead of waiting for people to
come to Git mailing list and get the advice. When a certain condition
is met, gc --auto create a .keep file temporary before repack is run,
then remove it afterward.

gc --auto does this based on an estimation of pack-objects memory
usage and whether that fits in one third of system memory (the
assumption here is for desktop environment where there are many other
applications running).

Since the estimation may be inaccurate and that 1/3 threshold is
arbitrary, give the user a finer control over this mechanism as well:
if the largest pack is larger than gc.bigPackThreshold, it's kept.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/gc.c           | 125 +++++++++++++++++++++++++++++++++++++++--
 builtin/pack-objects.c |   2 +-
 config.mak.uname       |   1 +
 git-compat-util.h      |   4 ++
 pack-objects.h         |   2 +
 5 files changed, 128 insertions(+), 6 deletions(-)

diff --git a/builtin/gc.c b/builtin/gc.c
index 77fa720bd0..2d9965bcdf 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,10 @@
 #include "argv-array.h"
 #include "commit.h"
 #include "packfile.h"
+#include "pack.h"
+#include "pack-objects.h"
+#include "blob.h"
+#include "tree.h"
 
 #define FAILED_RUN "failed to run %s"
 
@@ -39,6 +43,8 @@ static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
 static const char *prune_expire = "2.weeks.ago";
 static const char *prune_worktrees_expire = "3.months.ago";
+static unsigned long big_pack_threshold = 0;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 
 static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT;
 static struct argv_array reflog = ARGV_ARRAY_INIT;
@@ -49,6 +55,7 @@ static struct argv_array rerere = ARGV_ARRAY_INIT;
 
 static struct tempfile *pidfile;
 static struct lock_file log_lock;
+static struct strbuf temp_keep_file = STRBUF_INIT;
 
 static struct string_list pack_garbage = STRING_LIST_INIT_DUP;
 
@@ -93,6 +100,18 @@ static void process_log_file(void)
 	}
 }
 
+static void delete_temp_keep_file(void)
+{
+	unlink(temp_keep_file.buf);
+}
+
+static void delete_temp_keep_file_on_signal(int signo)
+{
+	delete_temp_keep_file();
+	sigchain_pop(signo);
+	raise(signo);
+}
+
 static void process_log_file_at_exit(void)
 {
 	fflush(stderr);
@@ -126,6 +145,9 @@ static void gc_config(void)
 	git_config_get_expiry("gc.worktreepruneexpire", &prune_worktrees_expire);
 	git_config_get_expiry("gc.logexpiry", &gc_log_expire);
 
+	git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold);
+	git_config_get_ulong("pack.deltacachesize", &max_delta_cache_size);
+
 	git_config(git_default_config, NULL);
 }
 
@@ -164,7 +186,7 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
-static int too_many_packs(void)
+static int too_many_packs(struct packed_git **largest_pack)
 {
 	struct packed_git *p;
 	int cnt;
@@ -173,22 +195,104 @@ static int too_many_packs(void)
 		return 0;
 
 	prepare_packed_git();
+	*largest_pack = NULL;
 	for (cnt = 0, p = packed_git; p; p = p->next) {
 		if (!p->pack_local)
 			continue;
 		if (p->pack_keep)
 			continue;
+		if (!*largest_pack || (*largest_pack)->pack_size  < p->pack_size)
+			*largest_pack = p;
 		/*
 		 * Perhaps check the size of the pack and count only
 		 * very small ones here?
 		 */
 		cnt++;
 	}
+
 	return gc_auto_pack_limit < cnt;
 }
 
-static void add_repack_all_option(void)
+static inline unsigned long total_ram(void)
+{
+	unsigned long default_ram = 4;
+#ifdef HAVE_SYSINFO
+	struct sysinfo si;
+
+	if (!sysinfo(&si))
+		return si.totalram;
+#elif defined(HAVE_BSD_SYSCTL) && defined(HW_MEMSIZE)
+	int64_t physical_memory;
+	int mib[2];
+	int length;
+
+	mib[0] = CTL_HW;
+	mib[1] = HW_MEMSIZE;
+	length = sizeof(int64_t);
+	if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
+		return physical_memory;
+#elif defined(GIT_WINDOWS_NATIVE)
+	MEMORYSTATUSEX memInfo;
+
+	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
+	if (GlobalMemoryStatusEx(&memInfo))
+		return memInfo.ullTotalPhys;
+#else
+	fprintf(stderr, _("unrecognized platform, assuming %lu GB RAM\n"),
+		default_ram);
+#endif
+	return default_ram * 1024 * 1024 * 1024;
+}
+
+static int pack_objects_uses_too_much_memory(struct packed_git *pack)
+{
+	unsigned long nr_objects = approximate_object_count();
+	size_t mem_want, mem_have;
+
+	if (!pack || !nr_objects)
+		return 0;
+
+	if (big_pack_threshold)
+		return pack->pack_size >= big_pack_threshold;
+
+	/* First we have to scan through at least one pack */
+	mem_want = pack->pack_size + pack->index_size;
+	/* then pack-objects needs lots more for book keeping */
+	mem_want += sizeof(struct object_entry) * nr_objects;
+	/*
+	 * internal rev-list --all --objects takes up some memory too,
+	 * let's say half of it is for blobs
+	 */
+	mem_want += sizeof(struct blob) * nr_objects / 2;
+	/*
+	 * and the other half is for trees (commits and tags are
+	 * usually insignificant)
+	 */
+	mem_want += sizeof(struct tree) * nr_objects / 2;
+	/* and then obj_hash[], underestimated in fact */
+	mem_want += sizeof(struct object *) * nr_objects;
+	/*
+	 * read_sha1_file() (either at delta calculation phase, or
+	 * writing phase) also fills up the delta base cache
+	 */
+	mem_want += delta_base_cache_limit;
+	/* and of course pack-objects has its own delta cache */
+	mem_want += max_delta_cache_size;
+
+	/* Only allow 1/3 of memory for pack-objects */
+	mem_have = total_ram() / 3;
+
+	return mem_want >= mem_have;
+
+}
+
+static void add_repack_all_option(struct packed_git *exclude_pack)
 {
+	if (pack_objects_uses_too_much_memory(exclude_pack)) {
+		strbuf_addstr(&temp_keep_file, exclude_pack->pack_name);
+		strbuf_strip_suffix(&temp_keep_file, ".pack");
+		strbuf_addstr(&temp_keep_file, ".keep");
+	}
 	if (prune_expire && !strcmp(prune_expire, "now"))
 		argv_array_push(&repack, "-a");
 	else {
@@ -205,6 +309,7 @@ static void add_repack_incremental_option(void)
 
 static int need_to_gc(void)
 {
+	struct packed_git *largest_pack = NULL;
 	/*
 	 * Setting gc.auto to 0 or negative can disable the
 	 * automatic gc.
@@ -218,8 +323,8 @@ static int need_to_gc(void)
 	 * we run "repack -A -d -l".  Otherwise we tell the caller
 	 * there is no need.
 	 */
-	if (too_many_packs())
-		add_repack_all_option();
+	if (too_many_packs(&largest_pack))
+		add_repack_all_option(largest_pack);
 	else if (too_many_loose_objects())
 		add_repack_incremental_option();
 	else
@@ -428,7 +533,7 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 			daemonized = !daemonize();
 		}
 	} else
-		add_repack_all_option();
+		add_repack_all_option(NULL);
 
 	name = lock_repo_for_gc(force, &pid);
 	if (name) {
@@ -450,6 +555,16 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 	if (gc_before_repack())
 		return -1;
 
+	if (temp_keep_file.len) {
+		int fd = open(temp_keep_file.buf, O_CREAT | O_RDWR, 0644);
+		if (fd != -1) {
+			sigchain_push_common(delete_temp_keep_file_on_signal);
+			atexit(delete_temp_keep_file);
+			close(fd);
+		} else {
+			strbuf_release(&temp_keep_file);
+		}
+	}
 	if (!repository_format_precious_objects) {
 		if (run_command_v_opt(repack.argv, RUN_GIT_CMD))
 			return error(FAILED_RUN, repack.argv[0]);
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..722cc999dc 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -78,7 +78,7 @@ static uint16_t write_bitmap_options;
 static int exclude_promisor_objects;
 
 static unsigned long delta_cache_size = 0;
-static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 static unsigned long cache_max_small_delta_size = 1000;
 
 static unsigned long window_memory_limit = 0;
diff --git a/config.mak.uname b/config.mak.uname
index 6a1d0de0cc..ae9cbccec1 100644
--- a/config.mak.uname
+++ b/config.mak.uname
@@ -37,6 +37,7 @@ ifeq ($(uname_S),Linux)
 	HAVE_GETDELIM = YesPlease
 	SANE_TEXT_GREP=-a
 	FREAD_READS_DIRECTORIES = UnfortunatelyYes
+	BASIC_CFLAGS += -DHAVE_SYSINFO
 endif
 ifeq ($(uname_S),GNU/kFreeBSD)
 	HAVE_ALLOCA_H = YesPlease
diff --git a/git-compat-util.h b/git-compat-util.h
index 68b2ad531e..a84b21986d 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -284,6 +284,10 @@ extern char *gitdirname(char *);
 #include <openssl/err.h>
 #endif
 
+#ifdef HAVE_SYSINFO
+# include <sys/sysinfo.h>
+#endif
+
 /* On most systems <netdb.h> would have given us this, but
  * not on some systems (e.g. z/OS).
  */
diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..af4f46c026 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,8 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024)
+
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
  2018-03-01  9:20   ` [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Nguyễn Thái Ngọc Duy
@ 2018-03-01 18:14     ` Junio C Hamano
  2018-03-02  0:00       ` Duy Nguyen
  2018-03-05 14:00     ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 5+ messages in thread
From: Junio C Hamano @ 2018-03-01 18:14 UTC (permalink / raw
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> pack-objects could be a big memory hog especially on large repos,
> everybody knows that. The suggestion to stick a .keep file on the
> largest pack to avoid this problem is also known for a long time.

Yup, but note that it is not "largest" per-se.  The thing being large
is a mere consequence that it is the base pack that holds the bulk
of older parts of the history (e.g. the one that you obtained via
the initial clone).

> Let's do the suggestion automatically instead of waiting for people to
> come to Git mailing list and get the advice. When a certain condition
> is met, gc --auto create a .keep file temporary before repack is run,
> then remove it afterward.
>
> gc --auto does this based on an estimation of pack-objects memory
> usage and whether that fits in one third of system memory (the
> assumption here is for desktop environment where there are many other
> applications running).
>
> Since the estimation may be inaccurate and that 1/3 threshold is
> arbitrary, give the user a finer control over this mechanism as well:
> if the largest pack is larger than gc.bigPackThreshold, it's kept.

If this is a transient mechanism during a single gc session, it
would be far more preferrable if we can find a way to do this
without actually having a .keep file on the filesystem.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
  2018-03-01 18:14     ` Junio C Hamano
@ 2018-03-02  0:00       ` Duy Nguyen
  0 siblings, 0 replies; 5+ messages in thread
From: Duy Nguyen @ 2018-03-02  0:00 UTC (permalink / raw
  To: Junio C Hamano; +Cc: Git Mailing List, Jeff King, Eric Wong

On Fri, Mar 2, 2018 at 1:14 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> pack-objects could be a big memory hog especially on large repos,
>> everybody knows that. The suggestion to stick a .keep file on the
>> largest pack to avoid this problem is also known for a long time.
>
> Yup, but not that it is not "largest" per-se.  The thing being large
> is a mere consequence that it is the base pack that holds the bulk
> of older parts of the history (e.g. the one that you obtained via
> the initial clone).

Thanks, "base pack" sounds much better. I was having trouble with
wording because I didn't nail this one down.

>> Let's do the suggestion automatically instead of waiting for people to
>> come to Git mailing list and get the advice. When a certain condition
>> is met, gc --auto create a .keep file temporary before repack is run,
>> then remove it afterward.
>>
>> gc --auto does this based on an estimation of pack-objects memory
>> usage and whether that fits in one third of system memory (the
>> assumption here is for desktop environment where there are many other
>> applications running).
>>
>> Since the estimation may be inaccurate and that 1/3 threshold is
>> arbitrary, give the user a finer control over this mechanism as well:
>> if the largest pack is larger than gc.bigPackThreshold, it's kept.
>
> If this is a transient mechanism during a single gc session, it
> would be far more preferrable if we can find a way to do this
> without actually having a .keep file on the filesystem.

That was my first attempt, manipulating packed_git::pack_keep inside
pack-objects. Then my whole git.git was gone. I was scared off so I
did this instead.

I've learned my lesson though (never test dangerous operations on your
worktree!) and will do that pack_keep again _if_ this gc --auto still
sounds like a good direction to go.
-- 
Duy

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
  2018-03-01  9:20   ` [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Nguyễn Thái Ngọc Duy
  2018-03-01 18:14     ` Junio C Hamano
@ 2018-03-05 14:00     ` Ævar Arnfjörð Bjarmason
  1 sibling, 0 replies; 5+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-05 14:00 UTC (permalink / raw
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong


On Thu, Mar 01 2018, Nguyễn Thái Ngọc Duy jotted:

> pack-objects could be a big memory hog especially on large repos,
> everybody knows that. The suggestion to stick a .keep file on the
> largest pack to avoid this problem is also known for a long time.
>
> Let's do the suggestion automatically instead of waiting for people to
> come to Git mailing list and get the advice. When a certain condition
> is met, gc --auto create a .keep file temporary before repack is run,
> then remove it afterward.
>
> gc --auto does this based on an estimation of pack-objects memory
> usage and whether that fits in one third of system memory (the
> assumption here is for desktop environment where there are many other
> applications running).
>
> Since the estimation may be inaccurate and that 1/3 threshold is
> arbitrary, give the user a finer control over this mechanism as well:
> if the largest pack is larger than gc.bigPackThreshold, it's kept.

This is very promising. Saves lots of memory on my ad-hoc testing of
adding a *.keep file on an in-house repo.

> +	if (big_pack_threshold)
> +		return pack->pack_size >= big_pack_threshold;
> +
> +	/* First we have to scan through at least one pack */
> +	mem_want = pack->pack_size + pack->index_size;
> +	/* then pack-objects needs lots more for book keeping */
> +	mem_want += sizeof(struct object_entry) * nr_objects;
> +	/*
> +	 * internal rev-list --all --objects takes up some memory too,
> +	 * let's say half of it is for blobs
> +	 */
> +	mem_want += sizeof(struct blob) * nr_objects / 2;
> +	/*
> +	 * and the other half is for trees (commits and tags are
> +	 * usually insignificant)
> +	 */
> +	mem_want += sizeof(struct tree) * nr_objects / 2;
> +	/* and then obj_hash[], underestimated in fact */
> +	mem_want += sizeof(struct object *) * nr_objects;
> +	/*
> +	 * read_sha1_file() (either at delta calculation phase, or
> +	 * writing phase) also fills up the delta base cache
> +	 */
> +	mem_want += delta_base_cache_limit;
> +	/* and of course pack-objects has its own delta cache */
> +	mem_want += max_delta_cache_size;

I'm not familiar enough with this part to say, but isn't this assuming a
lot about the distribution of objects in a way that will cause is not to
repack in some pathological cases?

Probably worth documenting...

> +	/* Only allow 1/3 of memory for pack-objects */
> +	mem_have = total_ram() / 3;

Would be great to have this be a configurable variable, so you could set
it to e.g. 33% (like here), 50% etc.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
@ 2018-03-06  0:10 Duy Nguyen
  0 siblings, 0 replies; 5+ messages in thread
From: Duy Nguyen @ 2018-03-06  0:10 UTC (permalink / raw
  To: Ævar Arnfjörð Bjarmason
  Cc: Git Mailing List, Jeff King, Eric Wong

On Mon, Mar 5, 2018 at 9:00 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Thu, Mar 01 2018, Nguyễn Thái Ngọc Duy jotted:
>
>> pack-objects could be a big memory hog especially on large repos,
>> everybody knows that. The suggestion to stick a .keep file on the
>> largest pack to avoid this problem is also known for a long time.
>>
>> Let's do the suggestion automatically instead of waiting for people to
>> come to Git mailing list and get the advice. When a certain condition
>> is met, gc --auto create a .keep file temporary before repack is run,
>> then remove it afterward.
>>
>> gc --auto does this based on an estimation of pack-objects memory
>> usage and whether that fits in one third of system memory (the
>> assumption here is for desktop environment where there are many other
>> applications running).
>>
>> Since the estimation may be inaccurate and that 1/3 threshold is
>> arbitrary, give the user a finer control over this mechanism as well:
>> if the largest pack is larger than gc.bigPackThreshold, it's kept.
>
> This is very promising. Saves lots of memory on my ad-hoc testing of
> adding a *.keep file on an in-house repo.

The good news for you is when we run external rev-list on top of this,
memory consumption seems even better (and I think even peak memory
should be a bit lower too, but I'll need to verify that).

>> +     if (big_pack_threshold)
>> +             return pack->pack_size >= big_pack_threshold;
>> +
>> +     /* First we have to scan through at least one pack */
>> +     mem_want = pack->pack_size + pack->index_size;
>> +     /* then pack-objects needs lots more for book keeping */
>> +     mem_want += sizeof(struct object_entry) * nr_objects;
>> +     /*
>> +      * internal rev-list --all --objects takes up some memory too,
>> +      * let's say half of it is for blobs
>> +      */
>> +     mem_want += sizeof(struct blob) * nr_objects / 2;
>> +     /*
>> +      * and the other half is for trees (commits and tags are
>> +      * usually insignificant)
>> +      */
>> +     mem_want += sizeof(struct tree) * nr_objects / 2;
>> +     /* and then obj_hash[], underestimated in fact */
>> +     mem_want += sizeof(struct object *) * nr_objects;
>> +     /*
>> +      * read_sha1_file() (either at delta calculation phase, or
>> +      * writing phase) also fills up the delta base cache
>> +      */
>> +     mem_want += delta_base_cache_limit;
>> +     /* and of course pack-objects has its own delta cache */
>> +     mem_want += max_delta_cache_size;
>
> I'm not familiar enough with this part to say, but isn't this assuming a
> lot about the distribution of objects in a way that will cause is not to
> repack in some pathological cases?

Yeah this assumes a "normal" case. When the estimation is really off,
either we exclude the base pack or repack everything unnecessarily,
but we always repack. A wrong decision here can only affect
performance, not correctness.

There's one case I probably should address though. This "exclude the
base pack" will create two packs in the end, one big and one small.
But if the second pack is getting as big as the first one, it's time
we merge both into one.

> Probably worth documenting...
>
>> +     /* Only allow 1/3 of memory for pack-objects */
>> +     mem_have = total_ram() / 3;
>
> Would be great to have this be a configurable variable, so you could set
> it to e.g. 33% (like here), 50% etc.

Hmm.. isn't gc.bigPackThreshold enough? I mean in a controlled
environment, you probably already know how much ram is available, and
much of this estimation is based on pack size (well the number of
objects in the pack) anyway, you could avoid all this heuristics by
saying "when the base pack is larger than 1GB, always exclude it in
repack". This estimation should only needed when people do not
configure anything (and still expect reasonable defaults). Or when you
plan multiple 'gc' runs on the same machine?
-- 
Duy

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-03-06  0:10 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-03-06  0:10 [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Duy Nguyen
  -- strict thread matches above, loose matches on Subject: below --
2018-02-28  9:27 Reduce pack-objects memory footprint? Duy Nguyen
2018-03-01  9:20 ` [PATCH/RFC 0/1] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
2018-03-01  9:20   ` [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Nguyễn Thái Ngọc Duy
2018-03-01 18:14     ` Junio C Hamano
2018-03-02  0:00       ` Duy Nguyen
2018-03-05 14:00     ` Ævar Arnfjörð Bjarmason

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).