git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* 2.18.0 Regression: packing performance and effectiveness
@ 2018-07-18 22:51 Elijah Newren
  2018-07-18 22:51 ` [RFC PATCH] fix-v1: revert "pack-objects: shrink delta_size field in struct object_entry" Elijah Newren
                   ` (4 more replies)
  0 siblings, 5 replies; 44+ messages in thread
From: Elijah Newren @ 2018-07-18 22:51 UTC (permalink / raw)
  To: git; +Cc: pclouds, Elijah Newren

I had a user report some poor behavior of 'git gc --aggressive' on a
certain repo (which I sadly cannot share).  Turns out that on this
repo, this operation takes about 60% longer and produces a pack
roughly twice the expected size.

Naturally, bisecting takes a while but it points to this commit:

0aca34e8269514ebb67676e0470a314615494ae8 is the first bad commit
commit 0aca34e8269514ebb67676e0470a314615494ae8
Author: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Date:   Sat Apr 14 17:35:11 2018 +0200

    pack-objects: shrink delta_size field in struct object_entry

    Allowing a delta size of 64 bits is crazy. Shrink this field down to
    20 bits with one overflow bit.

    If we find an existing delta larger than 1MB, we do not cache
    delta_size at all and will get the value from oe_size(), potentially
    from disk if it's larger than 4GB.

    Note, since DELTA_SIZE() is used in try_delta() code, it must be
    thread-safe. Luckily oe_size() does guarantee this so we it is
    thread-safe.

    Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
    Signed-off-by: Junio C Hamano <gitster@pobox.com>

To put some numbers behind this, I got on a very beefy box (40 cores,
160GB RAM) and ran some comparisons:

Version  Pack (MB)  MaxRSS(kB)  Time (s)
-------  ---------  ----------  --------
 2.17.0     5498     43513628    2494.85
 2.18.0    10531     40449596    4168.94
 fix-v1     5509     42509784    2480.74
 fiv-v2     5509     41644104    2468.25

where fix-v1 and fix-v2 are different patches on top of git-2.18.0
that I'll follow up to this email with.  The patches are just meant as
discussion starters.  I'll add signoffs and proper commit messages if
folks actually want those fixes, but I suspect they're just starting
points for discussion.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC PATCH] fix-v1: revert "pack-objects: shrink delta_size field in struct object_entry"
  2018-07-18 22:51 2.18.0 Regression: packing performance and effectiveness Elijah Newren
@ 2018-07-18 22:51 ` Elijah Newren
  2018-07-18 22:51 ` [RFC PATCH] fix-v2: make OE_DELTA_SIZE_BITS a bit bigger Elijah Newren
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 44+ messages in thread
From: Elijah Newren @ 2018-07-18 22:51 UTC (permalink / raw)
  To: git; +Cc: pclouds, Elijah Newren

This reverts commit 0aca34e8269514ebb67676e0470a314615494ae8.

---
 builtin/pack-objects.c | 26 ++++++++++----------------
 pack-objects.h         | 23 +----------------------
 2 files changed, 11 insertions(+), 38 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 71056d8294..4775b4b4e5 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -35,12 +35,10 @@
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
 #define SIZE(obj) oe_size(&to_pack, obj)
 #define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
-#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
 #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
-#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
 #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
 #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
@@ -148,7 +146,7 @@ static void *get_delta(struct object_entry *entry)
 		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
-	if (!delta_buf || delta_size != DELTA_SIZE(entry))
+	if (!delta_buf || delta_size != entry->delta_size)
 		die("delta size changed");
 	free(buf);
 	free(base_buf);
@@ -299,14 +297,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		FREE_AND_NULL(entry->delta_data);
 		entry->z_delta_size = 0;
 	} else if (entry->delta_data) {
-		size = DELTA_SIZE(entry);
+		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
-		size = DELTA_SIZE(entry);
+		size = entry->delta_size;
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
@@ -1518,7 +1516,7 @@ static void check_object(struct object_entry *entry)
 			oe_set_type(entry, entry->in_pack_type);
 			SET_SIZE(entry, in_pack_size); /* delta size */
 			SET_DELTA(entry, base_entry);
-			SET_DELTA_SIZE(entry, in_pack_size);
+			entry->delta_size = in_pack_size;
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1954,7 +1952,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		max_size = trg_size/2 - the_hash_algo->rawsz;
 		ref_depth = 1;
 	} else {
-		max_size = DELTA_SIZE(trg_entry);
+		max_size = trg_entry->delta_size;
 		ref_depth = trg->depth;
 	}
 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
@@ -2023,14 +2021,10 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
-		free(delta_buf);
-		return 0;
-	}
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
-		if (delta_size == DELTA_SIZE(trg_entry) &&
+		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
 			free(delta_buf);
 			return 0;
@@ -2045,7 +2039,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	free(trg_entry->delta_data);
 	cache_lock();
 	if (trg_entry->delta_data) {
-		delta_cache_size -= DELTA_SIZE(trg_entry);
+		delta_cache_size -= trg_entry->delta_size;
 		trg_entry->delta_data = NULL;
 	}
 	if (delta_cacheable(src_size, trg_size, delta_size)) {
@@ -2058,7 +2052,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	}
 
 	SET_DELTA(trg_entry, src_entry);
-	SET_DELTA_SIZE(trg_entry, delta_size);
+	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
 	return 1;
@@ -2181,11 +2175,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		if (entry->delta_data && !pack_to_stdout) {
 			unsigned long size;
 
-			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
+			size = do_compress(&entry->delta_data, entry->delta_size);
 			if (size < (1U << OE_Z_DELTA_BITS)) {
 				entry->z_delta_size = size;
 				cache_lock();
-				delta_cache_size -= DELTA_SIZE(entry);
+				delta_cache_size -= entry->delta_size;
 				delta_cache_size += entry->z_delta_size;
 				cache_unlock();
 			} else {
diff --git a/pack-objects.h b/pack-objects.h
index edf74dabdd..0e08f10437 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -14,7 +14,6 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	20
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -92,8 +91,7 @@ struct object_entry {
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
-	unsigned delta_size_valid:1;
+	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_valid:1;
@@ -327,23 +325,4 @@ static inline void oe_set_size(struct packing_data *pack,
 	}
 }
 
-static inline unsigned long oe_delta_size(struct packing_data *pack,
-					  const struct object_entry *e)
-{
-	if (e->delta_size_valid)
-		return e->delta_size_;
-	return oe_size(pack, e);
-}
-
-static inline void oe_set_delta_size(struct packing_data *pack,
-				     struct object_entry *e,
-				     unsigned long size)
-{
-	e->delta_size_ = size;
-	e->delta_size_valid = e->delta_size_ == size;
-	if (!e->delta_size_valid && size != oe_size(pack, e))
-		BUG("this can only happen in check_object() "
-		    "where delta size is the same as entry size");
-}
-
 #endif
-- 
2.18.0.1.gd83e732e4e


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [RFC PATCH] fix-v2: make OE_DELTA_SIZE_BITS a bit bigger
  2018-07-18 22:51 2.18.0 Regression: packing performance and effectiveness Elijah Newren
  2018-07-18 22:51 ` [RFC PATCH] fix-v1: revert "pack-objects: shrink delta_size field in struct object_entry" Elijah Newren
@ 2018-07-18 22:51 ` Elijah Newren
  2018-07-19  5:41 ` 2.18.0 Regression: packing performance and effectiveness Duy Nguyen
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 44+ messages in thread
From: Elijah Newren @ 2018-07-18 22:51 UTC (permalink / raw)
  To: git; +Cc: pclouds, Elijah Newren

---
builtin/pack-objects.c | 2 +-
 pack-objects.h         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 71056d8294..49b7af295d 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2023,7 +2023,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
+	if (delta_size >= (1UL << OE_DELTA_SIZE_BITS)) {
 		free(delta_buf);
 		return 0;
 	}
diff --git a/pack-objects.h b/pack-objects.h
index edf74dabdd..26021328aa 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -14,7 +14,7 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	20
+#define OE_DELTA_SIZE_BITS	32
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
-- 
2.18.0.1.gd83e732e4e


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-18 22:51 2.18.0 Regression: packing performance and effectiveness Elijah Newren
  2018-07-18 22:51 ` [RFC PATCH] fix-v1: revert "pack-objects: shrink delta_size field in struct object_entry" Elijah Newren
  2018-07-18 22:51 ` [RFC PATCH] fix-v2: make OE_DELTA_SIZE_BITS a bit bigger Elijah Newren
@ 2018-07-19  5:41 ` Duy Nguyen
  2018-07-19  5:49   ` Jeff King
  2018-07-19 15:27   ` Elijah Newren
  2018-07-19  5:44 ` Jeff King
  2018-07-20 15:39 ` [PATCH] pack-objects: fix performance issues on packing large deltas Nguyễn Thái Ngọc Duy
  4 siblings, 2 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-19  5:41 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List

On Thu, Jul 19, 2018 at 12:51 AM Elijah Newren <newren@gmail.com> wrote:
>
> I had a user report some poor behavior of 'git gc --aggressive' on a
> certain repo (which I sadly cannot share).  Turns out that on this
> repo, this operation takes about 60% longer and produces a pack
> roughly twice the expected size.

The intention was to make life better for weaker machines but
definitely should not slow down beefier ones, so yes this is
definitely a regression.

Is it possible to share "verify-pack -v <pack file>" output of the
pack produced by 2.17.0 and 2.18.0? The only sensitive info there is
sha-1, which you can replace with just "SHA-1" if you want. I'm more
interested in delta sizes and distribution.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-18 22:51 2.18.0 Regression: packing performance and effectiveness Elijah Newren
                   ` (2 preceding siblings ...)
  2018-07-19  5:41 ` 2.18.0 Regression: packing performance and effectiveness Duy Nguyen
@ 2018-07-19  5:44 ` Jeff King
  2018-07-19  5:57   ` Duy Nguyen
  2018-07-20 15:39 ` [PATCH] pack-objects: fix performance issues on packing large deltas Nguyễn Thái Ngọc Duy
  4 siblings, 1 reply; 44+ messages in thread
From: Jeff King @ 2018-07-19  5:44 UTC (permalink / raw)
  To: Elijah Newren; +Cc: git, pclouds

On Wed, Jul 18, 2018 at 03:51:08PM -0700, Elijah Newren wrote:

> I had a user report some poor behavior of 'git gc --aggressive' on a
> certain repo (which I sadly cannot share).  Turns out that on this
> repo, this operation takes about 60% longer and produces a pack
> roughly twice the expected size.
> 
> Naturally, bisecting takes a while but it points to this commit:

Thanks for finding and bisecting.

> To put some numbers behind this, I got on a very beefy box (40 cores,
> 160GB RAM) and ran some comparisons:
> 
> Version  Pack (MB)  MaxRSS(kB)  Time (s)
> -------  ---------  ----------  --------
>  2.17.0     5498     43513628    2494.85
>  2.18.0    10531     40449596    4168.94
>  fix-v1     5509     42509784    2480.74
>  fiv-v2     5509     41644104    2468.25
> 
> where fix-v1 and fix-v2 are different patches on top of git-2.18.0
> that I'll follow up to this email with.  The patches are just meant as
> discussion starters.  I'll add signoffs and proper commit messages if
> folks actually want those fixes, but I suspect they're just starting
> points for discussion.

Hmm. So what's the mechanism causing the issue here? Looking at the
patch from 0aca34e826 (pack-objects: shrink delta_size field in struct
object_entry, 2018-04-14), it should _mostly_ be about how we store data
in memory, and should not impact the deltas we find.

But reading the patch and looking at your v2, it seems clear that this
hunk is the culprit:

@@ -2006,10 +2008,14 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
        delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
        if (!delta_buf)
                return 0;
+       if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
+               free(delta_buf);
+               return 0;
+       }

So we're punting on large deltas, and presumably your test case has some
files that are large but delta well (and we miss those opportunities,
which wastes further time in the delta search and creates a lousy pack).

I'm not sure why we need this hunk, though. Eventually we'd hit
SET_DELTA_SIZE(), and I thought the point is that it should handle large
sizes with some kind of fallback (the commit message even mentions an
"overflow bit"). Looking at oe_set_delta_size(), though, I don't think
the fallback would really work here; we'd hid the BUG().

At any rate, it should be easy to construct a test case.

-Peff

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19  5:41 ` 2.18.0 Regression: packing performance and effectiveness Duy Nguyen
@ 2018-07-19  5:49   ` Jeff King
  2018-07-19 15:27   ` Elijah Newren
  1 sibling, 0 replies; 44+ messages in thread
From: Jeff King @ 2018-07-19  5:49 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Elijah Newren, Git Mailing List

On Thu, Jul 19, 2018 at 07:41:03AM +0200, Duy Nguyen wrote:

> On Thu, Jul 19, 2018 at 12:51 AM Elijah Newren <newren@gmail.com> wrote:
> >
> > I had a user report some poor behavior of 'git gc --aggressive' on a
> > certain repo (which I sadly cannot share).  Turns out that on this
> > repo, this operation takes about 60% longer and produces a pack
> > roughly twice the expected size.
> 
> The intention was to make life better for weaker machines but
> definitely should not slow down beefier ones, so yes this is
> definitely a regression.
> 
> Is it possible to share "verify-pack -v <pack file>" output of the
> pack produced by 2.17.0 and 2.18.0? The only sensitive info there is
> sha-1, which you can replace with just "SHA-1" if you want. I'm more
> interested in delta sizes and distribution.

Try this:

-- >8 --
#!/bin/sh

rm -rf repo

git init repo
cd repo

dd if=/dev/urandom of=one.rand bs=1M count=2
dd if=/dev/urandom of=two.rand bs=1M count=2
dd if=/dev/urandom of=big.rand bs=1M count=20

cat one.rand big.rand >file
git add file
git commit -m one

cat two.rand big.rand >file
git add file
git commit -m two

git repack -ad
git cat-file --batch-all-objects --batch-check='%(objectname) %(deltabase)'
-- 8< --

Using git 2.17 for the repack results in a single delta (which should be
about 2MB, because it will say "delete one.rand and add two.rand" or
vice versa).

Using git 2.18, I get no delta at all.

-Peff

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19  5:44 ` Jeff King
@ 2018-07-19  5:57   ` Duy Nguyen
  2018-07-19 15:16     ` Duy Nguyen
  0 siblings, 1 reply; 44+ messages in thread
From: Duy Nguyen @ 2018-07-19  5:57 UTC (permalink / raw)
  To: Jeff King; +Cc: Elijah Newren, Git Mailing List

On Thu, Jul 19, 2018 at 7:44 AM Jeff King <peff@peff.net> wrote:
>
> On Wed, Jul 18, 2018 at 03:51:08PM -0700, Elijah Newren wrote:
>
> > I had a user report some poor behavior of 'git gc --aggressive' on a
> > certain repo (which I sadly cannot share).  Turns out that on this
> > repo, this operation takes about 60% longer and produces a pack
> > roughly twice the expected size.
> >
> > Naturally, bisecting takes a while but it points to this commit:
>
> Thanks for finding and bisecting.
>
> > To put some numbers behind this, I got on a very beefy box (40 cores,
> > 160GB RAM) and ran some comparisons:
> >
> > Version  Pack (MB)  MaxRSS(kB)  Time (s)
> > -------  ---------  ----------  --------
> >  2.17.0     5498     43513628    2494.85
> >  2.18.0    10531     40449596    4168.94
> >  fix-v1     5509     42509784    2480.74
> >  fiv-v2     5509     41644104    2468.25
> >
> > where fix-v1 and fix-v2 are different patches on top of git-2.18.0
> > that I'll follow up to this email with.  The patches are just meant as
> > discussion starters.  I'll add signoffs and proper commit messages if
> > folks actually want those fixes, but I suspect they're just starting
> > points for discussion.
>
> Hmm. So what's the mechanism causing the issue here? Looking at the
> patch from 0aca34e826 (pack-objects: shrink delta_size field in struct
> object_entry, 2018-04-14), it should _mostly_ be about how we store data
> in memory, and should not impact the deltas we find.
>
> But reading the patch and looking at your v2, it seems clear that this
> hunk is the culprit:
>
> @@ -2006,10 +2008,14 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
>         delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
>         if (!delta_buf)
>                 return 0;
> +       if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
> +               free(delta_buf);
> +               return 0;
> +       }
>
> So we're punting on large deltas, and presumably your test case has some
> files that are large but delta well (and we miss those opportunities,
> which wastes further time in the delta search and creates a lousy pack).
>
> I'm not sure why we need this hunk, though. Eventually we'd hit
> SET_DELTA_SIZE(), and I thought the point is that it should handle large
> sizes with some kind of fallback (the commit message even mentions an
> "overflow bit"). Looking at oe_set_delta_size(), though, I don't think
> the fallback would really work here; we'd hid the BUG().

The fallback is for existing on-disk deltas where we can actually get
the size from. For new deltas we have to store the size somewhere in
memory and we can't because this field's size is reduced.

I thought 2M was generous but I was obviously wrong. I'd like to see
the biggest delta size in this pack and whether it's still reasonable
to increase OE_DELTA_SIZE_BITS. But if it's very close to 4GB limit
then we could just store 64-bit delta size in a separate array.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19  5:57   ` Duy Nguyen
@ 2018-07-19 15:16     ` Duy Nguyen
  2018-07-19 16:42       ` Elijah Newren
                         ` (2 more replies)
  0 siblings, 3 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-19 15:16 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List, Jeff King

On Thu, Jul 19, 2018 at 07:57:37AM +0200, Duy Nguyen wrote:
> I thought 2M was generous but I was obviously wrong. I'd like to see
> the biggest delta size in this pack and whether it's still reasonable
> to increase OE_DELTA_SIZE_BITS. But if it's very close to 4GB limit
> then we could just store 64-bit delta size in a separate array.

I realize now that these two options don't have to be mutually
exclusive and I should provide a good fallback in terms of performance
anyway.

So Elijah, could you try this patch and see if performance and pack
size go back to 1.7.0? I can write proper commit message and test
support later if it proves a good improvement.

-- 8< --
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ebc8cefb53..7f3546057d 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2023,10 +2023,6 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
-		free(delta_buf);
-		return 0;
-	}
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
diff --git a/pack-objects.c b/pack-objects.c
index 92708522e7..191ed45faf 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -160,6 +160,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 
 		if (!pdata->in_pack_by_idx)
 			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
+		if (pdata->delta_size)
+			REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
 	}
 
 	new_entry = pdata->objects + pdata->nr_objects++;
@@ -177,3 +179,17 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 
 	return new_entry;
 }
+
+void oe_prepare_delta_size_array(struct packing_data *pack)
+{
+	int i;
+
+	/*
+	 * nr_alloc, not nr_objects to align with realloc() strategy in
+	 * packlist_alloc()
+	 */
+	ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
+
+	for (i = 0; i < pack->nr_objects; i++)
+		pack->delta_size[i] = pack->objects[i].delta_size_;
+}
diff --git a/pack-objects.h b/pack-objects.h
index edf74dabdd..978500e474 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -14,7 +14,7 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	20
+#define OE_DELTA_SIZE_BITS	21
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -93,7 +93,6 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
-	unsigned delta_size_valid:1;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_valid:1;
@@ -130,6 +129,7 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	unsigned long *delta_size;
 
 	/*
 	 * Only one of these can be non-NULL and they have different
@@ -330,20 +330,27 @@ static inline void oe_set_size(struct packing_data *pack,
 static inline unsigned long oe_delta_size(struct packing_data *pack,
 					  const struct object_entry *e)
 {
-	if (e->delta_size_valid)
+	if (!pack->delta_size)
 		return e->delta_size_;
-	return oe_size(pack, e);
+	return pack->delta_size[e - pack->objects];
 }
 
+void oe_prepare_delta_size_array(struct packing_data *pack);
 static inline void oe_set_delta_size(struct packing_data *pack,
 				     struct object_entry *e,
 				     unsigned long size)
 {
 	e->delta_size_ = size;
-	e->delta_size_valid = e->delta_size_ == size;
-	if (!e->delta_size_valid && size != oe_size(pack, e))
-		BUG("this can only happen in check_object() "
-		    "where delta size is the same as entry size");
+	if (!pack->delta_size && e->delta_size_ == size)
+		return;
+	/*
+	 * We have had at least one delta size exceeding OE_DELTA_SIZE_BITS
+	 * limit. delta_size_ will not be used anymore. All delta sizes are now
+	 * from the delta_size[] array.
+	 */
+	if (!pack->delta_size)
+		oe_prepare_delta_size_array(pack);
+	pack->delta_size[e - pack->objects] = size;
 }
 
 #endif
-- 8< --


--
Duy

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19  5:41 ` 2.18.0 Regression: packing performance and effectiveness Duy Nguyen
  2018-07-19  5:49   ` Jeff King
@ 2018-07-19 15:27   ` Elijah Newren
  2018-07-19 15:43     ` Duy Nguyen
  1 sibling, 1 reply; 44+ messages in thread
From: Elijah Newren @ 2018-07-19 15:27 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List

On Wed, Jul 18, 2018 at 10:41 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Thu, Jul 19, 2018 at 12:51 AM Elijah Newren <newren@gmail.com> wrote:
>>
>> I had a user report some poor behavior of 'git gc --aggressive' on a
>> certain repo (which I sadly cannot share).  Turns out that on this
>> repo, this operation takes about 60% longer and produces a pack
>> roughly twice the expected size.
>
> The intention was to make life better for weaker machines but
> definitely should not slow down beefier ones, so yes this is
> definitely a regression.

Not sure if it matters, but the original discovery was on a laptop.
Probably 4 cores and 16 GB RAM.  I duplicated on my workstation (8
cores, 24 GB RAM).  However, since the original patch was memory
related and I noticed the repacking using all available memory, I
repeated the testcases while measuring memory but did it on a machine
that wouldn't be memory limited.

> Is it possible to share "verify-pack -v <pack file>" output of the
> pack produced by 2.17.0 and 2.18.0? The only sensitive info there is
> sha-1, which you can replace with just "SHA-1" if you want. I'm more
> interested in delta sizes and distribution.

For the deltas, assuming the size-in-pack field (4th column) is the relevant one

Number of objects in repo (lines of verify-pack output): 4460755
Number of deltas: 2831384
Number of deltas greater than 1MB: 72
Min: 14
Median: 47
Mean: 586
99.999 percentile: 11366280.6 (10.8 MB)
Max: 141664210 (135.1 MB)

If the size field (3rd column) is the relevant one, then the numbers
change slightly to:

Number of deltas greater than 1MB: 101
Min: 4
Median: 33
Mean: 660
99.999 percentile: 12245551.7 (11.7 MB)
Max: 144658342 (138.0 MB)

I checked out the blob which had the biggest delta, as well as the
blob it was a delta against.  One was a 280 MB file, the other a 278
MB file.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 15:27   ` Elijah Newren
@ 2018-07-19 15:43     ` Duy Nguyen
  0 siblings, 0 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-19 15:43 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List

On Thu, Jul 19, 2018 at 5:27 PM Elijah Newren <newren@gmail.com> wrote:
>
> On Wed, Jul 18, 2018 at 10:41 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> > On Thu, Jul 19, 2018 at 12:51 AM Elijah Newren <newren@gmail.com> wrote:
> >>
> >> I had a user report some poor behavior of 'git gc --aggressive' on a
> >> certain repo (which I sadly cannot share).  Turns out that on this
> >> repo, this operation takes about 60% longer and produces a pack
> >> roughly twice the expected size.
> >
> > The intention was to make life better for weaker machines but
> > definitely should not slow down beefier ones, so yes this is
> > definitely a regression.
>
> Not sure if it matters, but the original discovery was on a laptop.
> Probably 4 cores and 16 GB RAM.  I duplicated on my workstation (8
> cores, 24 GB RAM).  However, since the original patch was memory
> related and I noticed the repacking using all available memory, I
> repeated the testcases while measuring memory but did it on a machine
> that wouldn't be memory limited.

That should be plentiful for a 5GB pack with default cache settings, I think.

> > Is it possible to share "verify-pack -v <pack file>" output of the
> > pack produced by 2.17.0 and 2.18.0? The only sensitive info there is
> > sha-1, which you can replace with just "SHA-1" if you want. I'm more
> > interested in delta sizes and distribution.
>
> For the deltas, assuming the size-in-pack field (4th column) is the relevant one
>
> Number of objects in repo (lines of verify-pack output): 4460755
> Number of deltas: 2831384
> Number of deltas greater than 1MB: 72
> Min: 14
> Median: 47
> Mean: 586
> 99.999 percentile: 11366280.6 (10.8 MB)
> Max: 141664210 (135.1 MB)
>
> If the size field (3rd column) is the relevant one, then the numbers
> change slightly to:
>
> Number of deltas greater than 1MB: 101
> Min: 4
> Median: 33
> Mean: 660
> 99.999 percentile: 12245551.7 (11.7 MB)

I think size is the right one because even deltas are compressed
before stored in the pack file (I probably should check the code...)
but anyway this number is bigger and that sounds to me like 16MB as
delta size limit should cover common case nicely. 32MB to be on the
safe side, but given default cache size of 256MB, a single delta 1/8
the size of the cache sounds too much.

> Max: 144658342 (138.0 MB)

This one would trigger the patch I just sent, which should also handle
it nicely (I hope).

> I checked out the blob which had the biggest delta, as well as the
> blob it was a delta against.  One was a 280 MB file, the other a 278
> MB file.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 15:16     ` Duy Nguyen
@ 2018-07-19 16:42       ` Elijah Newren
  2018-07-19 17:23         ` Jeff King
  2018-07-19 17:04       ` Jeff King
  2018-07-19 19:25       ` Junio C Hamano
  2 siblings, 1 reply; 44+ messages in thread
From: Elijah Newren @ 2018-07-19 16:42 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Jeff King

On Thu, Jul 19, 2018 at 8:16 AM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Thu, Jul 19, 2018 at 07:57:37AM +0200, Duy Nguyen wrote:
>> I thought 2M was generous but I was obviously wrong. I'd like to see
>> the biggest delta size in this pack and whether it's still reasonable
>> to increase OE_DELTA_SIZE_BITS. But if it's very close to 4GB limit
>> then we could just store 64-bit delta size in a separate array.
>
> I realize now that these two options don't have to be mutually
> exclusive and I should provide a good fallback in terms of performance
> anyway.
>
> So Elijah, could you try this patch and see if performance and pack
> size go back to 1.7.0? I can write proper commit message and test
> support later if it proves a good improvement.

Thanks for the quick turnaround.  Unfortunately, I have some bad news.
With this patch, I get the following:

$ /usr/bin/time -f 'MaxRSS:%M Time:%e' git gc --aggressive
Enumerating objects: 4460703, done.
Counting objects: 100% (4460703/4460703), done.
Delta compression using up to 40 threads.
Compressing objects: 100% (3807140/3807140), done.
Writing objects: 100% (4460703/4460703), done.
Total 4460703 (delta 2831383), reused 1587071 (delta 0)
error: failed to unpack compressed delta at offset 183854150 from
.git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack
fatal: packed object 20ce811e53dabbb8ef9368c108cbbdfa65639c03 (stored
in .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack)
is corrupt
error: failed to run prune
MaxRSS:40025196 Time:2531.52


It looks like this comes after it deleted the old packs, so running
git fsck afterwards prints lots of errors.  Just in case they're
helpful at all, there are 26 "error: failed to unpack compressed delta
at offset..." messages, 26 "error: cannot unpack ... from ... at
offset ..." messages, 17 "error: failed to read delta base object..."
messages, 7 "broken link from ... to ..." messages, four missing
blobs, and three missing trees.

(This was run on a copy of the repo, so no harm done; I can still use
the original.)


> -- 8< --
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index ebc8cefb53..7f3546057d 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -2023,10 +2023,6 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
>         delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
>         if (!delta_buf)
>                 return 0;
> -       if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
> -               free(delta_buf);
> -               return 0;
> -       }
>
>         if (DELTA(trg_entry)) {
>                 /* Prefer only shallower same-sized deltas. */
> diff --git a/pack-objects.c b/pack-objects.c
> index 92708522e7..191ed45faf 100644
> --- a/pack-objects.c
> +++ b/pack-objects.c
> @@ -160,6 +160,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
>
>                 if (!pdata->in_pack_by_idx)
>                         REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
> +               if (pdata->delta_size)
> +                       REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
>         }
>
>         new_entry = pdata->objects + pdata->nr_objects++;
> @@ -177,3 +179,17 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
>
>         return new_entry;
>  }
> +
> +void oe_prepare_delta_size_array(struct packing_data *pack)
> +{
> +       int i;
> +
> +       /*
> +        * nr_alloc, not nr_objects to align with realloc() strategy in
> +        * packlist_alloc()
> +        */
> +       ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
> +
> +       for (i = 0; i < pack->nr_objects; i++)
> +               pack->delta_size[i] = pack->objects[i].delta_size_;
> +}
> diff --git a/pack-objects.h b/pack-objects.h
> index edf74dabdd..978500e474 100644
> --- a/pack-objects.h
> +++ b/pack-objects.h
> @@ -14,7 +14,7 @@
>   * above this limit. Don't lower it too much.
>   */
>  #define OE_SIZE_BITS           31
> -#define OE_DELTA_SIZE_BITS     20
> +#define OE_DELTA_SIZE_BITS     21
>
>  /*
>   * State flags for depth-first search used for analyzing delta cycles.
> @@ -93,7 +93,6 @@ struct object_entry {
>                                      * uses the same base as me
>                                      */
>         unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
> -       unsigned delta_size_valid:1;
>         unsigned in_pack_idx:OE_IN_PACK_BITS;   /* already in pack */
>         unsigned z_delta_size:OE_Z_DELTA_BITS;
>         unsigned type_valid:1;
> @@ -130,6 +129,7 @@ struct packing_data {
>         uint32_t index_size;
>
>         unsigned int *in_pack_pos;
> +       unsigned long *delta_size;
>
>         /*
>          * Only one of these can be non-NULL and they have different
> @@ -330,20 +330,27 @@ static inline void oe_set_size(struct packing_data *pack,
>  static inline unsigned long oe_delta_size(struct packing_data *pack,
>                                           const struct object_entry *e)
>  {
> -       if (e->delta_size_valid)
> +       if (!pack->delta_size)
>                 return e->delta_size_;
> -       return oe_size(pack, e);
> +       return pack->delta_size[e - pack->objects];
>  }
>
> +void oe_prepare_delta_size_array(struct packing_data *pack);
>  static inline void oe_set_delta_size(struct packing_data *pack,
>                                      struct object_entry *e,
>                                      unsigned long size)
>  {
>         e->delta_size_ = size;
> -       e->delta_size_valid = e->delta_size_ == size;
> -       if (!e->delta_size_valid && size != oe_size(pack, e))
> -               BUG("this can only happen in check_object() "
> -                   "where delta size is the same as entry size");
> +       if (!pack->delta_size && e->delta_size_ == size)
> +               return;
> +       /*
> +        * We have had at least one delta size exceeding OE_DELTA_SIZE_BITS
> +        * limit. delta_size_ will not be used anymore. All delta sizes are now
> +        * from the delta_size[] array.
> +        */
> +       if (!pack->delta_size)
> +               oe_prepare_delta_size_array(pack);
> +       pack->delta_size[e - pack->objects] = size;
>  }
>
>  #endif
> -- 8< --
>
>
> --
> Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 15:16     ` Duy Nguyen
  2018-07-19 16:42       ` Elijah Newren
@ 2018-07-19 17:04       ` Jeff King
  2018-07-19 19:25       ` Junio C Hamano
  2 siblings, 0 replies; 44+ messages in thread
From: Jeff King @ 2018-07-19 17:04 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Elijah Newren, Git Mailing List

On Thu, Jul 19, 2018 at 05:16:42PM +0200, Duy Nguyen wrote:

> On Thu, Jul 19, 2018 at 07:57:37AM +0200, Duy Nguyen wrote:
> > I thought 2M was generous but I was obviously wrong. I'd like to see
> > the biggest delta size in this pack and whether it's still reasonable
> > to increase OE_DELTA_SIZE_BITS. But if it's very close to 4GB limit
> > then we could just store 64-bit delta size in a separate array.
> 
> I realize now that these two options don't have to be mutually
> exclusive and I should provide a good fallback in terms of performance
> anyway.

Yeah, this is what I had assumed was happening in the original code. :)

> +void oe_prepare_delta_size_array(struct packing_data *pack)
> +{
> +	int i;
> +
> +	/*
> +	 * nr_alloc, not nr_objects to align with realloc() strategy in
> +	 * packlist_alloc()
> +	 */
> +	ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
> +
> +	for (i = 0; i < pack->nr_objects; i++)
> +		pack->delta_size[i] = pack->objects[i].delta_size_;
> +}

This iterator should probably be a uint32_t, to match nr_objects.

The rest of the patch looks OK to me. From Elijah's failure report there
clearly is _some_ problem where we end up with a truncated write of a
delta. But I don't see it from code inspection, nor could I reproduce
it.

-Peff

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 16:42       ` Elijah Newren
@ 2018-07-19 17:23         ` Jeff King
  2018-07-19 17:31           ` Duy Nguyen
  0 siblings, 1 reply; 44+ messages in thread
From: Jeff King @ 2018-07-19 17:23 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Duy Nguyen, Git Mailing List

On Thu, Jul 19, 2018 at 09:42:00AM -0700, Elijah Newren wrote:

> Thanks for the quick turnaround.  Unfortunately, I have some bad news.
> With this patch, I get the following:
> 
> $ /usr/bin/time -f 'MaxRSS:%M Time:%e' git gc --aggressive
> Enumerating objects: 4460703, done.
> Counting objects: 100% (4460703/4460703), done.
> Delta compression using up to 40 threads.
> Compressing objects: 100% (3807140/3807140), done.
> Writing objects: 100% (4460703/4460703), done.
> Total 4460703 (delta 2831383), reused 1587071 (delta 0)
> error: failed to unpack compressed delta at offset 183854150 from
> .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack
> fatal: packed object 20ce811e53dabbb8ef9368c108cbbdfa65639c03 (stored
> in .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack)
> is corrupt
> error: failed to run prune
> MaxRSS:40025196 Time:2531.52

Looking at that output, my _guess_ is that we somehow end up with a
bogus delta_size value and write out a truncated entry. But I couldn't
reproduce the issue with smaller test cases.

Maybe instrumenting Git with the patch below and running:

  GIT_TRACE_DELTA=$PWD/delta.out git gc --aggressive
  perl -lne '
	/(get|put) ([0-9a-f]{40}) (\d+)/ or next;
	if ($1 eq "put") {
		$h{$2} and print "double put: $2 = ($h{$2}, $3)";
		$h{$2} = $3;
	} else {
		$h{$2} == $3 or print "mismatched get: $2 = ($h{$2}, $3)"
	}
  ' <delta.out

would show some anomalies in the get/set sizes?

---
diff --git a/pack-objects.h b/pack-objects.h
index 978500e474..77a6aae62b 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -327,7 +327,9 @@ static inline void oe_set_size(struct packing_data *pack,
 	}
 }
 
-static inline unsigned long oe_delta_size(struct packing_data *pack,
+static struct trace_key trace_delta = TRACE_KEY_INIT(DELTA);
+
+static inline unsigned long oe_delta_size_1(struct packing_data *pack,
 					  const struct object_entry *e)
 {
 	if (!pack->delta_size)
@@ -335,11 +337,23 @@ static inline unsigned long oe_delta_size(struct packing_data *pack,
 	return pack->delta_size[e - pack->objects];
 }
 
+static inline unsigned long oe_delta_size(struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	unsigned long ret = oe_delta_size_1(pack, e);
+	trace_printf_key(&trace_delta, "get %s %lu",
+			 oid_to_hex(&e->idx.oid), ret);
+	return ret;
+}
+
 void oe_prepare_delta_size_array(struct packing_data *pack);
 static inline void oe_set_delta_size(struct packing_data *pack,
 				     struct object_entry *e,
 				     unsigned long size)
 {
+	trace_printf_key(&trace_delta, "put %s %lu",
+			 oid_to_hex(&e->idx.oid), size);
+
 	e->delta_size_ = size;
 	if (!pack->delta_size && e->delta_size_ == size)
 		return;

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 17:23         ` Jeff King
@ 2018-07-19 17:31           ` Duy Nguyen
  2018-07-19 18:24             ` Duy Nguyen
  0 siblings, 1 reply; 44+ messages in thread
From: Duy Nguyen @ 2018-07-19 17:31 UTC (permalink / raw)
  To: Jeff King; +Cc: Elijah Newren, Git Mailing List

On Thu, Jul 19, 2018 at 01:23:58PM -0400, Jeff King wrote:
> On Thu, Jul 19, 2018 at 09:42:00AM -0700, Elijah Newren wrote:
> 
> > Thanks for the quick turnaround.  Unfortunately, I have some bad news.
> > With this patch, I get the following:
> > 
> > $ /usr/bin/time -f 'MaxRSS:%M Time:%e' git gc --aggressive
> > Enumerating objects: 4460703, done.
> > Counting objects: 100% (4460703/4460703), done.
> > Delta compression using up to 40 threads.
> > Compressing objects: 100% (3807140/3807140), done.
> > Writing objects: 100% (4460703/4460703), done.
> > Total 4460703 (delta 2831383), reused 1587071 (delta 0)
> > error: failed to unpack compressed delta at offset 183854150 from
> > .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack
> > fatal: packed object 20ce811e53dabbb8ef9368c108cbbdfa65639c03 (stored
> > in .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack)
> > is corrupt
> > error: failed to run prune
> > MaxRSS:40025196 Time:2531.52
> 
> Looking at that output, my _guess_ is that we somehow end up with a
> bogus delta_size value and write out a truncated entry. But I couldn't
> reproduce the issue with smaller test cases.

Could it be a race condition? I ran the whole test suite with this
fallback code activated (by forcing the delta size limit down to two
bytes) and nothing failed. Perhaps something like this on top? I'll
need to see if helgrind could spot anything...

-- 8< --
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7f3546057d..1eccbc91d2 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2054,7 +2054,16 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	}
 
 	SET_DELTA(trg_entry, src_entry);
+
+	/*
+	 * Locking is needed because SET_DELTA_SIZE() internally call
+	 * oe_prepare_delta_size_array() which may touch other entries,
+	 * which are updated in parallel.
+	 */
+	cache_lock();
 	SET_DELTA_SIZE(trg_entry, delta_size);
+	cache_unlock();
+
 	trg->depth = src->depth + 1;
 
 	return 1;
diff --git a/pack-objects.c b/pack-objects.c
index 89699cf15c..9e52af32c3 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -185,13 +185,16 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 void oe_prepare_delta_size_array(struct packing_data *pack)
 {
 	uint32_t i;
+	uint32_t *delta_size;
 
 	/*
 	 * nr_alloc, not nr_objects to align with realloc() strategy in
 	 * packlist_alloc()
 	 */
-	ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
+	ALLOC_ARRAY(delta_size, pack->nr_alloc);
 
 	for (i = 0; i < pack->nr_objects; i++)
-		pack->delta_size[i] = pack->objects[i].delta_size_;
+		delta_size[i] = pack->objects[i].delta_size_;
+
+	pack->delta_size = delta_size;
 }
-- 8< --

Elijah, another thing you could try (if you have plenty of time to
spare) is run this repack with a single thread. It's going to take
forever though...

--
Duy

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 17:31           ` Duy Nguyen
@ 2018-07-19 18:24             ` Duy Nguyen
  2018-07-19 19:17               ` Jeff King
  2018-07-19 23:11               ` Elijah Newren
  0 siblings, 2 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-19 18:24 UTC (permalink / raw)
  To: Jeff King; +Cc: Elijah Newren, Git Mailing List

On Thu, Jul 19, 2018 at 07:31:35PM +0200, Duy Nguyen wrote:
> On Thu, Jul 19, 2018 at 01:23:58PM -0400, Jeff King wrote:
> > On Thu, Jul 19, 2018 at 09:42:00AM -0700, Elijah Newren wrote:
> > 
> > > Thanks for the quick turnaround.  Unfortunately, I have some bad news.
> > > With this patch, I get the following:
> > > 
> > > $ /usr/bin/time -f 'MaxRSS:%M Time:%e' git gc --aggressive
> > > Enumerating objects: 4460703, done.
> > > Counting objects: 100% (4460703/4460703), done.
> > > Delta compression using up to 40 threads.
> > > Compressing objects: 100% (3807140/3807140), done.
> > > Writing objects: 100% (4460703/4460703), done.
> > > Total 4460703 (delta 2831383), reused 1587071 (delta 0)
> > > error: failed to unpack compressed delta at offset 183854150 from
> > > .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack
> > > fatal: packed object 20ce811e53dabbb8ef9368c108cbbdfa65639c03 (stored
> > > in .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack)
> > > is corrupt
> > > error: failed to run prune
> > > MaxRSS:40025196 Time:2531.52
> > 
> > Looking at that output, my _guess_ is that we somehow end up with a
> > bogus delta_size value and write out a truncated entry. But I couldn't
> > reproduce the issue with smaller test cases.
> 
> Could it be a race condition?

I'm convinced my code is racy (between two writes). I created a broken
pack once with 32 threads. Elijah please try again with this new
patch. It should fix this (I only tried repack a few times so far but
will continue)

The race is this

1. Thread one sees a large delta size and NULL delta_size[] array,
   allocates the new array and in the middle of copying old delta
   sizes over.

2. Thread two wants to write a new (large) delta size. It sees that
   delta_size[] is already allocated, it writes the correct size there
   (and truncated one in object_entry->delta_size_)

3. Back to thread one, it now copies the truncated value in
   delta_size_ from step 2 to delta_size[] array, overwriting the good
   value that thread two wrote.

There is also a potential read/write race where a read from
pack_size[] happens when the array is not ready. But I don't think it
can happen with current try_delta() code. I protect it anyway to be
safe.

-- 8< --
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ebc8cefb53..d67997f11c 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -32,6 +32,12 @@
 #include "object-store.h"
 #include "dir.h"
 
+static unsigned long oe_delta_size(struct packing_data *pack,
+				   const struct object_entry *e);
+static void oe_set_delta_size(struct packing_data *pack,
+			      struct object_entry *e,
+			      unsigned long size);
+
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
 #define SIZE(obj) oe_size(&to_pack, obj)
 #define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
@@ -1915,6 +1921,51 @@ unsigned long oe_get_size_slow(struct packing_data *pack,
 	return size;
 }
 
+static unsigned long oe_delta_size(struct packing_data *pack,
+				   const struct object_entry *e)
+{
+	unsigned long size;
+
+	read_lock();	 /* to protect access to pack->delta_size[] */
+	if (pack->delta_size)
+		size = pack->delta_size[e - pack->objects];
+	else
+		size = e->delta_size_;
+	read_unlock();
+	return size;
+}
+
+static void oe_set_delta_size(struct packing_data *pack,
+			      struct object_entry *e,
+			      unsigned long size)
+{
+	read_lock();	 /* to protect access to pack->delta_size[] */
+	if (!pack->delta_size && size < pack->oe_delta_size_limit) {
+		e->delta_size_ = size;
+		read_unlock();
+		return;
+	}
+	/*
+	 * We have had at least one delta size exceeding OE_DELTA_SIZE_BITS
+	 * limit. delta_size_ will not be used anymore. All delta sizes are now
+	 * from the delta_size[] array.
+	 */
+	if (!pack->delta_size) {
+		uint32_t i;
+
+		/*
+		 * nr_alloc, not nr_objects to align with realloc() strategy in
+		 * packlist_alloc()
+		 */
+		ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
+
+		for (i = 0; i < pack->nr_objects; i++)
+			pack->delta_size[i] = pack->objects[i].delta_size_;
+	}
+	pack->delta_size[e - pack->objects] = size;
+	read_unlock();
+}
+
 static int try_delta(struct unpacked *trg, struct unpacked *src,
 		     unsigned max_depth, unsigned long *mem_usage)
 {
@@ -2023,10 +2074,6 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
-		free(delta_buf);
-		return 0;
-	}
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh
index 4b04c75b7f..2a5bff4a1c 100755
--- a/ci/run-build-and-tests.sh
+++ b/ci/run-build-and-tests.sh
@@ -14,6 +14,7 @@ then
 	export GIT_TEST_SPLIT_INDEX=yes
 	export GIT_TEST_FULL_IN_PACK_ARRAY=true
 	export GIT_TEST_OE_SIZE=10
+	export GIT_TEST_OE_DELTA_SIZE=5
 	make --quiet test
 fi
 
diff --git a/pack-objects.c b/pack-objects.c
index 92708522e7..e3c32bbfc2 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -146,6 +146,8 @@ void prepare_packing_data(struct packing_data *pdata)
 
 	pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE",
 					     1U << OE_SIZE_BITS);
+	pdata->oe_delta_size_limit = git_env_ulong("GIT_TEST_OE_DELTA_SIZE",
+						   1U << OE_DELTA_SIZE_BITS);
 }
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -160,6 +162,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 
 		if (!pdata->in_pack_by_idx)
 			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
+		if (pdata->delta_size)
+			REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
 	}
 
 	new_entry = pdata->objects + pdata->nr_objects++;
diff --git a/pack-objects.h b/pack-objects.h
index edf74dabdd..7477c7b919 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -14,7 +14,7 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	20
+#define OE_DELTA_SIZE_BITS	20
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -93,12 +93,12 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
-	unsigned delta_size_valid:1;
+	unsigned char in_pack_header_size;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_valid:1;
-	unsigned type_:TYPE_BITS;
 	unsigned no_try_delta:1;
+	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -108,17 +108,16 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-	unsigned char in_pack_header_size;
 	unsigned depth:OE_DEPTH_BITS;
 
 	/*
 	 * pahole results on 64-bit linux (gcc and clang)
 	 *
-	 *   size: 80, bit_padding: 20 bits, holes: 8 bits
+	 *   size: 80, bit_padding: 9 bits
 	 *
 	 * and on 32-bit (gcc)
 	 *
-	 *   size: 76, bit_padding: 20 bits, holes: 8 bits
+	 *   size: 76, bit_padding: 9 bits
 	 */
 };
 
@@ -130,6 +129,7 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	uint32_t *delta_size;
 
 	/*
 	 * Only one of these can be non-NULL and they have different
@@ -141,6 +141,7 @@ struct packing_data {
 	struct packed_git **in_pack;
 
 	uintmax_t oe_size_limit;
+	uintmax_t oe_delta_size_limit;
 };
 
 void prepare_packing_data(struct packing_data *pdata);
@@ -327,23 +328,4 @@ static inline void oe_set_size(struct packing_data *pack,
 	}
 }
 
-static inline unsigned long oe_delta_size(struct packing_data *pack,
-					  const struct object_entry *e)
-{
-	if (e->delta_size_valid)
-		return e->delta_size_;
-	return oe_size(pack, e);
-}
-
-static inline void oe_set_delta_size(struct packing_data *pack,
-				     struct object_entry *e,
-				     unsigned long size)
-{
-	e->delta_size_ = size;
-	e->delta_size_valid = e->delta_size_ == size;
-	if (!e->delta_size_valid && size != oe_size(pack, e))
-		BUG("this can only happen in check_object() "
-		    "where delta size is the same as entry size");
-}
-
 #endif
diff --git a/t/README b/t/README
index 8373a27fea..9028b47d92 100644
--- a/t/README
+++ b/t/README
@@ -315,6 +315,10 @@ packs on demand. This normally only happens when the object size is
 over 2GB. This variable forces the code path on any object larger than
 <n> bytes.
 
+GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
+path where deltas larger than this limit require extra memory
+allocation for bookkeeping.
+
 Naming Tests
 ------------
 
-- 8< --

--
Duy

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 18:24             ` Duy Nguyen
@ 2018-07-19 19:17               ` Jeff King
  2018-07-19 23:11               ` Elijah Newren
  1 sibling, 0 replies; 44+ messages in thread
From: Jeff King @ 2018-07-19 19:17 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Elijah Newren, Git Mailing List

On Thu, Jul 19, 2018 at 08:24:42PM +0200, Duy Nguyen wrote:

> > > Looking at that output, my _guess_ is that we somehow end up with a
> > > bogus delta_size value and write out a truncated entry. But I couldn't
> > > reproduce the issue with smaller test cases.
> > 
> > Could it be a race condition?
> 
> I'm convinced my code is racy (between two writes). I created a broken
> pack once with 32 threads. Elijah please try again with this new
> patch. It should fix this (I only tried repack a few times so far but
> will continue)

Good thinking, it's definitely racy. And that's why my tiny reproduction
didn't work. I even tried bumping it up to 10 blobs instead of 2, but
that's not nearly enough.

> The race is this
> 
> 1. Thread one sees a large delta size and NULL delta_size[] array,
>    allocates the new array and in the middle of copying old delta
>    sizes over.
> 
> 2. Thread two wants to write a new (large) delta size. It sees that
>    delta_size[] is already allocated, it writes the correct size there
>    (and truncated one in object_entry->delta_size_)
> 
> 3. Back to thread one, it now copies the truncated value in
>    delta_size_ from step 2 to delta_size[] array, overwriting the good
>    value that thread two wrote.

Right. Or we could even allocate two delta_size arrays, since the
NULL-check and the allocation are not atomic.

> There is also a potential read/write race where a read from
> pack_size[] happens when the array is not ready. But I don't think it
> can happen with current try_delta() code. I protect it anyway to be
> safe.

Hrm. That one's disappointing, because we read much more often than we
write, and this introduces potential lock contention. It may not matter
much in practice, though.

> +static unsigned long oe_delta_size(struct packing_data *pack,
> +				   const struct object_entry *e)
> +{
> +	unsigned long size;
> +
> +	read_lock();	 /* to protect access to pack->delta_size[] */
> +	if (pack->delta_size)
> +		size = pack->delta_size[e - pack->objects];
> +	else
> +		size = e->delta_size_;
> +	read_unlock();
> +	return size;
> +}

Yuck, we even have to pay the read_lock() cost when we don't overflow
into the pack->delta_size array (but I agree we have to for
correctness).

Again, though, this amount of contention probably doesn't make a big
difference, since we're holding the lock for such a short time
(especially compared to all the work of computing the deltas).

This could be separate from the read_lock(), though, since that one does
block for much longer (e.g., while zlib inflating objects from disk).

> +static void oe_set_delta_size(struct packing_data *pack,
> +			      struct object_entry *e,
> +			      unsigned long size)
> +{
> +	read_lock();	 /* to protect access to pack->delta_size[] */
> +	if (!pack->delta_size && size < pack->oe_delta_size_limit) {
> +		e->delta_size_ = size;
> +		read_unlock();
> +		return;
> +	}

And ditto for this one. I thought we could get away with the "fast case"
skipping the lock, but we have to check pack->delta_size atomically to
even use it.

If each individual delta_size had an overflow bit that indicates "use me
literally" or "look me up in the array", then I think the "quick" ones
could avoid locking. It may not be worth the complexity though.

> @@ -160,6 +162,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
>  
>  		if (!pdata->in_pack_by_idx)
>  			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
> +		if (pdata->delta_size)
> +			REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
>  	}
>  

This realloc needs to happen under the lock, too, I think. It would be
OK without locking for an in-place realloc, but if the chunk has to be
moved, somebody in oe_set_delta_size() might write to the old memory.

This is in a file that doesn't even know about read_lock(), of course.
Probably you need a delta mutex as part of the "struct packing_data",
and then it can just be handled inside pack-objects.c entirely.

-Peff

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 15:16     ` Duy Nguyen
  2018-07-19 16:42       ` Elijah Newren
  2018-07-19 17:04       ` Jeff King
@ 2018-07-19 19:25       ` Junio C Hamano
  2018-07-19 19:27         ` Junio C Hamano
  2 siblings, 1 reply; 44+ messages in thread
From: Junio C Hamano @ 2018-07-19 19:25 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Elijah Newren, Git Mailing List, Jeff King

Duy Nguyen <pclouds@gmail.com> writes:

> @@ -330,20 +330,27 @@ static inline void oe_set_size(struct packing_data *pack,
>  static inline unsigned long oe_delta_size(struct packing_data *pack,
>  					  const struct object_entry *e)
>  {
> -	if (e->delta_size_valid)
> +	if (!pack->delta_size)
>  		return e->delta_size_;
> -	return oe_size(pack, e);
> +	return pack->delta_size[e - pack->objects];
>  }
>  
> +void oe_prepare_delta_size_array(struct packing_data *pack);
>  static inline void oe_set_delta_size(struct packing_data *pack,
>  				     struct object_entry *e,
>  				     unsigned long size)
>  {
>  	e->delta_size_ = size;
> -	e->delta_size_valid = e->delta_size_ == size;
> -	if (!e->delta_size_valid && size != oe_size(pack, e))
> -		BUG("this can only happen in check_object() "
> -		    "where delta size is the same as entry size");
> +	if (!pack->delta_size && e->delta_size_ == size)
> +		return;
> +	/*
> +	 * We have had at least one delta size exceeding OE_DELTA_SIZE_BITS
> +	 * limit. delta_size_ will not be used anymore. All delta sizes are now
> +	 * from the delta_size[] array.
> +	 */
> +	if (!pack->delta_size)
> +		oe_prepare_delta_size_array(pack);
> +	pack->delta_size[e - pack->objects] = size;
>  }

Imagine that you create a delta and set its size (which happens to
fit in the entry) and then create another whose size does not fit in
the entry.  How does oe_delta_size() know not to look at
pack->delta_size[] and instead look at e->delta_size_ when it wants
to know the delta for the first one?  IOW, shouldn't there be a
"backfill" procedure when oe_set_delta_size() notices that we now
switch to pack->delta_size[] array?


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 19:25       ` Junio C Hamano
@ 2018-07-19 19:27         ` Junio C Hamano
  0 siblings, 0 replies; 44+ messages in thread
From: Junio C Hamano @ 2018-07-19 19:27 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Elijah Newren, Git Mailing List, Jeff King

Junio C Hamano <gitster@pobox.com> writes:

> Imagine that you create a delta and set its size (which happens to
> fit in the entry) and then create another whose size does not fit in
> the entry.  How does oe_delta_size() know not to look at
> pack->delta_size[] and instead look at e->delta_size_ when it wants
> to know the delta for the first one?  IOW, shouldn't there be a
> "backfill" procedure when oe_set_delta_size() notices that we now
> switch to pack->delta_size[] array?

Ah, ignore this.  That is what happens in the prepare_ thing.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 18:24             ` Duy Nguyen
  2018-07-19 19:17               ` Jeff King
@ 2018-07-19 23:11               ` Elijah Newren
  2018-07-20  5:28                 ` Jeff King
  1 sibling, 1 reply; 44+ messages in thread
From: Elijah Newren @ 2018-07-19 23:11 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Jeff King, Git Mailing List

On Thu, Jul 19, 2018 at 11:24 AM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Thu, Jul 19, 2018 at 07:31:35PM +0200, Duy Nguyen wrote:
>> On Thu, Jul 19, 2018 at 01:23:58PM -0400, Jeff King wrote:
>> > On Thu, Jul 19, 2018 at 09:42:00AM -0700, Elijah Newren wrote:
>> >
>> > > Thanks for the quick turnaround.  Unfortunately, I have some bad news.
>> > > With this patch, I get the following:
>> > >
>> > > $ /usr/bin/time -f 'MaxRSS:%M Time:%e' git gc --aggressive
>> > > Enumerating objects: 4460703, done.
>> > > Counting objects: 100% (4460703/4460703), done.
>> > > Delta compression using up to 40 threads.
>> > > Compressing objects: 100% (3807140/3807140), done.
>> > > Writing objects: 100% (4460703/4460703), done.
>> > > Total 4460703 (delta 2831383), reused 1587071 (delta 0)
>> > > error: failed to unpack compressed delta at offset 183854150 from
>> > > .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack
>> > > fatal: packed object 20ce811e53dabbb8ef9368c108cbbdfa65639c03 (stored
>> > > in .git/objects/pack/pack-30d4f0b0e5a03dc91a658a0586f4e74cdf4a94d6.pack)
>> > > is corrupt
>> > > error: failed to run prune
>> > > MaxRSS:40025196 Time:2531.52
>> >
>> > Looking at that output, my _guess_ is that we somehow end up with a
>> > bogus delta_size value and write out a truncated entry. But I couldn't
>> > reproduce the issue with smaller test cases.
>>
>> Could it be a race condition?
>
> I'm convinced my code is racy (between two writes). I created a broken
> pack once with 32 threads. Elijah please try again with this new
> patch. It should fix this (I only tried repack a few times so far but
> will continue)
>
> The race is this
>
> 1. Thread one sees a large delta size and NULL delta_size[] array,
>    allocates the new array and in the middle of copying old delta
>    sizes over.
>
> 2. Thread two wants to write a new (large) delta size. It sees that
>    delta_size[] is already allocated, it writes the correct size there
>    (and truncated one in object_entry->delta_size_)
>
> 3. Back to thread one, it now copies the truncated value in
>    delta_size_ from step 2 to delta_size[] array, overwriting the good
>    value that thread two wrote.
>
> There is also a potential read/write race where a read from
> pack_size[] happens when the array is not ready. But I don't think it
> can happen with current try_delta() code. I protect it anyway to be
> safe.

Looking at the output from Peff's instrumentation elsewhere in this
thread, I see a lot of lines like
   mismatched get: 32889efd307c7be376da9e3d45a78305f14ba73a = (, 28)
Does that mean it was reading the array when it wasn't ready?


Anyway, with your latest patch (which I'm labelling fix-v4), git gc
--aggressive completes, git fsck likes the result, and the new table
of stats on this repo becomes:

Version  Pack (MB)  MaxRSS(kB)  Time (s)
-------  ---------  ----------  --------
 2.17.0     5498     43513628    2494.85
 2.18.0    10531     40449596    4168.94
 fix-v1     5509     42509784    2480.74
 fiv-v2     5509     41644104    2468.25
 fiv-v4     5500     44400948    2761.74


So, the pack size is back to what is expected.  The code takes about
10% longer and requires 2% more memory than git-2.17.0, but the pack
size was the main issue.


However, it's interesting to also look at the effect on packing
linux.git (on the same beefy hardware):

Version  Pack (MB)  MaxRSS(kB)  Time (s)
-------  ---------  ----------  --------
 2.17.0     1279     11382932      632.24
 2.18.0     1279     10817568      621.97
 fiv-v4     1279     11484168     1193.67

While the pack size is nice and small, the original memory savings
added in 2.18.0 are gone and the performance is much worse.  :-(

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-19 23:11               ` Elijah Newren
@ 2018-07-20  5:28                 ` Jeff King
  2018-07-20  5:30                   ` Jeff King
                                     ` (2 more replies)
  0 siblings, 3 replies; 44+ messages in thread
From: Jeff King @ 2018-07-20  5:28 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Duy Nguyen, Git Mailing List

On Thu, Jul 19, 2018 at 04:11:01PM -0700, Elijah Newren wrote:

> Looking at the output from Peff's instrumentation elsewhere in this
> thread, I see a lot of lines like
>    mismatched get: 32889efd307c7be376da9e3d45a78305f14ba73a = (, 28)
> Does that mean it was reading the array when it wasn't ready?

Yes, it looks like we saw a "get" without a "set". Though this could
also be due to threading. The tracing isn't atomic with respect to the
actual get/set operation, so it's possible that the ordering of the
trace output does not match the ordering of the actual operations.

> However, it's interesting to also look at the effect on packing
> linux.git (on the same beefy hardware):
> 
> Version  Pack (MB)  MaxRSS(kB)  Time (s)
> -------  ---------  ----------  --------
>  2.17.0     1279     11382932      632.24
>  2.18.0     1279     10817568      621.97
>  fiv-v4     1279     11484168     1193.67
> 
> While the pack size is nice and small, the original memory savings
> added in 2.18.0 are gone and the performance is much worse.  :-(

Interesting. I can't reproduce here. The fix-v4 case is only slightly
slower than 2.18.0. Can you double check that your compiler flags, etc,
were the same? Many times I've accidentally compared -O0 to -O0. :)

You might also try the patch below (on top of fix-v4), which moves the
locking to its own dedicated mutex. That should reduce lock contention,
and it fixes the remaining realloc where I think we're still racy. On my
repack of linux.git, it dropped the runtime from 6m3s to 5m41s, almost
entirely in system CPU.

I didn't measure my max rss. However, I'd caution slightly against
drawing too much conclusion from it, for two reasons:

  1. RSS includes mmap'd packfiles, which is subject to whatever pages
     the OS feels like keeping in RAM. So using more heap can sometimes
     go unnoticed in that count, since you're just trading heap pages
     for mmap pages. Although that implies some memory pressure, and it
     sounds like your machine is sufficiently beefy to avoid that.

  2. Peak heap is going to depend on the threading. You have one thread
     per CPU working on a window of objects, each of which will be in
     memory at once. So I'd expect a fair bit of fluctuation in the peak
     just depending on how the threads line up with each other (some of
     it random, and some of it maybe impacted by what the code does, but
     in a way that just happens to fall out for this specific workload).

Which isn't to say measuring it is useless. The trends may override the
noise from those two things. I've just run into problems in the past
trying to get consistent measurements.

Here's the lock patch.

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ba14a1bfbc..b76ce04cb9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1926,12 +1926,12 @@ static unsigned long oe_delta_size(struct packing_data *pack,
 {
 	unsigned long size;
 
-	read_lock();	 /* to protect access to pack->delta_size[] */
+	pack_delta_lock(pack);
 	if (pack->delta_size)
 		size = pack->delta_size[e - pack->objects];
 	else
 		size = e->delta_size_;
-	read_unlock();
+	pack_delta_unlock(pack);
 	return size;
 }
 
@@ -1939,10 +1939,10 @@ static void oe_set_delta_size(struct packing_data *pack,
 			      struct object_entry *e,
 			      unsigned long size)
 {
-	read_lock();	 /* to protect access to pack->delta_size[] */
+	pack_delta_lock(pack);
 	if (!pack->delta_size && size < pack->oe_delta_size_limit) {
 		e->delta_size_ = size;
-		read_unlock();
+		pack_delta_unlock(pack);
 		return;
 	}
 	/*
@@ -1963,7 +1963,7 @@ static void oe_set_delta_size(struct packing_data *pack,
 			pack->delta_size[i] = pack->objects[i].delta_size_;
 	}
 	pack->delta_size[e - pack->objects] = size;
-	read_unlock();
+	pack_delta_unlock(pack);
 }
 
 static int try_delta(struct unpacked *trg, struct unpacked *src,
diff --git a/pack-objects.c b/pack-objects.c
index e3c32bbfc2..8d9c2dfb82 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -148,6 +148,10 @@ void prepare_packing_data(struct packing_data *pdata)
 					     1U << OE_SIZE_BITS);
 	pdata->oe_delta_size_limit = git_env_ulong("GIT_TEST_OE_DELTA_SIZE",
 						   1U << OE_DELTA_SIZE_BITS);
+
+#ifndef NO_PTHREADS
+	pthread_mutex_init(&pdata->delta_lock, NULL);
+#endif
 }
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -162,8 +166,10 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 
 		if (!pdata->in_pack_by_idx)
 			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
+		pack_delta_lock(pdata);
 		if (pdata->delta_size)
 			REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
+		pack_delta_unlock(pdata);
 	}
 
 	new_entry = pdata->objects + pdata->nr_objects++;
diff --git a/pack-objects.h b/pack-objects.h
index 730990c5d2..29b6006949 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -130,6 +130,9 @@ struct packing_data {
 
 	unsigned int *in_pack_pos;
 	uint32_t *delta_size;
+#ifndef NO_PTHREADS
+	pthread_mutex_t delta_lock;
+#endif
 
 	/*
 	 * Only one of these can be non-NULL and they have different
@@ -144,6 +147,14 @@ struct packing_data {
 	uintmax_t oe_delta_size_limit;
 };
 
+#ifndef NO_PTHREADS
+#define pack_delta_lock(pdata) pthread_mutex_lock(&pdata->delta_lock)
+#define pack_delta_unlock(pdata) pthread_mutex_unlock(&pdata->delta_lock)
+#else
+#define pack_delta_lock(pdata)
+#define pack_delta_unlock(pdata)
+#endif
+
 void prepare_packing_data(struct packing_data *pdata);
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-20  5:28                 ` Jeff King
@ 2018-07-20  5:30                   ` Jeff King
  2018-07-20  5:47                   ` Duy Nguyen
  2018-07-20 17:21                   ` Elijah Newren
  2 siblings, 0 replies; 44+ messages in thread
From: Jeff King @ 2018-07-20  5:30 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Duy Nguyen, Git Mailing List

On Fri, Jul 20, 2018 at 01:28:29AM -0400, Jeff King wrote:

> @@ -162,8 +166,10 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
>  
>  		if (!pdata->in_pack_by_idx)
>  			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
> +		pack_delta_lock(pdata);
>  		if (pdata->delta_size)
>  			REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
> +		pack_delta_unlock(pdata);
>  	}

This made me wonder if in_pack_by_idx needs a similar lock. But
actually, I don't think either is necessary. We only allocate new
entries during the counting phase, not during the threaded delta search.

So this hunk could be dropped (and the system CPU benefit I saw is just
from reduced lock contention).

-Peff

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-20  5:28                 ` Jeff King
  2018-07-20  5:30                   ` Jeff King
@ 2018-07-20  5:47                   ` Duy Nguyen
  2018-07-20 17:21                   ` Elijah Newren
  2 siblings, 0 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-20  5:47 UTC (permalink / raw)
  To: Jeff King; +Cc: Elijah Newren, Git Mailing List

On Fri, Jul 20, 2018 at 7:28 AM Jeff King <peff@peff.net> wrote:
>
> On Thu, Jul 19, 2018 at 04:11:01PM -0700, Elijah Newren wrote:
>
> > Looking at the output from Peff's instrumentation elsewhere in this
> > thread, I see a lot of lines like
> >    mismatched get: 32889efd307c7be376da9e3d45a78305f14ba73a = (, 28)
> > Does that mean it was reading the array when it wasn't ready?
>
> Yes, it looks like we saw a "get" without a "set". Though this could
> also be due to threading. The tracing isn't atomic with respect to the
> actual get/set operation, so it's possible that the ordering of the
> trace output does not match the ordering of the actual operations.
>
> > However, it's interesting to also look at the effect on packing
> > linux.git (on the same beefy hardware):
> >
> > Version  Pack (MB)  MaxRSS(kB)  Time (s)
> > -------  ---------  ----------  --------
> >  2.17.0     1279     11382932      632.24
> >  2.18.0     1279     10817568      621.97
> >  fiv-v4     1279     11484168     1193.67
> >
> > While the pack size is nice and small, the original memory savings
> > added in 2.18.0 are gone and the performance is much worse.  :-(
>
> Interesting. I can't reproduce here. The fix-v4 case is only slightly
> slower than 2.18.0. Can you double check that your compiler flags, etc,
> were the same? Many times I've accidentally compared -O0 to -O0. :)

He ran 40 threads though. That number of threads can make lock
contention very expensive. Yeah my money is also on lock contention.

> You might also try the patch below (on top of fix-v4), which moves the

Another thing Elijah could try is watch CPU utilization. If this is
lock contention, I think core utilization should be much lower because
we spend more time waiting than actually doing things.

> locking to its own dedicated mutex. That should reduce lock contention,

I think we could use cache_lock() which is for non-odb shared data
(and delta_size[] fits this category)

> and it fixes the remaining realloc where I think we're still racy. On my

Yeah it's not truly racy as you also noted in another mail. I'll make
a note about this in the commit message.

> repack of linux.git, it dropped the runtime from 6m3s to 5m41s, almost
> entirely in system CPU.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-18 22:51 2.18.0 Regression: packing performance and effectiveness Elijah Newren
                   ` (3 preceding siblings ...)
  2018-07-19  5:44 ` Jeff King
@ 2018-07-20 15:39 ` Nguyễn Thái Ngọc Duy
  2018-07-20 17:40   ` Jeff King
                     ` (2 more replies)
  4 siblings, 3 replies; 44+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-07-20 15:39 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Elijah Newren, Jeff King,
	Nguyễn Thái Ngọc Duy

Let's start with some background about oe_delta_size() and
oe_set_delta_size(). If you already know, skip the next paragraph.

These two are added in 0aca34e826 (pack-objects: shrink delta_size
field in struct object_entry - 2018-04-14) to help reduce 'struct
object_entry' size. The delta size field in this struct is reduced to
only contain max 2MB. So if any new delta is produced and larger than
2MB, it's dropped because we can't really save such a large size
anywhere. Fallback is provided in case existingpackfiles already have
large deltas, then we can retrieve it from the pack.

While this should help small machines repacking large repos (i.e. less
memory pressure), dropping large deltas during the delta selection
process could end up with worse pack files. And if existing packfiles
already have >2MB delta and pack-objects is instructed to not reuse
deltas, all of them will be dropped on the floor, and the resulting
pack would be definitely bigger.

There is also a regression in terms of CPU/IO if we have large on-disk
deltas because fallback code needs to parse the pack every time the
delta size is needed and just access to the mmap'd pack data is enough
for extra page faults when memory is under pressure.

Both of these issues were reported on the mailing list. Here's some
numbers for comparison.

    Version  Pack (MB)  MaxRSS(kB)  Time (s)
    -------  ---------  ----------  --------
     2.17.0     5498     43513628    2494.85
     2.18.0    10531     40449596    4168.94

This patch provides a better fallback that is

- cheaper in terms of cpu and io because we won't have to read
  existing pack files as much

- better in terms of pack size because the pack heuristics is back to
  2.17.0 time, we do not drop large deltas at all

If we encounter any delta (on-disk or created during try_delta phase)
that is larger than the 2MB limit, we stop using delta_size_ field for
this because it can't contain such size anyway. A new array of delta
size is dynamically allocated and can hold all the deltas that 2.17.0
can [1]. All current delta sizes are migrated over.

With this, we do not have to drop deltas in try_delta() anymore. Of
course the downside is we use slightly more memory, even compared to
2.17.0. But since this is considered an uncommon case, a bit more
memory consumption should not be a problem.

Delta size limit is also raised from 2MB to 32 MB to better cover
common case and avoid that extra memory consumption (99.999% deltas in
this reported repo are under 12MB). Other fields are shuffled around
to keep this struct packed tight. We don't use more memory in common
case even with this limit update.

A note about thread synchronization. Since this code can be run in
parallel during delta searching phase, we need a mutex. The realloc
part in packlist_alloc() is not protected because it only happens
during the object counting phase, which is always single-threaded.

The locking in oe_delta_size() could also be dropped if we make sure
the pack->delta_size pointer assignment in oe_set_delta_size() is
atomic. But let's keep the lock for now to be on the safe side. Lock
contention should not be that high for this lock.

[1] With a small tweak. 2.17.0 on 64-bit linux can hold 2^64 byte
    deltas, which is absolutely insane. But windows builds, even
    64-bit version, only hold 2^32. So reducing it to 2^32 should be
    quite safe.

Reported-by: Elijah Newren <newren@gmail.com>
Helped-by: Elijah Newren <newren@gmail.com>
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 I'm optimistic that the slowness on linux repo is lock contention
 (because if it's not then I have no clue what is). So let's start
 doing proper patches.

 If we want a quick fix for 2.18.1. I suggest bumping up
 OE_DELTA_SIZE_BITS like Elijah did in his second option. I don't
 think it's safe to fast track this patch.

 builtin/pack-objects.c    |  6 ++--
 ci/run-build-and-tests.sh |  1 +
 pack-objects.c            | 21 ++++++++++++
 pack-objects.h            | 68 +++++++++++++++++++++++++++++++--------
 t/README                  |  4 +++
 5 files changed, 82 insertions(+), 18 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ebc8cefb53..3bc182b1b6 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2023,10 +2023,6 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
-		free(delta_buf);
-		return 0;
-	}
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
@@ -2278,6 +2274,8 @@ static void init_threaded_search(void)
 	pthread_mutex_init(&cache_mutex, NULL);
 	pthread_mutex_init(&progress_mutex, NULL);
 	pthread_cond_init(&progress_cond, NULL);
+	pthread_mutex_init(&to_pack.lock, NULL);
+	to_pack.lock_initialized = 1;
 	old_try_to_free_routine = set_try_to_free_routine(try_to_free_from_threads);
 }
 
diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh
index 4b04c75b7f..2a5bff4a1c 100755
--- a/ci/run-build-and-tests.sh
+++ b/ci/run-build-and-tests.sh
@@ -14,6 +14,7 @@ then
 	export GIT_TEST_SPLIT_INDEX=yes
 	export GIT_TEST_FULL_IN_PACK_ARRAY=true
 	export GIT_TEST_OE_SIZE=10
+	export GIT_TEST_OE_DELTA_SIZE=5
 	make --quiet test
 fi
 
diff --git a/pack-objects.c b/pack-objects.c
index 92708522e7..eef344b7c1 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -88,6 +88,23 @@ struct object_entry *packlist_find(struct packing_data *pdata,
 	return &pdata->objects[pdata->index[i] - 1];
 }
 
+uint32_t *new_delta_size_array(struct packing_data *pack)
+{
+	uint32_t *delta_size;
+	uint32_t i;
+
+	/*
+	 * nr_alloc, not nr_objects to align with realloc() strategy in
+	 * packlist_alloc()
+	 */
+	ALLOC_ARRAY(delta_size, pack->nr_alloc);
+
+	for (i = 0; i < pack->nr_objects; i++)
+		delta_size[i] = pack->objects[i].delta_size_;
+
+	return delta_size;
+}
+
 static void prepare_in_pack_by_idx(struct packing_data *pdata)
 {
 	struct packed_git **mapping, *p;
@@ -146,6 +163,8 @@ void prepare_packing_data(struct packing_data *pdata)
 
 	pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE",
 					     1U << OE_SIZE_BITS);
+	pdata->oe_delta_size_limit = git_env_ulong("GIT_TEST_OE_DELTA_SIZE",
+						   1U << OE_DELTA_SIZE_BITS);
 }
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -160,6 +179,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 
 		if (!pdata->in_pack_by_idx)
 			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
+		if (pdata->delta_size)
+			REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
 	}
 
 	new_entry = pdata->objects + pdata->nr_objects++;
diff --git a/pack-objects.h b/pack-objects.h
index edf74dabdd..9f977ae800 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #include "object-store.h"
+#include "thread-utils.h"
 
 #define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024)
 
@@ -14,7 +15,7 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	20
+#define OE_DELTA_SIZE_BITS	24
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -93,12 +94,12 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
-	unsigned delta_size_valid:1;
+	unsigned char in_pack_header_size;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_valid:1;
-	unsigned type_:TYPE_BITS;
 	unsigned no_try_delta:1;
+	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -108,17 +109,16 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-	unsigned char in_pack_header_size;
 	unsigned depth:OE_DEPTH_BITS;
 
 	/*
 	 * pahole results on 64-bit linux (gcc and clang)
 	 *
-	 *   size: 80, bit_padding: 20 bits, holes: 8 bits
+	 *   size: 80, bit_padding: 9 bits
 	 *
 	 * and on 32-bit (gcc)
 	 *
-	 *   size: 76, bit_padding: 20 bits, holes: 8 bits
+	 *   size: 76, bit_padding: 9 bits
 	 */
 };
 
@@ -130,6 +130,7 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	uint32_t *delta_size;
 
 	/*
 	 * Only one of these can be non-NULL and they have different
@@ -140,10 +141,32 @@ struct packing_data {
 	struct packed_git **in_pack_by_idx;
 	struct packed_git **in_pack;
 
+#ifndef NO_PTHREADS
+	int lock_initialized;
+	pthread_mutex_t lock;
+#endif
+
 	uintmax_t oe_size_limit;
+	uintmax_t oe_delta_size_limit;
 };
 
 void prepare_packing_data(struct packing_data *pdata);
+
+static inline void packing_data_lock(struct packing_data *pdata)
+{
+#ifndef NO_PTHREADS
+	if (pdata->lock_initialized)
+		pthread_mutex_lock(&pdata->lock);
+#endif
+}
+static inline void packing_data_unlock(struct packing_data *pdata)
+{
+#ifndef NO_PTHREADS
+	if (pdata->lock_initialized)
+		pthread_mutex_unlock(&pdata->lock);
+#endif
+}
+
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,
 				    uint32_t index_pos);
@@ -330,20 +353,37 @@ static inline void oe_set_size(struct packing_data *pack,
 static inline unsigned long oe_delta_size(struct packing_data *pack,
 					  const struct object_entry *e)
 {
-	if (e->delta_size_valid)
-		return e->delta_size_;
-	return oe_size(pack, e);
+	unsigned long size;
+
+	packing_data_lock(pack);
+	if (pack->delta_size)
+		size = pack->delta_size[e - pack->objects];
+	else
+		size = e->delta_size_;
+	packing_data_unlock(pack);
+	return size;
 }
 
+uint32_t *new_delta_size_array(struct packing_data *pdata);
 static inline void oe_set_delta_size(struct packing_data *pack,
 				     struct object_entry *e,
 				     unsigned long size)
 {
-	e->delta_size_ = size;
-	e->delta_size_valid = e->delta_size_ == size;
-	if (!e->delta_size_valid && size != oe_size(pack, e))
-		BUG("this can only happen in check_object() "
-		    "where delta size is the same as entry size");
+	packing_data_lock(pack);
+	if (!pack->delta_size && size < pack->oe_delta_size_limit) {
+		packing_data_unlock(pack);
+		e->delta_size_ = size;
+		return;
+	}
+	/*
+	 * We have had at least one delta size exceeding OE_DELTA_SIZE_BITS
+	 * limit. delta_size_ will not be used anymore. All delta sizes are
+	 * now from the delta_size[] array.
+	 */
+	if (!pack->delta_size)
+		pack->delta_size = new_delta_size_array(pack);
+	pack->delta_size[e - pack->objects] = size;
+	packing_data_unlock(pack);
 }
 
 #endif
diff --git a/t/README b/t/README
index 8373a27fea..9028b47d92 100644
--- a/t/README
+++ b/t/README
@@ -315,6 +315,10 @@ packs on demand. This normally only happens when the object size is
 over 2GB. This variable forces the code path on any object larger than
 <n> bytes.
 
+GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
+path where deltas larger than this limit require extra memory
+allocation for bookkeeping.
+
 Naming Tests
 ------------
 
-- 
2.18.0.rc2.476.g39500d3211


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: 2.18.0 Regression: packing performance and effectiveness
  2018-07-20  5:28                 ` Jeff King
  2018-07-20  5:30                   ` Jeff King
  2018-07-20  5:47                   ` Duy Nguyen
@ 2018-07-20 17:21                   ` Elijah Newren
  2 siblings, 0 replies; 44+ messages in thread
From: Elijah Newren @ 2018-07-20 17:21 UTC (permalink / raw)
  To: Jeff King; +Cc: Duy Nguyen, Git Mailing List

On Thu, Jul 19, 2018 at 10:28 PM, Jeff King <peff@peff.net> wrote:
> On Thu, Jul 19, 2018 at 04:11:01PM -0700, Elijah Newren wrote:
>
>> Looking at the output from Peff's instrumentation elsewhere in this
>> thread, I see a lot of lines like
>>    mismatched get: 32889efd307c7be376da9e3d45a78305f14ba73a = (, 28)
>> Does that mean it was reading the array when it wasn't ready?
>
> Yes, it looks like we saw a "get" without a "set". Though this could
> also be due to threading. The tracing isn't atomic with respect to the
> actual get/set operation, so it's possible that the ordering of the
> trace output does not match the ordering of the actual operations.
>
>> However, it's interesting to also look at the effect on packing
>> linux.git (on the same beefy hardware):
>>
>> Version  Pack (MB)  MaxRSS(kB)  Time (s)
>> -------  ---------  ----------  --------
>>  2.17.0     1279     11382932      632.24
>>  2.18.0     1279     10817568      621.97
>>  fiv-v4     1279     11484168     1193.67
>>
>> While the pack size is nice and small, the original memory savings
>> added in 2.18.0 are gone and the performance is much worse.  :-(
>
> Interesting. I can't reproduce here. The fix-v4 case is only slightly
> slower than 2.18.0. Can you double check that your compiler flags, etc,
> were the same? Many times I've accidentally compared -O0 to -O0. :)

Same build flags each time, but as Duy points out, this was on a
40-processor box.

> You might also try the patch below (on top of fix-v4), which moves the
> locking to its own dedicated mutex. That should reduce lock contention,
> and it fixes the remaining realloc where I think we're still racy. On my
> repack of linux.git, it dropped the runtime from 6m3s to 5m41s, almost
> entirely in system CPU.

I had to add an include of pthread.h to get it to compile, but
otherwise I'm trying it out.  Re-running a few times with each version
to see how big the run-to-run variance is.

> I didn't measure my max rss. However, I'd caution slightly against
> drawing too much conclusion from it, for two reasons:
>
>   1. RSS includes mmap'd packfiles, which is subject to whatever pages
>      the OS feels like keeping in RAM. So using more heap can sometimes
>      go unnoticed in that count, since you're just trading heap pages
>      for mmap pages. Although that implies some memory pressure, and it
>      sounds like your machine is sufficiently beefy to avoid that.
>
>   2. Peak heap is going to depend on the threading. You have one thread
>      per CPU working on a window of objects, each of which will be in
>      memory at once. So I'd expect a fair bit of fluctuation in the peak
>      just depending on how the threads line up with each other (some of
>      it random, and some of it maybe impacted by what the code does, but
>      in a way that just happens to fall out for this specific workload).
>
> Which isn't to say measuring it is useless. The trends may override the
> noise from those two things. I've just run into problems in the past
> trying to get consistent measurements.

Good to know.  I was actually measuring it due to commit 3b13a5f26387
("pack-objects: reorder members to shrink struct object_entry",
2018-04-14) referencing
https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/ as
being the measurement methodology to back up the results of the
original purpose of the change.  (I did miss out on the "run 3 times,
report the lowest value" that Ævar did, though.)  I figured that as
long as we were further tweaking things along the lines of that patch
series, it made sense to try to do similar measurements to see if we
were at least improving things relative to 2.17.0.

> Here's the lock patch.
>
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index ba14a1bfbc..b76ce04cb9 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -1926,12 +1926,12 @@ static unsigned long oe_delta_size(struct packing_data *pack,
>  {
>         unsigned long size;
>
> -       read_lock();     /* to protect access to pack->delta_size[] */
> +       pack_delta_lock(pack);
>         if (pack->delta_size)
>                 size = pack->delta_size[e - pack->objects];
>         else
>                 size = e->delta_size_;
> -       read_unlock();
> +       pack_delta_unlock(pack);
>         return size;
>  }
>
> @@ -1939,10 +1939,10 @@ static void oe_set_delta_size(struct packing_data *pack,
>                               struct object_entry *e,
>                               unsigned long size)
>  {
> -       read_lock();     /* to protect access to pack->delta_size[] */
> +       pack_delta_lock(pack);
>         if (!pack->delta_size && size < pack->oe_delta_size_limit) {
>                 e->delta_size_ = size;
> -               read_unlock();
> +               pack_delta_unlock(pack);
>                 return;
>         }
>         /*
> @@ -1963,7 +1963,7 @@ static void oe_set_delta_size(struct packing_data *pack,
>                         pack->delta_size[i] = pack->objects[i].delta_size_;
>         }
>         pack->delta_size[e - pack->objects] = size;
> -       read_unlock();
> +       pack_delta_unlock(pack);
>  }
>
>  static int try_delta(struct unpacked *trg, struct unpacked *src,
> diff --git a/pack-objects.c b/pack-objects.c
> index e3c32bbfc2..8d9c2dfb82 100644
> --- a/pack-objects.c
> +++ b/pack-objects.c
> @@ -148,6 +148,10 @@ void prepare_packing_data(struct packing_data *pdata)
>                                              1U << OE_SIZE_BITS);
>         pdata->oe_delta_size_limit = git_env_ulong("GIT_TEST_OE_DELTA_SIZE",
>                                                    1U << OE_DELTA_SIZE_BITS);
> +
> +#ifndef NO_PTHREADS
> +       pthread_mutex_init(&pdata->delta_lock, NULL);
> +#endif
>  }
>
>  struct object_entry *packlist_alloc(struct packing_data *pdata,
> @@ -162,8 +166,10 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
>
>                 if (!pdata->in_pack_by_idx)
>                         REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
> +               pack_delta_lock(pdata);
>                 if (pdata->delta_size)
>                         REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
> +               pack_delta_unlock(pdata);
>         }
>
>         new_entry = pdata->objects + pdata->nr_objects++;
> diff --git a/pack-objects.h b/pack-objects.h
> index 730990c5d2..29b6006949 100644
> --- a/pack-objects.h
> +++ b/pack-objects.h
> @@ -130,6 +130,9 @@ struct packing_data {
>
>         unsigned int *in_pack_pos;
>         uint32_t *delta_size;
> +#ifndef NO_PTHREADS
> +       pthread_mutex_t delta_lock;
> +#endif
>
>         /*
>          * Only one of these can be non-NULL and they have different
> @@ -144,6 +147,14 @@ struct packing_data {
>         uintmax_t oe_delta_size_limit;
>  };
>
> +#ifndef NO_PTHREADS
> +#define pack_delta_lock(pdata) pthread_mutex_lock(&pdata->delta_lock)
> +#define pack_delta_unlock(pdata) pthread_mutex_unlock(&pdata->delta_lock)
> +#else
> +#define pack_delta_lock(pdata)
> +#define pack_delta_unlock(pdata)
> +#endif
> +
>  void prepare_packing_data(struct packing_data *pdata);
>  struct object_entry *packlist_alloc(struct packing_data *pdata,
>                                     const unsigned char *sha1,

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-20 15:39 ` [PATCH] pack-objects: fix performance issues on packing large deltas Nguyễn Thái Ngọc Duy
@ 2018-07-20 17:40   ` Jeff King
  2018-07-21  4:23     ` Duy Nguyen
  2018-07-20 17:43   ` Elijah Newren
  2018-07-22  8:04   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2 siblings, 1 reply; 44+ messages in thread
From: Jeff King @ 2018-07-20 17:40 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano, Elijah Newren

On Fri, Jul 20, 2018 at 05:39:43PM +0200, Nguyễn Thái Ngọc Duy wrote:

> Let's start with some background about oe_delta_size() and
> oe_set_delta_size(). If you already know, skip the next paragraph.
> 
> These two are added in 0aca34e826 (pack-objects: shrink delta_size
> field in struct object_entry - 2018-04-14) to help reduce 'struct
> object_entry' size. The delta size field in this struct is reduced to
> only contain max 2MB. So if any new delta is produced and larger than
> 2MB, it's dropped because we can't really save such a large size
> anywhere. Fallback is provided in case existingpackfiles already have
> large deltas, then we can retrieve it from the pack.

Minor nit, but isn't this 1MB (it was 2MB after one of your patches, but
I think v2.18.0 has 20 bits)?

> [...more commit message...]

Nicely explained overall.

> With this, we do not have to drop deltas in try_delta() anymore. Of
> course the downside is we use slightly more memory, even compared to
> 2.17.0. But since this is considered an uncommon case, a bit more
> memory consumption should not be a problem.

I wondered how common this might be. The easiest way to see the largest
delta sizes is:

  git cat-file --batch-all-objects \
               --batch-check='%(objectsize:disk) %(deltabase)' |
  grep -v 0000000000000000000000000000000000000000 |
  sort -rn | head

The biggest one in the kernel is ~300k. Which is about what I'd expect
for a normal source code repo. Even some private repos I have with a lot
of binary artifacts top out at about 3MB. So the new 32MB is probably
pretty unlikely to trigger, unless you really do have a bunch of huge
artifacts.

If you do, then more memory pressure isn't great. But as a relative
measure, the extra couple megabytes to store one 32-bit int per object
is probably fine.

> A note about thread synchronization. Since this code can be run in
> parallel during delta searching phase, we need a mutex. The realloc
> part in packlist_alloc() is not protected because it only happens
> during the object counting phase, which is always single-threaded.

Right, thanks for clarifying this here.

> The locking in oe_delta_size() could also be dropped if we make sure
> the pack->delta_size pointer assignment in oe_set_delta_size() is
> atomic. But let's keep the lock for now to be on the safe side. Lock
> contention should not be that high for this lock.

Yeah, I agree with this, now that we're not using the read_lock().

> [1] With a small tweak. 2.17.0 on 64-bit linux can hold 2^64 byte
>     deltas, which is absolutely insane. But windows builds, even
>     64-bit version, only hold 2^32. So reducing it to 2^32 should be
>     quite safe.

I'm not sure I completely agree with this. While 4GB deltas should be
pretty rare, the nice thing about 64-bit is that you never have to even
think about whether the variable is large enough. I think the 2^32
limitations on Windows are something we should be fixing in the long
term (though there it is even worse because it is not just deltas, but
entire objects).

> ---
>  I'm optimistic that the slowness on linux repo is lock contention
>  (because if it's not then I have no clue what is). So let's start
>  doing proper patches.

Me too, but I'd love to see more numbers from Elijah.

>  If we want a quick fix for 2.18.1. I suggest bumping up
>  OE_DELTA_SIZE_BITS like Elijah did in his second option. I don't
>  think it's safe to fast track this patch.

Also agreed.

> @@ -2278,6 +2274,8 @@ static void init_threaded_search(void)
>  	pthread_mutex_init(&cache_mutex, NULL);
>  	pthread_mutex_init(&progress_mutex, NULL);
>  	pthread_cond_init(&progress_cond, NULL);
> +	pthread_mutex_init(&to_pack.lock, NULL);
> +	to_pack.lock_initialized = 1;
>  	old_try_to_free_routine = set_try_to_free_routine(try_to_free_from_threads);
>  }

This is new in this iteration. I guess this is to cover the case where
we are built with pthread support, but --threads=1?

Given that we no longer have to touch this lock during the realloc, is
it worth actually putting it into to_pack? Instead, we could keep it
local to pack-objects, alongside all the other locks (and use the
lock_mutex() helper which handles the single-thread case).

Your original patch had to copy the oe_* helpers into the file to handle
that. But I think we're essentially just locking the whole functions:

> @@ -330,20 +353,37 @@ static inline void oe_set_size(struct packing_data *pack,
>  static inline unsigned long oe_delta_size(struct packing_data *pack,
>  					  const struct object_entry *e)
>  {
> -	if (e->delta_size_valid)
> -		return e->delta_size_;
> -	return oe_size(pack, e);
> +	unsigned long size;
> +
> +	packing_data_lock(pack);
> +	if (pack->delta_size)
> +		size = pack->delta_size[e - pack->objects];
> +	else
> +		size = e->delta_size_;
> +	packing_data_unlock(pack);
> +	return size;
>  }

So could we just wrap that like:

  static inline DELTA_SIZE(struct object_entry *oe)
  {
	unsigned long ret;

	delta_lock();
	ret = oe_delta_size(&to_pack, oe);
	delta_unlock();

	return size;
  }

That makes the implementation of oe_delta_size() simpler, too, since
we'd be free to directly return.

(Yes, it's a little weird to keep the all-caps name for a non-macro. I'd
be fine downcasing it, though I think it still works to point out that
something funny and global is going on).

> [...]

The rest of the patch looks pretty sensible to me.

-Peff

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-20 15:39 ` [PATCH] pack-objects: fix performance issues on packing large deltas Nguyễn Thái Ngọc Duy
  2018-07-20 17:40   ` Jeff King
@ 2018-07-20 17:43   ` Elijah Newren
  2018-07-20 23:52     ` Elijah Newren
                       ` (2 more replies)
  2018-07-22  8:04   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2 siblings, 3 replies; 44+ messages in thread
From: Elijah Newren @ 2018-07-20 17:43 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: Git Mailing List, Junio C Hamano, Jeff King

On Fri, Jul 20, 2018 at 8:39 AM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
> Let's start with some background about oe_delta_size() and
> oe_set_delta_size(). If you already know, skip the next paragraph.
>
> These two are added in 0aca34e826 (pack-objects: shrink delta_size
> field in struct object_entry - 2018-04-14) to help reduce 'struct
> object_entry' size. The delta size field in this struct is reduced to
> only contain max 2MB. So if any new delta is produced and larger than

I think you mean 1MB?

$ git grep OE_DELTA_SIZE_BITS v2.18.0
v2.18.0:builtin/pack-objects.c: if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
v2.18.0:pack-objects.h:#define OE_DELTA_SIZE_BITS       20
v2.18.0:pack-objects.h: unsigned delta_size_:OE_DELTA_SIZE_BITS; /*
delta data size (uncompressed) */

In https://public-inbox.org/git/20180719151640.GA24997@duynguyen.home/,
you bumped this to 21, which may be where you got the 2MB figure?
Your 2MB figure also appears on the next line and a few other places
in the commit message.

> 2MB, it's dropped because we can't really save such a large size
> anywhere. Fallback is provided in case existingpackfiles already have

Missing space: s/existingpackfiles/existing packfiles/

> large deltas, then we can retrieve it from the pack.
>
> While this should help small machines repacking large repos (i.e. less

Maybe change "repacking large repos" to "repacking large repos without
large deltas"?

> memory pressure), dropping large deltas during the delta selection
> process could end up with worse pack files. And if existing packfiles
> already have >2MB delta and pack-objects is instructed to not reuse
> deltas, all of them will be dropped on the floor, and the resulting
> pack would be definitely bigger.
>
> There is also a regression in terms of CPU/IO if we have large on-disk
> deltas because fallback code needs to parse the pack every time the
> delta size is needed and just access to the mmap'd pack data is enough
> for extra page faults when memory is under pressure.
>
> Both of these issues were reported on the mailing list. Here's some
> numbers for comparison.
>
>     Version  Pack (MB)  MaxRSS(kB)  Time (s)
>     -------  ---------  ----------  --------
>      2.17.0     5498     43513628    2494.85
>      2.18.0    10531     40449596    4168.94
>
> This patch provides a better fallback that is
>
> - cheaper in terms of cpu and io because we won't have to read
>   existing pack files as much
>
> - better in terms of pack size because the pack heuristics is back to
>   2.17.0 time, we do not drop large deltas at all
>
> If we encounter any delta (on-disk or created during try_delta phase)
> that is larger than the 2MB limit, we stop using delta_size_ field for
> this because it can't contain such size anyway. A new array of delta
> size is dynamically allocated and can hold all the deltas that 2.17.0
> can [1]. All current delta sizes are migrated over.
>
> With this, we do not have to drop deltas in try_delta() anymore. Of
> course the downside is we use slightly more memory, even compared to
> 2.17.0. But since this is considered an uncommon case, a bit more
> memory consumption should not be a problem.

Out of curiosity, would it be possible to use the delta_size_ field
for deltas that are small enough, and only use an external data
structure (perhaps a hash rather than an array) for the few deltas
that are large?

> Delta size limit is also raised from 2MB to 32 MB to better cover
> common case and avoid that extra memory consumption (99.999% deltas in
> this reported repo are under 12MB). Other fields are shuffled around
> to keep this struct packed tight. We don't use more memory in common
> case even with this limit update.
>
> A note about thread synchronization. Since this code can be run in
> parallel during delta searching phase, we need a mutex. The realloc
> part in packlist_alloc() is not protected because it only happens
> during the object counting phase, which is always single-threaded.
>
> The locking in oe_delta_size() could also be dropped if we make sure
> the pack->delta_size pointer assignment in oe_set_delta_size() is
> atomic. But let's keep the lock for now to be on the safe side. Lock
> contention should not be that high for this lock.
>
> [1] With a small tweak. 2.17.0 on 64-bit linux can hold 2^64 byte
>     deltas, which is absolutely insane. But windows builds, even
>     64-bit version, only hold 2^32. So reducing it to 2^32 should be
>     quite safe.
>
> Reported-by: Elijah Newren <newren@gmail.com>
> Helped-by: Elijah Newren <newren@gmail.com>
> Helped-by: Jeff King <peff@peff.net>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  I'm optimistic that the slowness on linux repo is lock contention
>  (because if it's not then I have no clue what is). So let's start
>  doing proper patches.

I'll be testing this shortly...

>
>  If we want a quick fix for 2.18.1. I suggest bumping up
>  OE_DELTA_SIZE_BITS like Elijah did in his second option. I don't
>  think it's safe to fast track this patch.

Okay, I'll submit that with a proper commit message then.  Since you
also bump OE_DELTA_SIZE_BITS in your patch (though to a different
value), it'll require a conflict resolution when merging maint into
master, but at least the change won't silently leak through.

>  builtin/pack-objects.c    |  6 ++--
>  ci/run-build-and-tests.sh |  1 +
>  pack-objects.c            | 21 ++++++++++++
>  pack-objects.h            | 68 +++++++++++++++++++++++++++++++--------
>  t/README                  |  4 +++
>  5 files changed, 82 insertions(+), 18 deletions(-)
>
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index ebc8cefb53..3bc182b1b6 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -2023,10 +2023,6 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
>         delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
>         if (!delta_buf)
>                 return 0;
> -       if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
> -               free(delta_buf);
> -               return 0;
> -       }
>
>         if (DELTA(trg_entry)) {
>                 /* Prefer only shallower same-sized deltas. */
> @@ -2278,6 +2274,8 @@ static void init_threaded_search(void)
>         pthread_mutex_init(&cache_mutex, NULL);
>         pthread_mutex_init(&progress_mutex, NULL);
>         pthread_cond_init(&progress_cond, NULL);
> +       pthread_mutex_init(&to_pack.lock, NULL);
> +       to_pack.lock_initialized = 1;
>         old_try_to_free_routine = set_try_to_free_routine(try_to_free_from_threads);
>  }
>
> diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh
> index 4b04c75b7f..2a5bff4a1c 100755
> --- a/ci/run-build-and-tests.sh
> +++ b/ci/run-build-and-tests.sh
> @@ -14,6 +14,7 @@ then
>         export GIT_TEST_SPLIT_INDEX=yes
>         export GIT_TEST_FULL_IN_PACK_ARRAY=true
>         export GIT_TEST_OE_SIZE=10
> +       export GIT_TEST_OE_DELTA_SIZE=5
>         make --quiet test
>  fi
>
> diff --git a/pack-objects.c b/pack-objects.c
> index 92708522e7..eef344b7c1 100644
> --- a/pack-objects.c
> +++ b/pack-objects.c
> @@ -88,6 +88,23 @@ struct object_entry *packlist_find(struct packing_data *pdata,
>         return &pdata->objects[pdata->index[i] - 1];
>  }
>
> +uint32_t *new_delta_size_array(struct packing_data *pack)
> +{
> +       uint32_t *delta_size;
> +       uint32_t i;
> +
> +       /*
> +        * nr_alloc, not nr_objects to align with realloc() strategy in
> +        * packlist_alloc()
> +        */
> +       ALLOC_ARRAY(delta_size, pack->nr_alloc);
> +
> +       for (i = 0; i < pack->nr_objects; i++)
> +               delta_size[i] = pack->objects[i].delta_size_;
> +
> +       return delta_size;
> +}
> +
>  static void prepare_in_pack_by_idx(struct packing_data *pdata)
>  {
>         struct packed_git **mapping, *p;
> @@ -146,6 +163,8 @@ void prepare_packing_data(struct packing_data *pdata)
>
>         pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE",
>                                              1U << OE_SIZE_BITS);
> +       pdata->oe_delta_size_limit = git_env_ulong("GIT_TEST_OE_DELTA_SIZE",
> +                                                  1U << OE_DELTA_SIZE_BITS);

Perhaps 1UL instead of 1U, in case the user specifies the value of 32?
 I know that flag is meant for testing smaller values, but since 32
was the first value I tried for OE_DELTA_SIZE_BITS when debugging my
issue maybe it makes sense to allow it?

>  }
>
>  struct object_entry *packlist_alloc(struct packing_data *pdata,
> @@ -160,6 +179,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
>
>                 if (!pdata->in_pack_by_idx)
>                         REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
> +               if (pdata->delta_size)
> +                       REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
>         }
>
>         new_entry = pdata->objects + pdata->nr_objects++;
> diff --git a/pack-objects.h b/pack-objects.h
> index edf74dabdd..9f977ae800 100644
> --- a/pack-objects.h
> +++ b/pack-objects.h
> @@ -2,6 +2,7 @@
>  #define PACK_OBJECTS_H
>
>  #include "object-store.h"
> +#include "thread-utils.h"
>
>  #define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024)
>
> @@ -14,7 +15,7 @@
>   * above this limit. Don't lower it too much.
>   */
>  #define OE_SIZE_BITS           31
> -#define OE_DELTA_SIZE_BITS     20
> +#define OE_DELTA_SIZE_BITS     24
>
>  /*
>   * State flags for depth-first search used for analyzing delta cycles.
> @@ -93,12 +94,12 @@ struct object_entry {
>                                      * uses the same base as me
>                                      */
>         unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
> -       unsigned delta_size_valid:1;
> +       unsigned char in_pack_header_size;
>         unsigned in_pack_idx:OE_IN_PACK_BITS;   /* already in pack */
>         unsigned z_delta_size:OE_Z_DELTA_BITS;
>         unsigned type_valid:1;
> -       unsigned type_:TYPE_BITS;
>         unsigned no_try_delta:1;
> +       unsigned type_:TYPE_BITS;
>         unsigned in_pack_type:TYPE_BITS; /* could be delta */
>         unsigned preferred_base:1; /*
>                                     * we do not pack this, but is available
> @@ -108,17 +109,16 @@ struct object_entry {
>         unsigned tagged:1; /* near the very tip of refs */
>         unsigned filled:1; /* assigned write-order */
>         unsigned dfs_state:OE_DFS_STATE_BITS;
> -       unsigned char in_pack_header_size;
>         unsigned depth:OE_DEPTH_BITS;
>
>         /*
>          * pahole results on 64-bit linux (gcc and clang)
>          *
> -        *   size: 80, bit_padding: 20 bits, holes: 8 bits
> +        *   size: 80, bit_padding: 9 bits
>          *
>          * and on 32-bit (gcc)
>          *
> -        *   size: 76, bit_padding: 20 bits, holes: 8 bits
> +        *   size: 76, bit_padding: 9 bits
>          */
>  };
>
> @@ -130,6 +130,7 @@ struct packing_data {
>         uint32_t index_size;
>
>         unsigned int *in_pack_pos;
> +       uint32_t *delta_size;
>
>         /*
>          * Only one of these can be non-NULL and they have different
> @@ -140,10 +141,32 @@ struct packing_data {
>         struct packed_git **in_pack_by_idx;
>         struct packed_git **in_pack;
>
> +#ifndef NO_PTHREADS
> +       int lock_initialized;
> +       pthread_mutex_t lock;
> +#endif
> +
>         uintmax_t oe_size_limit;
> +       uintmax_t oe_delta_size_limit;
>  };
>
>  void prepare_packing_data(struct packing_data *pdata);
> +
> +static inline void packing_data_lock(struct packing_data *pdata)
> +{
> +#ifndef NO_PTHREADS
> +       if (pdata->lock_initialized)
> +               pthread_mutex_lock(&pdata->lock);
> +#endif
> +}
> +static inline void packing_data_unlock(struct packing_data *pdata)
> +{
> +#ifndef NO_PTHREADS
> +       if (pdata->lock_initialized)
> +               pthread_mutex_unlock(&pdata->lock);
> +#endif
> +}
> +
>  struct object_entry *packlist_alloc(struct packing_data *pdata,
>                                     const unsigned char *sha1,
>                                     uint32_t index_pos);
> @@ -330,20 +353,37 @@ static inline void oe_set_size(struct packing_data *pack,
>  static inline unsigned long oe_delta_size(struct packing_data *pack,
>                                           const struct object_entry *e)
>  {
> -       if (e->delta_size_valid)
> -               return e->delta_size_;
> -       return oe_size(pack, e);
> +       unsigned long size;
> +
> +       packing_data_lock(pack);
> +       if (pack->delta_size)
> +               size = pack->delta_size[e - pack->objects];
> +       else
> +               size = e->delta_size_;
> +       packing_data_unlock(pack);
> +       return size;
>  }
>
> +uint32_t *new_delta_size_array(struct packing_data *pdata);
>  static inline void oe_set_delta_size(struct packing_data *pack,
>                                      struct object_entry *e,
>                                      unsigned long size)
>  {
> -       e->delta_size_ = size;
> -       e->delta_size_valid = e->delta_size_ == size;
> -       if (!e->delta_size_valid && size != oe_size(pack, e))
> -               BUG("this can only happen in check_object() "
> -                   "where delta size is the same as entry size");
> +       packing_data_lock(pack);
> +       if (!pack->delta_size && size < pack->oe_delta_size_limit) {
> +               packing_data_unlock(pack);
> +               e->delta_size_ = size;
> +               return;
> +       }
> +       /*
> +        * We have had at least one delta size exceeding OE_DELTA_SIZE_BITS
> +        * limit. delta_size_ will not be used anymore. All delta sizes are
> +        * now from the delta_size[] array.
> +        */
> +       if (!pack->delta_size)
> +               pack->delta_size = new_delta_size_array(pack);
> +       pack->delta_size[e - pack->objects] = size;
> +       packing_data_unlock(pack);
>  }
>
>  #endif
> diff --git a/t/README b/t/README
> index 8373a27fea..9028b47d92 100644
> --- a/t/README
> +++ b/t/README
> @@ -315,6 +315,10 @@ packs on demand. This normally only happens when the object size is
>  over 2GB. This variable forces the code path on any object larger than
>  <n> bytes.
>
> +GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
> +path where deltas larger than this limit require extra memory
> +allocation for bookkeeping.
> +
>  Naming Tests
>  ------------
>
> --
> 2.18.0.rc2.476.g39500d3211

Missing the 2.18.0 tag?  ;-)

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-20 17:43   ` Elijah Newren
@ 2018-07-20 23:52     ` Elijah Newren
  2018-07-21  4:07       ` Duy Nguyen
  2018-07-21  4:47     ` Duy Nguyen
  2018-07-23 12:34     ` Elijah Newren
  2 siblings, 1 reply; 44+ messages in thread
From: Elijah Newren @ 2018-07-20 23:52 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: Git Mailing List, Junio C Hamano, Jeff King

On Fri, Jul 20, 2018 at 10:43 AM, Elijah Newren <newren@gmail.com> wrote:
> On Fri, Jul 20, 2018 at 8:39 AM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:

>> This patch provides a better fallback that is
>>
>> - cheaper in terms of cpu and io because we won't have to read
>>   existing pack files as much
>>
>> - better in terms of pack size because the pack heuristics is back to
>>   2.17.0 time, we do not drop large deltas at all
>>
>> If we encounter any delta (on-disk or created during try_delta phase)
>> that is larger than the 2MB limit, we stop using delta_size_ field for
>> this because it can't contain such size anyway. A new array of delta
>> size is dynamically allocated and can hold all the deltas that 2.17.0
>> can [1]. All current delta sizes are migrated over.
>>
>> With this, we do not have to drop deltas in try_delta() anymore. Of
>> course the downside is we use slightly more memory, even compared to
>> 2.17.0. But since this is considered an uncommon case, a bit more
>> memory consumption should not be a problem.

Is the increased memory only supposed to affect the uncommon case?  I
was able to verify that 2.18.0 used less memory than 2.17.0 (for
either my repo or linux.git), but I don't see nearly the same memory
reduction relative to 2.17.0 for linux.git with your new patches.

>> ---
>>  I'm optimistic that the slowness on linux repo is lock contention
>>  (because if it's not then I have no clue what is). So let's start
>>  doing proper patches.
>
> I'll be testing this shortly...

Using these definitions for versions:

  fix-v5: https://public-inbox.org/git/20180720052829.GA3852@sigill.intra.peff.net/
(plus what it depends on)
  fix-v6: The patch I'm responding to
  fix-v2: https://public-inbox.org/git/20180718225110.17639-3-newren@gmail.com/

and inspired by
https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/, I
ran

   /usr/bin/time -f 'MaxRSS:%M Time:%e' git gc --aggressive

three times on a beefy box (40 cores, 160GB ram, otherwise quiet
system).  If I take the lowest MaxRSS, and the lowest time reported
(regardless of whether from the same run), then the results are as
follows for linux.git (all cases repacked down to 1279 MB, so removing
that from the table):

Version  MaxRSS(kB)  Time (s)
-------  ----------  --------
 2.17.0   11413236    621.36
 2.18.0   10987152    621.80
 fix-v5   11105564    836.29
 fix-v6   11357304    831.73
 fix-v2   10873348    619.96

The runtime could swing up to 10 seconds between runs.  The memory use
also had some variability, but all runs of fix-v2 had lower MaxRSS
than any runs of 2.18.0 (which surprised me); all runs of 2.18.0 used
less memory than any run of fix-v6, and all runs of fix-v6 used less
memory than any run of 2.17.0.  fix-v5 had the most variabiilty in
MaxRSS, ranging from as low as some 2.18.0 runs up to higher than any
2.17.0 runs.

...but maybe that'd change with more than 3 runs of each?

Anyway, this is a lot better than the 1193.67 seconds I saw with
fix-v4 (https://public-inbox.org/git/20180719182442.GA5796@duynguyen.home/,
which Peff fixed up into fix-v5), suggesting it is lock contention,
but we're still about 33% slower than 2.17.0 and we've lost almost all
of the memory savings.  Somewhat surprisingly, the highly simplistic
fix-v2 does a lot better on both measures.


However, I'm just concentrating on a beefy machine; it may be that v6
drastically outperforms v2 on weaker hardware?  Can others measure a
lower memory usage for v6 than v2?


Also, for the original repo with the huge deltas that started it all,
the numbers are (only run once since it takes so long):

Version  Pack (MB)  MaxRSS(kB)  Time (s)
-------  ---------  ----------  --------
 2.17.0     5498     43513628    2494.85
 2.18.0    10531     40449596    4168.94
 fix-v5     5500     42805716    2577.50
 fiv-v6     5509     42814836    2605.36
 fix-v2     5509     41644104    2468.25


>>  If we want a quick fix for 2.18.1. I suggest bumping up
>>  OE_DELTA_SIZE_BITS like Elijah did in his second option. I don't
>>  think it's safe to fast track this patch.
>
> Okay, I'll submit that with a proper commit message then.  Since you
> also bump OE_DELTA_SIZE_BITS in your patch (though to a different
> value), it'll require a conflict resolution when merging maint into
> master, but at least the change won't silently leak through.

...except now I'm wondering if the commit message should mention that
it's just a stopgap fix for 2.18.1, or whether it's actually the fix
that we just want to use going forward as well.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-20 23:52     ` Elijah Newren
@ 2018-07-21  4:07       ` Duy Nguyen
  2018-07-21  7:08         ` Duy Nguyen
  0 siblings, 1 reply; 44+ messages in thread
From: Duy Nguyen @ 2018-07-21  4:07 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List, Junio C Hamano, Jeff King

On Sat, Jul 21, 2018 at 1:52 AM Elijah Newren <newren@gmail.com> wrote:
>
> On Fri, Jul 20, 2018 at 10:43 AM, Elijah Newren <newren@gmail.com> wrote:
> > On Fri, Jul 20, 2018 at 8:39 AM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
>
> >> This patch provides a better fallback that is
> >>
> >> - cheaper in terms of cpu and io because we won't have to read
> >>   existing pack files as much
> >>
> >> - better in terms of pack size because the pack heuristics is back to
> >>   2.17.0 time, we do not drop large deltas at all
> >>
> >> If we encounter any delta (on-disk or created during try_delta phase)
> >> that is larger than the 2MB limit, we stop using delta_size_ field for
> >> this because it can't contain such size anyway. A new array of delta
> >> size is dynamically allocated and can hold all the deltas that 2.17.0
> >> can [1]. All current delta sizes are migrated over.
> >>
> >> With this, we do not have to drop deltas in try_delta() anymore. Of
> >> course the downside is we use slightly more memory, even compared to
> >> 2.17.0. But since this is considered an uncommon case, a bit more
> >> memory consumption should not be a problem.
>
> Is the increased memory only supposed to affect the uncommon case?  I
> was able to verify that 2.18.0 used less memory than 2.17.0 (for
> either my repo or linux.git), but I don't see nearly the same memory
> reduction relative to 2.17.0 for linux.git with your new patches.
>
> >> ---
> >>  I'm optimistic that the slowness on linux repo is lock contention
> >>  (because if it's not then I have no clue what is). So let's start
> >>  doing proper patches.
> >
> > I'll be testing this shortly...
>
> Using these definitions for versions:
>
>   fix-v5: https://public-inbox.org/git/20180720052829.GA3852@sigill.intra.peff.net/
> (plus what it depends on)
>   fix-v6: The patch I'm responding to
>   fix-v2: https://public-inbox.org/git/20180718225110.17639-3-newren@gmail.com/
>
> and inspired by
> https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/, I
> ran
>
>    /usr/bin/time -f 'MaxRSS:%M Time:%e' git gc --aggressive
>
> three times on a beefy box (40 cores, 160GB ram, otherwise quiet
> system).  If I take the lowest MaxRSS, and the lowest time reported
> (regardless of whether from the same run), then the results are as
> follows for linux.git (all cases repacked down to 1279 MB, so removing
> that from the table):
>
> Version  MaxRSS(kB)  Time (s)
> -------  ----------  --------
>  2.17.0   11413236    621.36
>  2.18.0   10987152    621.80
>  fix-v5   11105564    836.29
>  fix-v6   11357304    831.73
>  fix-v2   10873348    619.96
>
> The runtime could swing up to 10 seconds between runs.  The memory use
> also had some variability, but all runs of fix-v2 had lower MaxRSS
> than any runs of 2.18.0 (which surprised me); all runs of 2.18.0 used
> less memory than any run of fix-v6, and all runs of fix-v6 used less
> memory than any run of 2.17.0.  fix-v5 had the most variabiilty in
> MaxRSS, ranging from as low as some 2.18.0 runs up to higher than any
> 2.17.0 runs.

As Peff said, RSS is not a very good way to measure memory usage.
valgrind --tool=massif would be the best, or just grep "heap" in
/proc/$PID/smaps. fix-v2 should increase struct object_entry's size
and memory usage for all repos, while v6 has the same memory usage as
2.17.0 in heap, and extra once it hits a big delta.

> ...but maybe that'd change with more than 3 runs of each?
>
> Anyway, this is a lot better than the 1193.67 seconds I saw with
> fix-v4 (https://public-inbox.org/git/20180719182442.GA5796@duynguyen.home/,
> which Peff fixed up into fix-v5), suggesting it is lock contention,
> but we're still about 33% slower than 2.17.0 and we've lost almost all
> of the memory savings.  Somewhat surprisingly, the highly simplistic
> fix-v2 does a lot better on both measures.

Maybe we should avoid locking when the delta is small enough then and
see how it goes. This is linux.git so there's no big delta and we
would end up not locking at all.

If you do "git repack -adf --threads=8", does the time difference
between v6 and 2.17.0 reduce (suggesting we still hit lock contention
hard)?

> However, I'm just concentrating on a beefy machine; it may be that v6
> drastically outperforms v2 on weaker hardware?  Can others measure a
> lower memory usage for v6 than v2?

I'll try it with massif on linux.git, but this is a very weak laptop
(4GB) so  it'll take a while....

> Also, for the original repo with the huge deltas that started it all,
> the numbers are (only run once since it takes so long):
>
> Version  Pack (MB)  MaxRSS(kB)  Time (s)
> -------  ---------  ----------  --------
>  2.17.0     5498     43513628    2494.85
>  2.18.0    10531     40449596    4168.94
>  fix-v5     5500     42805716    2577.50
>  fiv-v6     5509     42814836    2605.36
>  fix-v2     5509     41644104    2468.25
>
>
> >>  If we want a quick fix for 2.18.1. I suggest bumping up
> >>  OE_DELTA_SIZE_BITS like Elijah did in his second option. I don't
> >>  think it's safe to fast track this patch.
> >
> > Okay, I'll submit that with a proper commit message then.  Since you
> > also bump OE_DELTA_SIZE_BITS in your patch (though to a different
> > value), it'll require a conflict resolution when merging maint into
> > master, but at least the change won't silently leak through.
>
> ...except now I'm wondering if the commit message should mention that
> it's just a stopgap fix for 2.18.1, or whether it's actually the fix
> that we just want to use going forward as well.

no fix-v2 is really a stopgap. With 2.17.0 (and fix-v6) we pay 80
bytes per object, or 6.5M * 80 = 520 MB heap. fix-v2 increases this
struct to 88 bytes so we pay 52 MB extra.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-20 17:40   ` Jeff King
@ 2018-07-21  4:23     ` Duy Nguyen
  2018-07-23 21:37       ` Jeff King
  0 siblings, 1 reply; 44+ messages in thread
From: Duy Nguyen @ 2018-07-21  4:23 UTC (permalink / raw)
  To: Jeff King; +Cc: Git Mailing List, Junio C Hamano, Elijah Newren

On Fri, Jul 20, 2018 at 7:40 PM Jeff King <peff@peff.net> wrote:
>
> On Fri, Jul 20, 2018 at 05:39:43PM +0200, Nguyễn Thái Ngọc Duy wrote:
>
> > Let's start with some background about oe_delta_size() and
> > oe_set_delta_size(). If you already know, skip the next paragraph.
> >
> > These two are added in 0aca34e826 (pack-objects: shrink delta_size
> > field in struct object_entry - 2018-04-14) to help reduce 'struct
> > object_entry' size. The delta size field in this struct is reduced to
> > only contain max 2MB. So if any new delta is produced and larger than
> > 2MB, it's dropped because we can't really save such a large size
> > anywhere. Fallback is provided in case existingpackfiles already have
> > large deltas, then we can retrieve it from the pack.
>
> Minor nit, but isn't this 1MB (it was 2MB after one of your patches, but
> I think v2.18.0 has 20 bits)?

Argh.. I think I thought "2 ** 20" in my mind then typed "2 << 20" in
python. And I thought I made a mistake in my previous commit message
because it does mention 1MB...

> > With this, we do not have to drop deltas in try_delta() anymore. Of
> > course the downside is we use slightly more memory, even compared to
> > 2.17.0. But since this is considered an uncommon case, a bit more
> > memory consumption should not be a problem.
>
> I wondered how common this might be. The easiest way to see the largest
> delta sizes is:
>
>   git cat-file --batch-all-objects \
>                --batch-check='%(objectsize:disk) %(deltabase)' |
>   grep -v 0000000000000000000000000000000000000000 |
>   sort -rn | head
>
> The biggest one in the kernel is ~300k. Which is about what I'd expect
> for a normal source code repo. Even some private repos I have with a lot
> of binary artifacts top out at about 3MB. So the new 32MB is probably

I'll keep these numbers in v2 commit message, easier to find later.

> > [1] With a small tweak. 2.17.0 on 64-bit linux can hold 2^64 byte
> >     deltas, which is absolutely insane. But windows builds, even
> >     64-bit version, only hold 2^32. So reducing it to 2^32 should be
> >     quite safe.
>
> I'm not sure I completely agree with this. While 4GB deltas should be
> pretty rare, the nice thing about 64-bit is that you never have to even
> think about whether the variable is large enough. I think the 2^32
> limitations on Windows are something we should be fixing in the long
> term (though there it is even worse because it is not just deltas, but
> entire objects).

I guess that means we stick to uint64_t then? It does increase more
memory on 32-bit architecture (and probably processing cost as well
because 64-bit uses up 2 registers).

> > @@ -2278,6 +2274,8 @@ static void init_threaded_search(void)
> >       pthread_mutex_init(&cache_mutex, NULL);
> >       pthread_mutex_init(&progress_mutex, NULL);
> >       pthread_cond_init(&progress_cond, NULL);
> > +     pthread_mutex_init(&to_pack.lock, NULL);
> > +     to_pack.lock_initialized = 1;
> >       old_try_to_free_routine = set_try_to_free_routine(try_to_free_from_threads);
> >  }
>
> This is new in this iteration. I guess this is to cover the case where
> we are built with pthread support, but --threads=1?

If you mean the "lock_initialized" variable, not really. the other
_lock() macros in builtin/ call pthread_mutex_lock() unconditionally,
which is fine. But I feel a bit uncomfortable doing the same in
pack-objects.h which technically is library code (but yes practically
is a long arm of builtin/pack-objects.c), so lock_initialized is there
to make sure we don't touch uninitialized locks if somebody forgets to
init them first.

> Given that we no longer have to touch this lock during the realloc, is
> it worth actually putting it into to_pack? Instead, we could keep it
> local to pack-objects, alongside all the other locks (and use the
> lock_mutex() helper which handles the single-thread case).

You probably notice the lock name is not "delta_size_lock". I intended
to reuse this for locking other fields in struct packing_data as well.
But that might be a bad idea.

I have no strong opinion about this, so if we still end up locking the
whole functions, I'll just move the lock back close to the others in
builtin/pack-objects.c

> Your original patch had to copy the oe_* helpers into the file to handle
> that. But I think we're essentially just locking the whole functions:

I'll try to avoid this lock when deltas are small and see if it helps
the linux.git case on Elijah's machine. So we may end up locking just
a part of these functions.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-20 17:43   ` Elijah Newren
  2018-07-20 23:52     ` Elijah Newren
@ 2018-07-21  4:47     ` Duy Nguyen
  2018-07-21  6:56       ` Elijah Newren
  2018-07-22  6:22       ` Elijah Newren
  2018-07-23 12:34     ` Elijah Newren
  2 siblings, 2 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-21  4:47 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List, Junio C Hamano, Jeff King

On Fri, Jul 20, 2018 at 10:43:25AM -0700, Elijah Newren wrote:
> > This patch provides a better fallback that is
> >
> > - cheaper in terms of cpu and io because we won't have to read
> >   existing pack files as much
> >
> > - better in terms of pack size because the pack heuristics is back to
> >   2.17.0 time, we do not drop large deltas at all
> >
> > If we encounter any delta (on-disk or created during try_delta phase)
> > that is larger than the 2MB limit, we stop using delta_size_ field for
> > this because it can't contain such size anyway. A new array of delta
> > size is dynamically allocated and can hold all the deltas that 2.17.0
> > can [1]. All current delta sizes are migrated over.
> >
> > With this, we do not have to drop deltas in try_delta() anymore. Of
> > course the downside is we use slightly more memory, even compared to
> > 2.17.0. But since this is considered an uncommon case, a bit more
> > memory consumption should not be a problem.
> 
> Out of curiosity, would it be possible to use the delta_size_ field
> for deltas that are small enough, and only use an external data
> structure (perhaps a hash rather than an array) for the few deltas
> that are large?

We could. And because repack time is still a bit higher in your
linux.git case. Let's try this. No locking in common case and very
small locked region when we hit large deltas

-- 8< --
diff --git a/pack-objects.c b/pack-objects.c
index eef344b7c1..e3c32bbfc2 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -88,23 +88,6 @@ struct object_entry *packlist_find(struct packing_data *pdata,
 	return &pdata->objects[pdata->index[i] - 1];
 }
 
-uint32_t *new_delta_size_array(struct packing_data *pack)
-{
-	uint32_t *delta_size;
-	uint32_t i;
-
-	/*
-	 * nr_alloc, not nr_objects to align with realloc() strategy in
-	 * packlist_alloc()
-	 */
-	ALLOC_ARRAY(delta_size, pack->nr_alloc);
-
-	for (i = 0; i < pack->nr_objects; i++)
-		delta_size[i] = pack->objects[i].delta_size_;
-
-	return delta_size;
-}
-
 static void prepare_in_pack_by_idx(struct packing_data *pdata)
 {
 	struct packed_git **mapping, *p;
diff --git a/pack-objects.h b/pack-objects.h
index 9f977ae800..11890e7217 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -15,7 +15,7 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	24
+#define OE_DELTA_SIZE_BITS	23
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -94,6 +94,7 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
+	unsigned delta_size_valid:1;
 	unsigned char in_pack_header_size;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
@@ -353,37 +354,26 @@ static inline void oe_set_size(struct packing_data *pack,
 static inline unsigned long oe_delta_size(struct packing_data *pack,
 					  const struct object_entry *e)
 {
-	unsigned long size;
-
-	packing_data_lock(pack);
-	if (pack->delta_size)
-		size = pack->delta_size[e - pack->objects];
+	if (e->delta_size_valid)
+		return e->delta_size_;
 	else
-		size = e->delta_size_;
-	packing_data_unlock(pack);
-	return size;
+		return pack->delta_size[e - pack->objects];
 }
 
-uint32_t *new_delta_size_array(struct packing_data *pdata);
 static inline void oe_set_delta_size(struct packing_data *pack,
 				     struct object_entry *e,
 				     unsigned long size)
 {
-	packing_data_lock(pack);
-	if (!pack->delta_size && size < pack->oe_delta_size_limit) {
-		packing_data_unlock(pack);
+	if (size < pack->oe_delta_size_limit) {
 		e->delta_size_ = size;
-		return;
+		e->delta_size_valid = 1;
+	} else {
+		packing_data_lock(pack);
+		if (!pack->delta_size)
+			ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
+		packing_data_unlock(pack);
+		pack->delta_size[e - pack->objects] = size;
 	}
-	/*
-	 * We have had at least one delta size exceeding OE_DELTA_SIZE_BITS
-	 * limit. delta_size_ will not be used anymore. All delta sizes are
-	 * now from the delta_size[] array.
-	 */
-	if (!pack->delta_size)
-		pack->delta_size = new_delta_size_array(pack);
-	pack->delta_size[e - pack->objects] = size;
-	packing_data_unlock(pack);
 }
 
 #endif
-- 8< --

> > --
> > 2.18.0.rc2.476.g39500d3211
> 
> Missing the 2.18.0 tag?  ;-)

Hehe I was a bit busy lately and have not rebased my branch on the
latest and greatest version.
--
Duy

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-21  4:47     ` Duy Nguyen
@ 2018-07-21  6:56       ` Elijah Newren
  2018-07-21  7:14         ` Duy Nguyen
  2018-07-22  6:22       ` Elijah Newren
  1 sibling, 1 reply; 44+ messages in thread
From: Elijah Newren @ 2018-07-21  6:56 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano, Jeff King

On Fri, Jul 20, 2018 at 9:47 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Fri, Jul 20, 2018 at 10:43:25AM -0700, Elijah Newren wrote:

>> Out of curiosity, would it be possible to use the delta_size_ field
>> for deltas that are small enough, and only use an external data
>> structure (perhaps a hash rather than an array) for the few deltas
>> that are large?
>
> We could. And because repack time is still a bit higher in your
> linux.git case. Let's try this. No locking in common case and very
> small locked region when we hit large deltas

Sweet, I'm starting some tests with it...

> -- 8< --
...
> diff --git a/pack-objects.h b/pack-objects.h
> index 9f977ae800..11890e7217 100644
> --- a/pack-objects.h
> +++ b/pack-objects.h
> @@ -15,7 +15,7 @@
...
> @@ -353,37 +354,26 @@ static inline void oe_set_size(struct packing_data *pack,
>  static inline unsigned long oe_delta_size(struct packing_data *pack,
>                                           const struct object_entry *e)
>  {
> -       unsigned long size;
> -
> -       packing_data_lock(pack);
> -       if (pack->delta_size)
> -               size = pack->delta_size[e - pack->objects];
> +       if (e->delta_size_valid)
> +               return e->delta_size_;
>         else
> -               size = e->delta_size_;
> -       packing_data_unlock(pack);
> -       return size;
> +               return pack->delta_size[e - pack->objects];

Should this last line be wrapped with packing_data_lock/unlock calls?
(And similarly in oe_set_delta_size when writing to this array?)

>  }
>
> -uint32_t *new_delta_size_array(struct packing_data *pdata);
>  static inline void oe_set_delta_size(struct packing_data *pack,
>                                      struct object_entry *e,
>                                      unsigned long size)
>  {
> -       packing_data_lock(pack);
> -       if (!pack->delta_size && size < pack->oe_delta_size_limit) {
> -               packing_data_unlock(pack);
> +       if (size < pack->oe_delta_size_limit) {
>                 e->delta_size_ = size;
> -               return;
> +               e->delta_size_valid = 1;
> +       } else {
> +               packing_data_lock(pack);
> +               if (!pack->delta_size)
> +                       ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
> +               packing_data_unlock(pack);
> +               pack->delta_size[e - pack->objects] = size;
>         }
> -       /*
> -        * We have had at least one delta size exceeding OE_DELTA_SIZE_BITS
> -        * limit. delta_size_ will not be used anymore. All delta sizes are
> -        * now from the delta_size[] array.
> -        */
> -       if (!pack->delta_size)
> -               pack->delta_size = new_delta_size_array(pack);
> -       pack->delta_size[e - pack->objects] = size;
> -       packing_data_unlock(pack);
>  }
>
>  #endif

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-21  4:07       ` Duy Nguyen
@ 2018-07-21  7:08         ` Duy Nguyen
  0 siblings, 0 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-21  7:08 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List, Junio C Hamano, Jeff King

On Sat, Jul 21, 2018 at 6:07 AM Duy Nguyen <pclouds@gmail.com> wrote:
> > However, I'm just concentrating on a beefy machine; it may be that v6
> > drastically outperforms v2 on weaker hardware?  Can others measure a
> > lower memory usage for v6 than v2?
>
> I'll try it with massif on linux.git, but this is a very weak laptop
> (4GB) so  it'll take a while....

OK massif reports are too big to upload so I only pastebin'd the peak
memory snapshot. fix-v2 uses 1.449 MB [1] (M = 1000) while my patch
uses 1.394 MB heap. The actual saving if we focus on packlist_alloc()
is 599,147,916B - 544,691,628B = roughly 50 MB or 8%.

It's not _that_ big saving, so if your test runs do not show
significant speedups, then fix-v2 is probably a better option.

[1] https://pastebin.com/8iDcm9M4
[2] https://pastebin.com/bXBjzPvN
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-21  6:56       ` Elijah Newren
@ 2018-07-21  7:14         ` Duy Nguyen
  0 siblings, 0 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-21  7:14 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List, Junio C Hamano, Jeff King

On Sat, Jul 21, 2018 at 8:56 AM Elijah Newren <newren@gmail.com> wrote:
> > -- 8< --
> ...
> > diff --git a/pack-objects.h b/pack-objects.h
> > index 9f977ae800..11890e7217 100644
> > --- a/pack-objects.h
> > +++ b/pack-objects.h
> > @@ -15,7 +15,7 @@
> ...
> > @@ -353,37 +354,26 @@ static inline void oe_set_size(struct packing_data *pack,
> >  static inline unsigned long oe_delta_size(struct packing_data *pack,
> >                                           const struct object_entry *e)
> >  {
> > -       unsigned long size;
> > -
> > -       packing_data_lock(pack);
> > -       if (pack->delta_size)
> > -               size = pack->delta_size[e - pack->objects];
> > +       if (e->delta_size_valid)
> > +               return e->delta_size_;
> >         else
> > -               size = e->delta_size_;
> > -       packing_data_unlock(pack);
> > -       return size;
> > +               return pack->delta_size[e - pack->objects];
>
> Should this last line be wrapped with packing_data_lock/unlock calls?
> (And similarly in oe_set_delta_size when writing to this array?)

No. Data related to "e" alone could be accessed unprotected. This is
what try_delta has been doing (it's "trg_entry" in there). Because we
know that pack->delta_size[e - pack->objects] is for "e" and no other
entry, we can have the same treatment. If we access other entries'
data though like in my new_delta_size_array(), that could be racy.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-21  4:47     ` Duy Nguyen
  2018-07-21  6:56       ` Elijah Newren
@ 2018-07-22  6:22       ` Elijah Newren
  2018-07-22  6:49         ` Duy Nguyen
  1 sibling, 1 reply; 44+ messages in thread
From: Elijah Newren @ 2018-07-22  6:22 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano, Jeff King

On Fri, Jul 20, 2018 at 9:47 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Fri, Jul 20, 2018 at 10:43:25AM -0700, Elijah Newren wrote:
>> Out of curiosity, would it be possible to use the delta_size_ field
>> for deltas that are small enough, and only use an external data
>> structure (perhaps a hash rather than an array) for the few deltas
>> that are large?
>
> We could. And because repack time is still a bit higher in your
> linux.git case. Let's try this. No locking in common case and very
> small locked region when we hit large deltas

This one looks like a winner.  Labelling this as fix-v7, this rounds
out the table to:

Version  Time (s)
-------  --------
 2.17.0   621.36
 2.18.0   621.80
 fix-v5   836.29
 fix-v6   831.73
 fix-v2   619.96
 fix-v7   622.88

So the runtime is basically within the noise of different runs of the
timing for 2.17.0 or 2.18.0 or -v2, and is much faster than -v5 or
-v6.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-22  6:22       ` Elijah Newren
@ 2018-07-22  6:49         ` Duy Nguyen
  0 siblings, 0 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-22  6:49 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List, Junio C Hamano, Jeff King

On Sun, Jul 22, 2018 at 8:22 AM Elijah Newren <newren@gmail.com> wrote:
>
> On Fri, Jul 20, 2018 at 9:47 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> > On Fri, Jul 20, 2018 at 10:43:25AM -0700, Elijah Newren wrote:
> >> Out of curiosity, would it be possible to use the delta_size_ field
> >> for deltas that are small enough, and only use an external data
> >> structure (perhaps a hash rather than an array) for the few deltas
> >> that are large?
> >
> > We could. And because repack time is still a bit higher in your
> > linux.git case. Let's try this. No locking in common case and very
> > small locked region when we hit large deltas
>
> This one looks like a winner.  Labelling this as fix-v7, this rounds
> out the table to:
>
> Version  Time (s)
> -------  --------
>  2.17.0   621.36
>  2.18.0   621.80
>  fix-v5   836.29
>  fix-v6   831.73
>  fix-v2   619.96
>  fix-v7   622.88
>
> So the runtime is basically within the noise of different runs of the
> timing for 2.17.0 or 2.18.0 or -v2, and is much faster than -v5 or
> -v6.

Thanks. I'm looking forward to asking you to test lock-related changes
on this 40-core monster in the future :D

Unrelated point of improvement for the future. I notice that at least
on my machine, i have 100% cpu on one core during writing phase,
likely because deltas are being recomputed to be written down and we
don't produce deltas fast enough. We should be able to take advantage
of multiple cores to recompute deltas in advance at this stage and
shorten pack-objects time some more.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH v2] pack-objects: fix performance issues on packing large deltas
  2018-07-20 15:39 ` [PATCH] pack-objects: fix performance issues on packing large deltas Nguyễn Thái Ngọc Duy
  2018-07-20 17:40   ` Jeff King
  2018-07-20 17:43   ` Elijah Newren
@ 2018-07-22  8:04   ` Nguyễn Thái Ngọc Duy
  2018-07-23 18:04     ` Junio C Hamano
  2018-07-26  8:12     ` Johannes Sixt
  2 siblings, 2 replies; 44+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-07-22  8:04 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Elijah Newren, Jeff King,
	Nguyễn Thái Ngọc Duy

Let's start with some background about oe_delta_size() and
oe_set_delta_size(). If you already know, skip the next paragraph.

These two are added in 0aca34e826 (pack-objects: shrink delta_size
field in struct object_entry - 2018-04-14) to help reduce 'struct
object_entry' size. The delta size field in this struct is reduced to
only contain max 1MB. So if any new delta is produced and larger than
1MB, it's dropped because we can't really save such a large size
anywhere. Fallback is provided in case existing packfiles already have
large deltas, then we can retrieve it from the pack.

While this should help small machines repacking large repos without
large deltas (i.e. less memory pressure), dropping large deltas during
the delta selection process could end up with worse pack files. And if
existing packfiles already have >1MB delta and pack-objects is
instructed to not reuse deltas, all of them will be dropped on the
floor, and the resulting pack would be definitely bigger.

There is also a regression in terms of CPU/IO if we have large on-disk
deltas because fallback code needs to parse the pack every time the
delta size is needed and just access to the mmap'd pack data is enough
for extra page faults when memory is under pressure.

Both of these issues were reported on the mailing list. Here's some
numbers for comparison.

    Version  Pack (MB)  MaxRSS(kB)  Time (s)
    -------  ---------  ----------  --------
     2.17.0     5498     43513628    2494.85
     2.18.0    10531     40449596    4168.94

This patch provides a better fallback that is

- cheaper in terms of cpu and io because we won't have to read
  existing pack files as much

- better in terms of pack size because the pack heuristics is back to
  2.17.0 time, we do not drop large deltas at all

If we encounter any delta (on-disk or created during try_delta phase)
that is larger than the 1MB limit, we stop using delta_size_ field for
this because it can't contain such size anyway. A new array of delta
size is dynamically allocated and can hold all the deltas that 2.17.0
can. This array only contains delta sizes that delta_size_ can't
contain.

With this, we do not have to drop deltas in try_delta() anymore. Of
course the downside is we use slightly more memory, even compared to
2.17.0. But since this is considered an uncommon case, a bit more
memory consumption should not be a problem.

Delta size limit is also raised from 1MB to 16MB to better cover
common case and avoid that extra memory consumption (99.999% deltas in
this reported repo are under 12MB; Jeff noted binary artifacts topped
out at about 3MB in some other private repos). Other fields are
shuffled around to keep this struct packed tight. We don't use more
memory in common case even with this limit update.

A note about thread synchronization. Since this code can be run in
parallel during delta searching phase, we need a mutex. The realloc
part in packlist_alloc() is not protected because it only happens
during the object counting phase, which is always single-threaded.

Access to e->delta_size_ (and by extension
pack->delta_size[e - pack->objects]) is unprotected as before, the
thread scheduler in pack-objects must make sure "e" is never updated
by two different threads.

The area under the new lock is as small as possible, avoiding locking
at all in common case, since lock contention with high thread count
could be expensive (most blobs are small enough that delta compute
time is short and we end up taking the lock very often). The previous
attempt to always hold a lock in oe_delta_size() and
oe_set_delta_size() increases execution time by 33% when repacking
linux.git with with 40 threads.

Reported-by: Elijah Newren <newren@gmail.com>
Helped-by: Elijah Newren <newren@gmail.com>
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 v2 should keep execution time back to 2.17.0 range (I never thought
 locks are that expensive..). Changes since v1:

 - oe_delta_size() and oe_set_delta_size() rewritten to keep locking
   to minimum. This also brings back delta_size_valid field so that
   we avoid accessing shared data as much as possible.
 - delta_size[] is now an array of unsigned long instead of uint32_t
   (i.e. 64-bit on linux, 32-bit on windows, like in 2.17.0)
 - delta size limit is reduced to 16 MB
 - lock_initialized field is removed

 builtin/pack-objects.c    |  5 +---
 ci/run-build-and-tests.sh |  1 +
 pack-objects.c            |  4 +++
 pack-objects.h            | 59 +++++++++++++++++++++++++++++++--------
 t/README                  |  4 +++
 5 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ebc8cefb53..a77659f9ce 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2023,10 +2023,6 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
-		free(delta_buf);
-		return 0;
-	}
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
@@ -2278,6 +2274,7 @@ static void init_threaded_search(void)
 	pthread_mutex_init(&cache_mutex, NULL);
 	pthread_mutex_init(&progress_mutex, NULL);
 	pthread_cond_init(&progress_cond, NULL);
+	pthread_mutex_init(&to_pack.lock, NULL);
 	old_try_to_free_routine = set_try_to_free_routine(try_to_free_from_threads);
 }
 
diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh
index 4b04c75b7f..2a5bff4a1c 100755
--- a/ci/run-build-and-tests.sh
+++ b/ci/run-build-and-tests.sh
@@ -14,6 +14,7 @@ then
 	export GIT_TEST_SPLIT_INDEX=yes
 	export GIT_TEST_FULL_IN_PACK_ARRAY=true
 	export GIT_TEST_OE_SIZE=10
+	export GIT_TEST_OE_DELTA_SIZE=5
 	make --quiet test
 fi
 
diff --git a/pack-objects.c b/pack-objects.c
index 92708522e7..6ef87e5683 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -146,6 +146,8 @@ void prepare_packing_data(struct packing_data *pdata)
 
 	pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE",
 					     1U << OE_SIZE_BITS);
+	pdata->oe_delta_size_limit = git_env_ulong("GIT_TEST_OE_DELTA_SIZE",
+						   1UL << OE_DELTA_SIZE_BITS);
 }
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -160,6 +162,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 
 		if (!pdata->in_pack_by_idx)
 			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
+		if (pdata->delta_size)
+			REALLOC_ARRAY(pdata->delta_size, pdata->nr_alloc);
 	}
 
 	new_entry = pdata->objects + pdata->nr_objects++;
diff --git a/pack-objects.h b/pack-objects.h
index edf74dabdd..d9c144bf62 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #include "object-store.h"
+#include "thread-utils.h"
 
 #define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024)
 
@@ -14,7 +15,7 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	20
+#define OE_DELTA_SIZE_BITS	23
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -94,11 +95,12 @@ struct object_entry {
 				     */
 	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	unsigned delta_size_valid:1;
+	unsigned char in_pack_header_size;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_valid:1;
-	unsigned type_:TYPE_BITS;
 	unsigned no_try_delta:1;
+	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -108,17 +110,16 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-	unsigned char in_pack_header_size;
 	unsigned depth:OE_DEPTH_BITS;
 
 	/*
 	 * pahole results on 64-bit linux (gcc and clang)
 	 *
-	 *   size: 80, bit_padding: 20 bits, holes: 8 bits
+	 *   size: 80, bit_padding: 9 bits
 	 *
 	 * and on 32-bit (gcc)
 	 *
-	 *   size: 76, bit_padding: 20 bits, holes: 8 bits
+	 *   size: 76, bit_padding: 9 bits
 	 */
 };
 
@@ -130,6 +131,7 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	unsigned long *delta_size;
 
 	/*
 	 * Only one of these can be non-NULL and they have different
@@ -140,10 +142,29 @@ struct packing_data {
 	struct packed_git **in_pack_by_idx;
 	struct packed_git **in_pack;
 
+#ifndef NO_PTHREADS
+	pthread_mutex_t lock;
+#endif
+
 	uintmax_t oe_size_limit;
+	uintmax_t oe_delta_size_limit;
 };
 
 void prepare_packing_data(struct packing_data *pdata);
+
+static inline void packing_data_lock(struct packing_data *pdata)
+{
+#ifndef NO_PTHREADS
+	pthread_mutex_lock(&pdata->lock);
+#endif
+}
+static inline void packing_data_unlock(struct packing_data *pdata)
+{
+#ifndef NO_PTHREADS
+	pthread_mutex_unlock(&pdata->lock);
+#endif
+}
+
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,
 				    uint32_t index_pos);
@@ -332,18 +353,34 @@ static inline unsigned long oe_delta_size(struct packing_data *pack,
 {
 	if (e->delta_size_valid)
 		return e->delta_size_;
-	return oe_size(pack, e);
+
+	/*
+	 * pack->detla_size[] can't be NULL because oe_set_delta_size()
+	 * must have been called when a new delta is saved with
+	 * oe_set_delta().
+	 * If oe_delta() returns NULL (i.e. default state, which means
+	 * delta_size_valid is also false), then the caller must never
+	 * call oe_delta_size().
+	 */
+	return pack->delta_size[e - pack->objects];
 }
 
 static inline void oe_set_delta_size(struct packing_data *pack,
 				     struct object_entry *e,
 				     unsigned long size)
 {
-	e->delta_size_ = size;
-	e->delta_size_valid = e->delta_size_ == size;
-	if (!e->delta_size_valid && size != oe_size(pack, e))
-		BUG("this can only happen in check_object() "
-		    "where delta size is the same as entry size");
+	if (size < pack->oe_delta_size_limit) {
+		e->delta_size_ = size;
+		e->delta_size_valid = 1;
+	} else {
+		packing_data_lock(pack);
+		if (!pack->delta_size)
+			ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
+		packing_data_unlock(pack);
+
+		pack->delta_size[e - pack->objects] = size;
+		e->delta_size_valid = 0;
+	}
 }
 
 #endif
diff --git a/t/README b/t/README
index 8373a27fea..9028b47d92 100644
--- a/t/README
+++ b/t/README
@@ -315,6 +315,10 @@ packs on demand. This normally only happens when the object size is
 over 2GB. This variable forces the code path on any object larger than
 <n> bytes.
 
+GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
+path where deltas larger than this limit require extra memory
+allocation for bookkeeping.
+
 Naming Tests
 ------------
 
-- 
2.18.0.656.gda699b98b3


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-20 17:43   ` Elijah Newren
  2018-07-20 23:52     ` Elijah Newren
  2018-07-21  4:47     ` Duy Nguyen
@ 2018-07-23 12:34     ` Elijah Newren
  2018-07-23 15:50       ` Duy Nguyen
  2 siblings, 1 reply; 44+ messages in thread
From: Elijah Newren @ 2018-07-23 12:34 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, peff, Elijah Newren

On Fri, Jul 20, 2018 at 10:43 AM, Elijah Newren <newren@gmail.com> wrote:
> On Fri, Jul 20, 2018 at 8:39 AM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:

>>  If we want a quick fix for 2.18.1. I suggest bumping up
>>  OE_DELTA_SIZE_BITS like Elijah did in his second option. I don't
>>  think it's safe to fast track this patch.
>
> Okay, I'll submit that with a proper commit message then.  Since you
> also bump OE_DELTA_SIZE_BITS in your patch (though to a different
> value), it'll require a conflict resolution when merging maint into
> master, but at least the change won't silently leak through.

And here's the re-submitted patch, with commit message...

-- 8< --
Subject: [PATCH] pack-objects: fix repacking of repositories with some large
 deltas

Commit 0aca34e826 (pack-objects: shrink delta_size field in struct
object_entry - 2018-04-14) reduced 'struct object_entry' size, by dropping
the delta size field to 20 bits (max delta size 1MB).  There was a
fallback to reference existing pack files that already had large deltas,
but when aggressively repacking, it drops the deltas on the floor,
resulting in much worse packs.

For comparison, the following measurements were made for running 'git gc
--aggressive' for one particular repo:

    Version  Pack (MB)  MaxRSS(kB)  Time (s)
    -------  ---------  ----------  --------
     2.17.0     5498     43513628    2494.85
     2.18.0    10531     40449596    4168.94

This patch provides a stop-gap improvement for maint that increases the
delta size back up to 32 bits (still an improvement over the 64 bit size
of the original).  A better fix preserving the memory savings for most
repos that do not have large deltas is being prepared for 2.19.0.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
I think Duy's comment that I'm responding to suggests he Acks this
patch for maint, but there is that little 'if' he put in his statement.

I'll be on vacation until the end of the week, so I apologize in advance
for not responding quickly...

 builtin/pack-objects.c | 2 +-
 pack-objects.h         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 71056d8294..49b7af295d 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2023,7 +2023,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
+	if (delta_size >= (1UL << OE_DELTA_SIZE_BITS)) {
 		free(delta_buf);
 		return 0;
 	}
diff --git a/pack-objects.h b/pack-objects.h
index edf74dabdd..26021328aa 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -14,7 +14,7 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	20
+#define OE_DELTA_SIZE_BITS	32
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
-- 
2.18.0.2.gf4c50c7885


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-23 12:34     ` Elijah Newren
@ 2018-07-23 15:50       ` Duy Nguyen
  0 siblings, 0 replies; 44+ messages in thread
From: Duy Nguyen @ 2018-07-23 15:50 UTC (permalink / raw)
  To: Elijah Newren; +Cc: Git Mailing List, Junio C Hamano, Jeff King

On Mon, Jul 23, 2018 at 2:34 PM Elijah Newren <newren@gmail.com> wrote:
> This patch provides a stop-gap improvement for maint that increases the
> delta size back up to 32 bits (still an improvement over the 64 bit size
> of the original).

The "still an improvement" is actually not true. Due to padding and
stuff, struct object_entry's size is increased by 8 bytes, even though
you don't use all of it. It's possible to maybe shuffle fields around
a bit to keep this struct smaller. But it's not really worth it since
this is temporary anyway.

> I think Duy's comment that I'm responding to suggests he Acks this
> patch for maint, but there is that little 'if' he put in his statement.

Yep. Ack'd by me. It's up to Junio to decide if it's worth doing this
or not, of course.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2] pack-objects: fix performance issues on packing large deltas
  2018-07-22  8:04   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
@ 2018-07-23 18:04     ` Junio C Hamano
  2018-07-23 18:38       ` Duy Nguyen
  2018-07-26  8:12     ` Johannes Sixt
  1 sibling, 1 reply; 44+ messages in thread
From: Junio C Hamano @ 2018-07-23 18:04 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Elijah Newren, Jeff King

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Access to e->delta_size_ (and by extension
> pack->delta_size[e - pack->objects]) is unprotected as before, the
> thread scheduler in pack-objects must make sure "e" is never updated
> by two different threads.

OK.  Do we need to worry about "e" (e.g. "e->delta_size_valid")
being accessed while/before it is set by another thread?

oe_delta_size() makes unprotected accesses to .delta_size_ and
pack->delta_size[e - pack->objects], so we apparently do not, and
oe_set_delta_size() only protects the allocation call and does not
prevent a reader in oe_delta_size() from first reading the _valid
field, noticing that it is 0 as initialized, and goes on to read the
pack->delta_size[] slot for the entry, while the writer is setting
the size to .delta_size_ field and flipping _valid bit, without ever
storing the size in the pack->delta_size[] array.

> @@ -130,6 +131,7 @@ struct packing_data {
>  	uint32_t index_size;
>  
>  	unsigned int *in_pack_pos;
> +	unsigned long *delta_size;
>  
>  	/*
>  	 * Only one of these can be non-NULL and they have different
> @@ -140,10 +142,29 @@ struct packing_data {
>  	struct packed_git **in_pack_by_idx;
>  	struct packed_git **in_pack;
>  
> +#ifndef NO_PTHREADS
> +	pthread_mutex_t lock;

I am wondering if we want the variable to say what data it is
protecting from simultaneous accesses, or leave it as generic so
that any new caller that wants to lock any (new) thing that is
associated with a packing_data structure can grab it for other
purposes.  The design of this patch clearly is the latter, which is
OK for now, I think.

> @@ -332,18 +353,34 @@ static inline unsigned long oe_delta_size(struct packing_data *pack,
>  {
>  	if (e->delta_size_valid)
>  		return e->delta_size_;
> -	return oe_size(pack, e);
> +
> +	/*
> +	 * pack->detla_size[] can't be NULL because oe_set_delta_size()
> +	 * must have been called when a new delta is saved with
> +	 * oe_set_delta().
> +	 * If oe_delta() returns NULL (i.e. default state, which means
> +	 * delta_size_valid is also false), then the caller must never
> +	 * call oe_delta_size().
> +	 */
> +	return pack->delta_size[e - pack->objects];
>  }
>  
>  static inline void oe_set_delta_size(struct packing_data *pack,
>  				     struct object_entry *e,
>  				     unsigned long size)
>  {
> -	e->delta_size_ = size;
> -	e->delta_size_valid = e->delta_size_ == size;
> -	if (!e->delta_size_valid && size != oe_size(pack, e))
> -		BUG("this can only happen in check_object() "
> -		    "where delta size is the same as entry size");
> +	if (size < pack->oe_delta_size_limit) {
> +		e->delta_size_ = size;
> +		e->delta_size_valid = 1;
> +	} else {
> +		packing_data_lock(pack);
> +		if (!pack->delta_size)
> +			ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
> +		packing_data_unlock(pack);
> +
> +		pack->delta_size[e - pack->objects] = size;
> +		e->delta_size_valid = 0;
> +	}
>  }

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2] pack-objects: fix performance issues on packing large deltas
  2018-07-23 18:04     ` Junio C Hamano
@ 2018-07-23 18:38       ` Duy Nguyen
  2018-07-23 18:49         ` Duy Nguyen
  0 siblings, 1 reply; 44+ messages in thread
From: Duy Nguyen @ 2018-07-23 18:38 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List, Elijah Newren, Jeff King

On Mon, Jul 23, 2018 at 8:04 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
> > Access to e->delta_size_ (and by extension
> > pack->delta_size[e - pack->objects]) is unprotected as before, the
> > thread scheduler in pack-objects must make sure "e" is never updated
> > by two different threads.
>
> OK.  Do we need to worry about "e" (e.g. "e->delta_size_valid")
> being accessed while/before it is set by another thread?

I left some details out because I think it's getting to a gray area.
We already do this

        if (!DELTA(trg_entry)) {
                max_size = trg_size/2 - the_hash_algo->rawsz;
                ref_depth = 1;
        } else {
                max_size = DELTA_SIZE(trg_entry);
        ...
        SET_DELTA(trg_entry, src_entry);
        SET_DELTA_SIZE(trg_entry, delta_size);

if the bottom half is running on one thread and stopped in the middle
while the top half is running in another thread, we have a problem.
But perhaps max_size is not that big a deal because incorrect max_size
may produce a bad pack but can't corrupt it.

I will have to study the thread dispatch code more to have a better
answer, unfortunately.
--
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2] pack-objects: fix performance issues on packing large deltas
  2018-07-23 18:38       ` Duy Nguyen
@ 2018-07-23 18:49         ` Duy Nguyen
  2018-07-23 21:30           ` Jeff King
  0 siblings, 1 reply; 44+ messages in thread
From: Duy Nguyen @ 2018-07-23 18:49 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List, Elijah Newren, Jeff King

On Mon, Jul 23, 2018 at 8:38 PM Duy Nguyen <pclouds@gmail.com> wrote:
> I will have to study the thread dispatch code more to have a better
> answer, unfortunately.

Well.. I thought I would need this weekend for this, but a quick look
and ll_find_deltas() suggests that what we're doing is safe. At least
you and Jeff are way to familiar with the delta window concept in
pack-objects. So in multithread mode, we have a big list of all
objects, then the list is cut in N sublists and each is owned entirely
by one thread. Each thread then can slide a window over this sublist
to search for the best delta.

Because of this partitioning, if trg_entry is being processed now, it
will not be accessed by any other thread. It's owned by this thread
and will be accessed again as src_entry when the next entry becomes
the delta target in the same thread. As long as we don't access
objects of another thread (and my v1 violates this) we should be ok.

At the end of ll_find_deltas() though, we have the "stealing" stuff,
but it's protected by progress_lock() already, outside try_delta() so
we're sure nobody is updating any object_entry when the stealing
happens.

I will double check again this weekend just to be sure.
-- 
Duy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2] pack-objects: fix performance issues on packing large deltas
  2018-07-23 18:49         ` Duy Nguyen
@ 2018-07-23 21:30           ` Jeff King
  0 siblings, 0 replies; 44+ messages in thread
From: Jeff King @ 2018-07-23 21:30 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Git Mailing List, Elijah Newren

On Mon, Jul 23, 2018 at 08:49:59PM +0200, Duy Nguyen wrote:

> On Mon, Jul 23, 2018 at 8:38 PM Duy Nguyen <pclouds@gmail.com> wrote:
> > I will have to study the thread dispatch code more to have a better
> > answer, unfortunately.
> 
> Well.. I thought I would need this weekend for this, but a quick look
> and ll_find_deltas() suggests that what we're doing is safe. At least
> you and Jeff are way to familiar with the delta window concept in
> pack-objects. So in multithread mode, we have a big list of all
> objects, then the list is cut in N sublists and each is owned entirely
> by one thread. Each thread then can slide a window over this sublist
> to search for the best delta.
> 
> Because of this partitioning, if trg_entry is being processed now, it
> will not be accessed by any other thread. It's owned by this thread
> and will be accessed again as src_entry when the next entry becomes
> the delta target in the same thread. As long as we don't access
> objects of another thread (and my v1 violates this) we should be ok.

Yes, that matches my knowledge of how this all works. And if it didn't,
I think the code would have been racy even _before_ your patches.

The only thing that this pack->delta_size approach is changing is that
managing that array needs to happen under lock, because it touches the
whole list.

And as long as we back-fill it from any arbitrary e->delta_size_, that
means that touching e->delta_size_ needs to be done under lock. That's
why I think keeping the individual valid flag in each entry makes the
most sense. Then whenever we overflow a particular e->delta_size_, we
don't have to care about anybody else's size.

Which I think is what your v2 patch is doing.

-Peff

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] pack-objects: fix performance issues on packing large deltas
  2018-07-21  4:23     ` Duy Nguyen
@ 2018-07-23 21:37       ` Jeff King
  0 siblings, 0 replies; 44+ messages in thread
From: Jeff King @ 2018-07-23 21:37 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano, Elijah Newren

On Sat, Jul 21, 2018 at 06:23:32AM +0200, Duy Nguyen wrote:

> > I'm not sure I completely agree with this. While 4GB deltas should be
> > pretty rare, the nice thing about 64-bit is that you never have to even
> > think about whether the variable is large enough. I think the 2^32
> > limitations on Windows are something we should be fixing in the long
> > term (though there it is even worse because it is not just deltas, but
> > entire objects).
> 
> I guess that means we stick to uint64_t then? It does increase more
> memory on 32-bit architecture (and probably processing cost as well
> because 64-bit uses up 2 registers).

Yes, but if we switch away from an array to a hash, then we get the best
of both worlds: we are using 64-bits to store the size, but we only need
an entry for deltas that are actually big.

Then the common small-delta case remains efficient in both CPU and
memory, and we pay the costs only in proportion to the number of large
deltas (though the hash is a slightly higher cost for those large deltas
than an array).

> > This is new in this iteration. I guess this is to cover the case where
> > we are built with pthread support, but --threads=1?
> 
> If you mean the "lock_initialized" variable, not really. the other
> _lock() macros in builtin/ call pthread_mutex_lock() unconditionally,
> which is fine. But I feel a bit uncomfortable doing the same in
> pack-objects.h which technically is library code (but yes practically
> is a long arm of builtin/pack-objects.c), so lock_initialized is there
> to make sure we don't touch uninitialized locks if somebody forgets to
> init them first.

I think the ones in builtin/ check threads_active to avoid actually
locking. And that's set in init_thread(), which we do not call if we are
using a single thread. So I think this is basically doing the same
thing, but with a separate flag (since the library code does not know
about threads_active).

> > Your original patch had to copy the oe_* helpers into the file to handle
> > that. But I think we're essentially just locking the whole functions:
> 
> I'll try to avoid this lock when deltas are small and see if it helps
> the linux.git case on Elijah's machine. So we may end up locking just
> a part of these functions.

Yeah, I think my suggestion doesn't work once we start doing more
complex locking logic. Let's just forget it. I think the
"lock_initialized" thing is probably the right approach.

It might be worth getting rid of builtin/pack-objects.c's local
threads_active variable, and just using to_pack.threads_active. The two
flag would always want to match.

-Peff

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2] pack-objects: fix performance issues on packing large deltas
  2018-07-22  8:04   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2018-07-23 18:04     ` Junio C Hamano
@ 2018-07-26  8:12     ` Johannes Sixt
  1 sibling, 0 replies; 44+ messages in thread
From: Johannes Sixt @ 2018-07-26  8:12 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, Elijah Newren, Jeff King

Am 22.07.2018 um 10:04 schrieb Nguyễn Thái Ngọc Duy:
> +	if (size < pack->oe_delta_size_limit) {
> +		e->delta_size_ = size;
> +		e->delta_size_valid = 1;
> +	} else {
> +		packing_data_lock(pack);
> +		if (!pack->delta_size)
> +			ALLOC_ARRAY(pack->delta_size, pack->nr_alloc);
> +		packing_data_unlock(pack);
> +
> +		pack->delta_size[e - pack->objects] = size;

My first thought was that this is wrong (falling prey to the same 
mistake as the double-checked locking pattern). But after thinking twice 
over it again, I think that this unprotected access of pack->delta_size 
is thread-safe.

Of course, I'm assuming that different threads never assign to the same 
array index.

> +		e->delta_size_valid = 0;
> +	}
>   }

-- Hannes

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2018-07-26  8:12 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-18 22:51 2.18.0 Regression: packing performance and effectiveness Elijah Newren
2018-07-18 22:51 ` [RFC PATCH] fix-v1: revert "pack-objects: shrink delta_size field in struct object_entry" Elijah Newren
2018-07-18 22:51 ` [RFC PATCH] fix-v2: make OE_DELTA_SIZE_BITS a bit bigger Elijah Newren
2018-07-19  5:41 ` 2.18.0 Regression: packing performance and effectiveness Duy Nguyen
2018-07-19  5:49   ` Jeff King
2018-07-19 15:27   ` Elijah Newren
2018-07-19 15:43     ` Duy Nguyen
2018-07-19  5:44 ` Jeff King
2018-07-19  5:57   ` Duy Nguyen
2018-07-19 15:16     ` Duy Nguyen
2018-07-19 16:42       ` Elijah Newren
2018-07-19 17:23         ` Jeff King
2018-07-19 17:31           ` Duy Nguyen
2018-07-19 18:24             ` Duy Nguyen
2018-07-19 19:17               ` Jeff King
2018-07-19 23:11               ` Elijah Newren
2018-07-20  5:28                 ` Jeff King
2018-07-20  5:30                   ` Jeff King
2018-07-20  5:47                   ` Duy Nguyen
2018-07-20 17:21                   ` Elijah Newren
2018-07-19 17:04       ` Jeff King
2018-07-19 19:25       ` Junio C Hamano
2018-07-19 19:27         ` Junio C Hamano
2018-07-20 15:39 ` [PATCH] pack-objects: fix performance issues on packing large deltas Nguyễn Thái Ngọc Duy
2018-07-20 17:40   ` Jeff King
2018-07-21  4:23     ` Duy Nguyen
2018-07-23 21:37       ` Jeff King
2018-07-20 17:43   ` Elijah Newren
2018-07-20 23:52     ` Elijah Newren
2018-07-21  4:07       ` Duy Nguyen
2018-07-21  7:08         ` Duy Nguyen
2018-07-21  4:47     ` Duy Nguyen
2018-07-21  6:56       ` Elijah Newren
2018-07-21  7:14         ` Duy Nguyen
2018-07-22  6:22       ` Elijah Newren
2018-07-22  6:49         ` Duy Nguyen
2018-07-23 12:34     ` Elijah Newren
2018-07-23 15:50       ` Duy Nguyen
2018-07-22  8:04   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
2018-07-23 18:04     ` Junio C Hamano
2018-07-23 18:38       ` Duy Nguyen
2018-07-23 18:49         ` Duy Nguyen
2018-07-23 21:30           ` Jeff King
2018-07-26  8:12     ` Johannes Sixt

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).