git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash}
  2007-09-08  9:31 [PATCH] diff-delta.c: pack the index structure David Kastrup
@ 2007-09-08  8:54 ` David Kastrup
  2007-09-08 19:41   ` [PATCH] diff-delta.c: Rationalize culling of hash buckets David Kastrup
  2007-09-08 20:52   ` [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} Nicolas Pitre
  0 siblings, 2 replies; 6+ messages in thread
From: David Kastrup @ 2007-09-08  8:54 UTC (permalink / raw)
  To: git

The variables for the packed entries are now called just entry and
hash rather than aentry+ahash, and those for the unpacked entries have
been renamed to uentry and uhash from the original entry and hash.

While this makes the diff to the unchanged code larger, it matches the
type declarations better.

Signed-off-by: David Kastrup <dak@gnu.org>
---
 diff-delta.c |   65 ++++++++++++++++++++++++++++-----------------------------
 1 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 1b4b1c1..e7c33aa 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -135,8 +135,8 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 	const unsigned char *data, *buffer = buf;
 	struct delta_index *index;
-	struct unpacked_index_entry *entry, **hash;
-	struct index_entry *aentry, **ahash;
+	struct unpacked_index_entry *uentry, **uhash;
+	struct index_entry *entry, **hash;
 	void *mem;
 	unsigned long memsize;
 
@@ -153,21 +153,21 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	hmask = hsize - 1;
 
 	/* allocate lookup index */
-	memsize = sizeof(*hash) * hsize +
-		  sizeof(*entry) * entries;
+	memsize = sizeof(*uhash) * hsize +
+		  sizeof(*uentry) * entries;
 	mem = malloc(memsize);
 	if (!mem)
 		return NULL;
-	hash = mem;
-	mem = hash + hsize;
-	entry = mem;
+	uhash = mem;
+	mem = uhash + hsize;
+	uentry = mem;
 
-	memset(hash, 0, hsize * sizeof(*hash));
+	memset(uhash, 0, hsize * sizeof(*uhash));
 
 	/* allocate an array to count hash entries */
 	hash_count = calloc(hsize, sizeof(*hash_count));
 	if (!hash_count) {
-		free(hash);
+		free(uhash);
 		return NULL;
 	}
 
@@ -181,15 +181,15 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 			val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 		if (val == prev_val) {
 			/* keep the lowest of consecutive identical blocks */
-			entry[-1].entry.ptr = data + RABIN_WINDOW;
+			uentry[-1].entry.ptr = data + RABIN_WINDOW;
 			--entries;
 		} else {
 			prev_val = val;
 			i = val & hmask;
-			entry->entry.ptr = data + RABIN_WINDOW;
-			entry->entry.val = val;
-			entry->next = hash[i];
-			hash[i] = entry++;
+			uentry->entry.ptr = data + RABIN_WINDOW;
+			uentry->entry.val = val;
+			uentry->next = uhash[i];
+			uhash[i] = uentry++;
 			hash_count[i]++;
 		}
 	}
@@ -209,17 +209,17 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	for (i = 0; i < hsize; i++) {
 		if (hash_count[i] < HASH_LIMIT)
 			continue;
-		entry = hash[i];
+		uentry = uhash[i];
 		do {
-			struct unpacked_index_entry *keep = entry;
+			struct unpacked_index_entry *keep = uentry;
 			int skip = hash_count[i] / HASH_LIMIT;
 			do {
 				--entries;
-				entry = entry->next;
-			} while(--skip && entry);
+				uentry = uentry->next;
+			} while(--skip && uentry);
 			++entries;
-			keep->next = entry;
-		} while(entry);
+			keep->next = uentry;
+		} while(uentry);
 	}
 	free(hash_count);
 
@@ -227,13 +227,12 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	 * linked lists */
 
 	memsize = sizeof(*index)
-		+ sizeof(*ahash) * (hsize+1)
-		+ sizeof(*aentry) * entries;
-
+		+ sizeof(*hash) * (hsize+1)
+		+ sizeof(*entry) * entries;
 	mem = malloc(memsize);
 
 	if (!mem) {
-		free(hash);
+		free(uhash);
 		return NULL;
 	}
 
@@ -244,25 +243,25 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	index->hash_mask = hmask;
 
 	mem = index + 1;
-	ahash = mem;
-	mem = ahash + (hsize+1);
-	aentry = mem;
+	hash = mem;
+	mem = hash + (hsize+1);
+	entry = mem;
 
 	/* Coalesce all entries belonging to one linked list into
 	 * consecutive array entries */
 
 	for (i = 0; i < hsize; i++) {
-		ahash[i] = aentry;
-		for (entry = hash[i]; entry; entry = entry->next)
-			*aentry++ = entry->entry;
+		hash[i] = entry;
+		for (uentry = uhash[i]; uentry; uentry = uentry->next)
+			*entry++ = uentry->entry;
 	}
 
 	/* Sentinel value to indicate the length of the last hash
 	 * bucket */
 
-	ahash[hsize] = aentry;
-	assert(aentry - (struct index_entry *)mem == entries);
-	free(hash);
+	hash[hsize] = entry;
+	assert(entry - (struct index_entry *)mem == entries);
+	free(uhash);
 
 	return index;
 }
-- 
1.5.3.GIT

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH] diff-delta.c: pack the index structure
@ 2007-09-08  9:31 David Kastrup
  2007-09-08  8:54 ` [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} David Kastrup
  0 siblings, 1 reply; 6+ messages in thread
From: David Kastrup @ 2007-09-08  9:31 UTC (permalink / raw)
  To: git

In normal use cases, the performance wins are not overly impressive:
we get something like 5-10% due to the slightly better locality of
memory accesses using the packed structure.

However, since the data structure for index entries saves 33% of
memory on 32-bit platforms and 40% on 64-bit platforms, the behavior
when memory gets limited should be nicer.

This is a rather well-contained change.  One obvious improvement would
be sorting the elements in one bucket according to their hash, then
using binary probing to find the elements with the right hash value.

As it stands, the output should be strictly the same as previously
unless one uses the option for limiting the amount of used memory, in
which case the created packs might be better.

Signed-off-by: David Kastrup <dak@gnu.org>
---
 diff-delta.c |   74 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 0dde2f2..1b4b1c1 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -115,9 +115,13 @@ static const unsigned int U[256] = {
 struct index_entry {
 	const unsigned char *ptr;
 	unsigned int val;
-	struct index_entry *next;
 };
 
+struct unpacked_index_entry {
+	struct index_entry entry;
+	struct unpacked_index_entry *next;
+};	
+
 struct delta_index {
 	unsigned long memsize;
 	const void *src_buf;
@@ -131,7 +135,8 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 	const unsigned char *data, *buffer = buf;
 	struct delta_index *index;
-	struct index_entry *entry, **hash;
+	struct unpacked_index_entry *entry, **hash;
+	struct index_entry *aentry, **ahash;
 	void *mem;
 	unsigned long memsize;
 
@@ -148,28 +153,21 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	hmask = hsize - 1;
 
 	/* allocate lookup index */
-	memsize = sizeof(*index) +
-		  sizeof(*hash) * hsize +
+	memsize = sizeof(*hash) * hsize +
 		  sizeof(*entry) * entries;
 	mem = malloc(memsize);
 	if (!mem)
 		return NULL;
-	index = mem;
-	mem = index + 1;
 	hash = mem;
 	mem = hash + hsize;
 	entry = mem;
 
-	index->memsize = memsize;
-	index->src_buf = buf;
-	index->src_size = bufsize;
-	index->hash_mask = hmask;
 	memset(hash, 0, hsize * sizeof(*hash));
 
 	/* allocate an array to count hash entries */
 	hash_count = calloc(hsize, sizeof(*hash_count));
 	if (!hash_count) {
-		free(index);
+		free(hash);
 		return NULL;
 	}
 
@@ -183,12 +181,13 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 			val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 		if (val == prev_val) {
 			/* keep the lowest of consecutive identical blocks */
-			entry[-1].ptr = data + RABIN_WINDOW;
+			entry[-1].entry.ptr = data + RABIN_WINDOW;
+			--entries;
 		} else {
 			prev_val = val;
 			i = val & hmask;
-			entry->ptr = data + RABIN_WINDOW;
-			entry->val = val;
+			entry->entry.ptr = data + RABIN_WINDOW;
+			entry->entry.val = val;
 			entry->next = hash[i];
 			hash[i] = entry++;
 			hash_count[i]++;
@@ -212,16 +211,59 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 			continue;
 		entry = hash[i];
 		do {
-			struct index_entry *keep = entry;
+			struct unpacked_index_entry *keep = entry;
 			int skip = hash_count[i] / HASH_LIMIT;
 			do {
+				--entries;
 				entry = entry->next;
 			} while(--skip && entry);
+			++entries;
 			keep->next = entry;
 		} while(entry);
 	}
 	free(hash_count);
 
+	/* Now create the packed index in array form rather than
+	 * linked lists */
+
+	memsize = sizeof(*index)
+		+ sizeof(*ahash) * (hsize+1)
+		+ sizeof(*aentry) * entries;
+
+	mem = malloc(memsize);
+
+	if (!mem) {
+		free(hash);
+		return NULL;
+	}
+
+	index = mem;
+	index->memsize = memsize;
+	index->src_buf = buf;
+	index->src_size = bufsize;
+	index->hash_mask = hmask;
+
+	mem = index + 1;
+	ahash = mem;
+	mem = ahash + (hsize+1);
+	aentry = mem;
+
+	/* Coalesce all entries belonging to one linked list into
+	 * consecutive array entries */
+
+	for (i = 0; i < hsize; i++) {
+		ahash[i] = aentry;
+		for (entry = hash[i]; entry; entry = entry->next)
+			*aentry++ = entry->entry;
+	}
+
+	/* Sentinel value to indicate the length of the last hash
+	 * bucket */
+
+	ahash[hsize] = aentry;
+	assert(aentry - (struct index_entry *)mem == entries);
+	free(hash);
+
 	return index;
 }
 
@@ -302,7 +344,7 @@ create_delta(const struct delta_index *index,
 			val ^= U[data[-RABIN_WINDOW]];
 			val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 			i = val & index->hash_mask;
-			for (entry = index->hash[i]; entry; entry = entry->next) {
+			for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 				const unsigned char *ref = entry->ptr;
 				const unsigned char *src = data;
 				unsigned int ref_size = ref_top - ref;
-- 
1.5.3.GIT

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH] diff-delta.c: Rationalize culling of hash buckets
  2007-09-08  8:54 ` [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} David Kastrup
@ 2007-09-08 19:41   ` David Kastrup
  2007-09-08 20:52   ` [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} Nicolas Pitre
  1 sibling, 0 replies; 6+ messages in thread
From: David Kastrup @ 2007-09-08 19:41 UTC (permalink / raw)
  To: git

The previous hash bucket culling resulted in a somewhat unpredictable
number of hash bucket entries in the order of magnitude of HASH_LIMIT.

Replace this with a Bresenham-like algorithm leaving us with exactly
HASH_LIMIT entries by uniform culling.
---

This is assuming that the previous patch series has been applied
(lacking any final comments or Acks on it; but I think I addressed the
issues).  The hash bucket code culling has really annoyed me with its
unintuitive results.  If the preceding patch series is thrown out (I
can't think why it would), one could rework this to fit the current
master if really needed.

 diff-delta.c |   41 +++++++++++++++++++++++++++++++----------
 1 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index e7c33aa..98795c6 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -207,19 +207,40 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	 * the reference buffer.
 	 */
 	for (i = 0; i < hsize; i++) {
-		if (hash_count[i] < HASH_LIMIT)
+		int acc;
+
+		if (hash_count[i] <= HASH_LIMIT)
 			continue;
+
+		entries -= hash_count[i] - HASH_LIMIT;
+		/* We leave exactly HASH_LIMIT entries in the bucket */
+
 		uentry = uhash[i];
+		acc = 0;
 		do {
-			struct unpacked_index_entry *keep = uentry;
-			int skip = hash_count[i] / HASH_LIMIT;
-			do {
-				--entries;
-				uentry = uentry->next;
-			} while(--skip && uentry);
-			++entries;
-			keep->next = uentry;
-		} while(uentry);
+			acc += hash_count[i] - HASH_LIMIT;
+			if (acc > 0) {
+				struct unpacked_index_entry *keep = uentry;
+				do {
+					uentry = uentry->next;
+					acc -= HASH_LIMIT;
+				} while (acc > 0);
+				keep->next = uentry->next;
+			}
+			uentry = uentry->next;
+		} while (uentry);
+
+		/* Assume that this loop is gone through exactly
+		 * HASH_LIMIT times and is entered and left with
+		 * acc==0.  So the first statement in the loop
+		 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
+		 * to the accumulator, and the inner loop consequently
+		 * is run (hash_count[i]-HASH_LIMIT) times, removing
+		 * one element from the list each time.  Since acc
+		 * balances out to 0 at the final run, the inner loop
+		 * body can't be left with uentry==NULL.  So we indeed
+		 * encounter uentry==NULL in the outer loop only.
+		 */
 	}
 	free(hash_count);
 
-- 
1.5.3.GIT

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash}
  2007-09-08  8:54 ` [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} David Kastrup
  2007-09-08 19:41   ` [PATCH] diff-delta.c: Rationalize culling of hash buckets David Kastrup
@ 2007-09-08 20:52   ` Nicolas Pitre
  2007-09-08 21:06     ` David Kastrup
  1 sibling, 1 reply; 6+ messages in thread
From: Nicolas Pitre @ 2007-09-08 20:52 UTC (permalink / raw)
  To: David Kastrup; +Cc: git

On Sat, 8 Sep 2007, David Kastrup wrote:

> The variables for the packed entries are now called just entry and
> hash rather than aentry+ahash, and those for the unpacked entries have
> been renamed to uentry and uhash from the original entry and hash.
> 
> While this makes the diff to the unchanged code larger, it matches the
> type declarations better.

Since the packed version is used less often, could you rename that one 
instead, and use packed_entry and packed_hash forclarity.

Also please fold this with your other patch (not the culling one).

Then you can add my ACK.


Nicolas

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash}
  2007-09-08 20:52   ` [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} Nicolas Pitre
@ 2007-09-08 21:06     ` David Kastrup
  2007-09-09  0:38       ` Nicolas Pitre
  0 siblings, 1 reply; 6+ messages in thread
From: David Kastrup @ 2007-09-08 21:06 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@cam.org> writes:

> On Sat, 8 Sep 2007, David Kastrup wrote:
>
>> The variables for the packed entries are now called just entry and
>> hash rather than aentry+ahash, and those for the unpacked entries have
>> been renamed to uentry and uhash from the original entry and hash.
>> 
>> While this makes the diff to the unchanged code larger, it matches the
>> type declarations better.
>
> Since the packed version is used less often, could you rename that one 
> instead, and use packed_entry and packed_hash forclarity.
>
> Also please fold this with your other patch (not the culling one).

Uh, what other patch?  If I don't do the renames in the second patch,
then only the first patch remains.  Which I'll change to use the
proposed names.  Hope that's what you meant.

> Then you can add my ACK.

I'll do that, and rework the culling patch on top of that (unacked for
now).

-- 
David Kastrup, Kriemhildstr. 15, 44793 Bochum

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash}
  2007-09-08 21:06     ` David Kastrup
@ 2007-09-09  0:38       ` Nicolas Pitre
  0 siblings, 0 replies; 6+ messages in thread
From: Nicolas Pitre @ 2007-09-09  0:38 UTC (permalink / raw)
  To: David Kastrup; +Cc: git

On Sat, 8 Sep 2007, David Kastrup wrote:

> Nicolas Pitre <nico@cam.org> writes:
> 
> > On Sat, 8 Sep 2007, David Kastrup wrote:
> >
> >> The variables for the packed entries are now called just entry and
> >> hash rather than aentry+ahash, and those for the unpacked entries have
> >> been renamed to uentry and uhash from the original entry and hash.
> >> 
> >> While this makes the diff to the unchanged code larger, it matches the
> >> type declarations better.
> >
> > Since the packed version is used less often, could you rename that one 
> > instead, and use packed_entry and packed_hash forclarity.
> >
> > Also please fold this with your other patch (not the culling one).
> 
> Uh, what other patch?  If I don't do the renames in the second patch,
> then only the first patch remains.  Which I'll change to use the
> proposed names.  Hope that's what you meant.

Yes, that's what I meant.

> > Then you can add my ACK.
> 
> I'll do that, and rework the culling patch on top of that (unacked for
> now).

I'm looking at it atm.


Nicolas

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2007-09-09  8:43 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-09-08  9:31 [PATCH] diff-delta.c: pack the index structure David Kastrup
2007-09-08  8:54 ` [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} David Kastrup
2007-09-08 19:41   ` [PATCH] diff-delta.c: Rationalize culling of hash buckets David Kastrup
2007-09-08 20:52   ` [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} Nicolas Pitre
2007-09-08 21:06     ` David Kastrup
2007-09-09  0:38       ` Nicolas Pitre

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).