git@vger.kernel.org list mirror (unofficial, one of many)
* [PATCH] diff-delta.c: pack the index structure
@ 2007-09-08 21:17 David Kastrup
  2007-09-08 21:25 ` [PATCH] diff-delta.c: Rationalize culling of hash buckets David Kastrup
  0 siblings, 1 reply; 3+ messages in thread
From: David Kastrup @ 2007-09-08 21:17 UTC (permalink / raw)
  To: git

In normal use cases, the performance wins are not overly impressive:
we get something like 5-10% due to the slightly better locality of
memory accesses using the packed structure.

However, since the packed data structure for index entries saves 33%
of memory on 32-bit platforms and 40% on 64-bit platforms, the
behavior when memory gets tight should be nicer.
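
(Illustration only, not part of the patch: the entry layouts before
and after, with made-up struct names and unpadded sizes; the actual
sizeof() on 64-bit may additionally include alignment padding.)

	struct index_entry_old {		/* before this patch */
		const unsigned char *ptr;	/* 4 bytes on 32-bit, 8 on 64-bit */
		unsigned int val;		/* 4 bytes */
		struct index_entry_old *next;	/* 4 bytes on 32-bit, 8 on 64-bit */
	};					/* 12 bytes / 20 bytes unpadded */

	struct index_entry_new {		/* after this patch */
		const unsigned char *ptr;
		unsigned int val;
	};					/* 8 bytes / 12 bytes unpadded */
	/* 12 -> 8 bytes is the 33% quoted for 32-bit; 20 -> 12 bytes is
	 * the 40% for 64-bit (ignoring alignment padding). */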

This is a rather well-contained change.  One obvious improvement
would be sorting the elements within each bucket by their hash value,
then using binary probing to find the entries with the right hash
value.
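
(A minimal sketch of what such binary probing might look like,
assuming the packed layout introduced below and that each bucket were
additionally sorted by val; the function name and its existence are
hypothetical, not part of the patch.)

	/* Hypothetical: return the first entry in bucket i whose val
	 * matches, or NULL, assuming the bucket is sorted by val. */
	static const struct index_entry *
	find_first_val(const struct delta_index *index, unsigned int i,
		       unsigned int val)
	{
		const struct index_entry *lo = index->hash[i];
		const struct index_entry *hi = index->hash[i + 1];

		while (lo < hi) {
			const struct index_entry *mid = lo + (hi - lo) / 2;
			if (mid->val < val)
				lo = mid + 1;
			else
				hi = mid;
		}
		/* lo is now the first entry with val >= the wanted val,
		 * or the end of the bucket */
		return (lo < index->hash[i + 1] && lo->val == val) ? lo : NULL;
	}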

As it stands, the output should be strictly the same as before unless
one uses the option limiting the amount of memory used, in which case
the created packs might be better.

Signed-off-by: David Kastrup <dak@gnu.org>
Acked-by: Nicolas Pitre <nico@cam.org>
---
 diff-delta.c |   74 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index 0dde2f2..d355e5e 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -115,9 +115,13 @@ static const unsigned int U[256] = {
 struct index_entry {
 	const unsigned char *ptr;
 	unsigned int val;
-	struct index_entry *next;
 };
 
+struct unpacked_index_entry {
+	struct index_entry entry;
+	struct unpacked_index_entry *next;
+};	
+
 struct delta_index {
 	unsigned long memsize;
 	const void *src_buf;
@@ -131,7 +135,8 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 	const unsigned char *data, *buffer = buf;
 	struct delta_index *index;
-	struct index_entry *entry, **hash;
+	struct unpacked_index_entry *entry, **hash;
+	struct index_entry *packed_entry, **packed_hash;
 	void *mem;
 	unsigned long memsize;
 
@@ -148,28 +153,21 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	hmask = hsize - 1;
 
 	/* allocate lookup index */
-	memsize = sizeof(*index) +
-		  sizeof(*hash) * hsize +
+	memsize = sizeof(*hash) * hsize +
 		  sizeof(*entry) * entries;
 	mem = malloc(memsize);
 	if (!mem)
 		return NULL;
-	index = mem;
-	mem = index + 1;
 	hash = mem;
 	mem = hash + hsize;
 	entry = mem;
 
-	index->memsize = memsize;
-	index->src_buf = buf;
-	index->src_size = bufsize;
-	index->hash_mask = hmask;
 	memset(hash, 0, hsize * sizeof(*hash));
 
 	/* allocate an array to count hash entries */
 	hash_count = calloc(hsize, sizeof(*hash_count));
 	if (!hash_count) {
-		free(index);
+		free(hash);
 		return NULL;
 	}
 
@@ -183,12 +181,13 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 			val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 		if (val == prev_val) {
 			/* keep the lowest of consecutive identical blocks */
-			entry[-1].ptr = data + RABIN_WINDOW;
+			entry[-1].entry.ptr = data + RABIN_WINDOW;
+			--entries;
 		} else {
 			prev_val = val;
 			i = val & hmask;
-			entry->ptr = data + RABIN_WINDOW;
-			entry->val = val;
+			entry->entry.ptr = data + RABIN_WINDOW;
+			entry->entry.val = val;
 			entry->next = hash[i];
 			hash[i] = entry++;
 			hash_count[i]++;
@@ -212,16 +211,59 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 			continue;
 		entry = hash[i];
 		do {
-			struct index_entry *keep = entry;
+			struct unpacked_index_entry *keep = entry;
 			int skip = hash_count[i] / HASH_LIMIT;
 			do {
+				--entries;
 				entry = entry->next;
 			} while(--skip && entry);
+			++entries;
 			keep->next = entry;
 		} while(entry);
 	}
 	free(hash_count);
 
+	/* Now create the packed index in array form rather than
+	 * linked lists */
+
+	memsize = sizeof(*index)
+		+ sizeof(*packed_hash) * (hsize+1)
+		+ sizeof(*packed_entry) * entries;
+
+	mem = malloc(memsize);
+
+	if (!mem) {
+		free(hash);
+		return NULL;
+	}
+
+	index = mem;
+	index->memsize = memsize;
+	index->src_buf = buf;
+	index->src_size = bufsize;
+	index->hash_mask = hmask;
+
+	mem = index + 1;
+	packed_hash = mem;
+	mem = packed_hash + (hsize+1);
+	packed_entry = mem;
+
+	/* Coalesce all entries belonging to one linked list into
+	 * consecutive array entries */
+
+	for (i = 0; i < hsize; i++) {
+		packed_hash[i] = packed_entry;
+		for (entry = hash[i]; entry; entry = entry->next)
+			*packed_entry++ = entry->entry;
+	}
+
+	/* Sentinel value to indicate the length of the last hash
+	 * bucket */
+
+	packed_hash[hsize] = packed_entry;
+	assert(packed_entry - (struct index_entry *)mem == entries);
+	free(hash);
+
 	return index;
 }
 
@@ -302,7 +344,7 @@ create_delta(const struct delta_index *index,
 			val ^= U[data[-RABIN_WINDOW]];
 			val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 			i = val & index->hash_mask;
-			for (entry = index->hash[i]; entry; entry = entry->next) {
+			for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 				const unsigned char *ref = entry->ptr;
 				const unsigned char *src = data;
 				unsigned int ref_size = ref_top - ref;
-- 
1.5.3.GIT


* [PATCH] diff-delta.c: Rationalize culling of hash buckets
  2007-09-08 21:17 [PATCH] diff-delta.c: pack the index structure David Kastrup
@ 2007-09-08 21:25 ` David Kastrup
  0 siblings, 0 replies; 3+ messages in thread
From: David Kastrup @ 2007-09-08 21:25 UTC (permalink / raw)
  To: git

The previous hash bucket culling left a somewhat unpredictable number
of entries per hash bucket, on the order of magnitude of HASH_LIMIT.

Replace this with a Bresenham-like algorithm that culls uniformly,
leaving exactly HASH_LIMIT entries in each over-full bucket.
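
(Illustration only, not part of the patch: a simplified stand-alone
sketch of Bresenham-style uniform selection, keeping exactly limit
out of count items.  The function below is made up; the actual patch
works directly on the linked list with a slightly different
accumulator formulation.)

	#include <stdio.h>

	/* Keep exactly "limit" out of "count" items, spaced as evenly as
	 * integer arithmetic allows (Bresenham-style error accumulation). */
	static void show_survivors(int count, int limit)
	{
		int acc = 0, i;

		for (i = 0; i < count; i++) {
			acc += limit;		/* credit towards the next keeper */
			if (acc >= count) {	/* enough credit: keep item i */
				acc -= count;
				printf("keep %d\n", i);
			}			/* otherwise item i is culled */
		}
	}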

Signed-off-by: David Kastrup <dak@gnu.org>
---
This is to be applied on top of the patch for packing the index
structure, which is why it is posted as a followup to that article.
In contrast to that patch, this one has so far only been
tested/reviewed by myself.

 diff-delta.c |   41 +++++++++++++++++++++++++++++++----------
 1 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index d355e5e..1a8d6fb 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -207,19 +207,40 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	 * the reference buffer.
 	 */
 	for (i = 0; i < hsize; i++) {
-		if (hash_count[i] < HASH_LIMIT)
+		int acc;
+
+		if (hash_count[i] <= HASH_LIMIT)
 			continue;
+
+		entries -= hash_count[i] - HASH_LIMIT;
+		/* We leave exactly HASH_LIMIT entries in the bucket */
+
 		entry = hash[i];
+		acc = 0;
 		do {
-			struct unpacked_index_entry *keep = entry;
-			int skip = hash_count[i] / HASH_LIMIT;
-			do {
-				--entries;
-				entry = entry->next;
-			} while(--skip && entry);
-			++entries;
-			keep->next = entry;
-		} while(entry);
+			acc += hash_count[i] - HASH_LIMIT;
+			if (acc > 0) {
+				struct unpacked_index_entry *keep = entry;
+				do {
+					entry = entry->next;
+					acc -= HASH_LIMIT;
+				} while (acc > 0);
+				keep->next = entry->next;
+			}
+			entry = entry->next;
+		} while (entry);
+
+		/* Assume that this loop is gone through exactly
+		 * HASH_LIMIT times and is entered and left with
+		 * acc==0.  So the first statement in the loop
+		 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
+		 * to the accumulator, and the inner loop consequently
+		 * is run (hash_count[i]-HASH_LIMIT) times, removing
+		 * one element from the list each time.  Since acc
+		 * balances out to 0 at the final run, the inner loop
+		 * body can't be left with entry==NULL.  So we indeed
+		 * encounter entry==NULL in the outer loop only.
+		 */
 	}
 	free(hash_count);
 
-- 
1.5.3.GIT


* [PATCH] diff-delta.c: Rationalize culling of hash buckets
  2007-09-08  8:54 [PATCH] diff-delta.c: rename {a,}{entry,hash} to {,u}{entry,hash} David Kastrup
@ 2007-09-08 19:41 ` David Kastrup
  0 siblings, 0 replies; 3+ messages in thread
From: David Kastrup @ 2007-09-08 19:41 UTC (permalink / raw)
  To: git

The previous hash bucket culling left a somewhat unpredictable number
of entries per hash bucket, on the order of magnitude of HASH_LIMIT.

Replace this with a Bresenham-like algorithm that culls uniformly,
leaving exactly HASH_LIMIT entries in each over-full bucket.
---

This assumes that the previous patch series has been applied (it is
still lacking any final comments or Acks, but I think I addressed the
issues).  The hash bucket culling code has really annoyed me with its
unintuitive results.  If the preceding patch series is thrown out (I
can't think why it would be), one could rework this to fit the
current master if really needed.

 diff-delta.c |   41 +++++++++++++++++++++++++++++++----------
 1 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/diff-delta.c b/diff-delta.c
index e7c33aa..98795c6 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -207,19 +207,40 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	 * the reference buffer.
 	 */
 	for (i = 0; i < hsize; i++) {
-		if (hash_count[i] < HASH_LIMIT)
+		int acc;
+
+		if (hash_count[i] <= HASH_LIMIT)
 			continue;
+
+		entries -= hash_count[i] - HASH_LIMIT;
+		/* We leave exactly HASH_LIMIT entries in the bucket */
+
 		uentry = uhash[i];
+		acc = 0;
 		do {
-			struct unpacked_index_entry *keep = uentry;
-			int skip = hash_count[i] / HASH_LIMIT;
-			do {
-				--entries;
-				uentry = uentry->next;
-			} while(--skip && uentry);
-			++entries;
-			keep->next = uentry;
-		} while(uentry);
+			acc += hash_count[i] - HASH_LIMIT;
+			if (acc > 0) {
+				struct unpacked_index_entry *keep = uentry;
+				do {
+					uentry = uentry->next;
+					acc -= HASH_LIMIT;
+				} while (acc > 0);
+				keep->next = uentry->next;
+			}
+			uentry = uentry->next;
+		} while (uentry);
+
+		/* Assume that this loop is gone through exactly
+		 * HASH_LIMIT times and is entered and left with
+		 * acc==0.  So the first statement in the loop
+		 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
+		 * to the accumulator, and the inner loop consequently
+		 * is run (hash_count[i]-HASH_LIMIT) times, removing
+		 * one element from the list each time.  Since acc
+		 * balances out to 0 at the final run, the inner loop
+		 * body can't be left with uentry==NULL.  So we indeed
+		 * encounter uentry==NULL in the outer loop only.
+		 */
 	}
 	free(hash_count);
 
-- 
1.5.3.GIT

