* fast-import's hash table is slow
@ 2020-03-31  9:45 Jeff King
  2020-03-31 19:14 ` René Scharfe
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff King @ 2020-03-31  9:45 UTC (permalink / raw)
  To: git

[breaking thread, since this is really an independent topic]

On Mon, Mar 30, 2020 at 10:09:30AM -0400, Jeff King wrote:

> So I arrived at this fast-import solution, which was...not super fast.
> Profiling showed that we were spending 80% of the time inserting into
> our custom hashtable, which is fixed at 2^16 entries and then chains
> beyond that. Swapping it out for a khash proved much faster, but I'm not
> sure if the memory games are too gross (see the comment in find_object
> below).
> 
> I also didn't look into whether we could get rid of the extra allocating
> pool (and store the structs right in the hash), or if it's necessary for
> their pointers to be stable.

I briefly tried to get rid of the pool. I _think_ it should be possible,
but I did see some test failures. It's entirely possible I screwed it
up. However, I did generate a few interesting measurements showing how
the current hash table behaves on this test:

  git init repo
  cd repo
  perl -e '
      my $bits = shift;
      my $nr = 2**$bits;

      for (my $i = 0; $i < $nr; $i++) {
              print "blob\n";
              print "data 4\n";
              print pack("N", $i);
      }
  ' "$@" | git fast-import

Here are wall-clock timings for the current tip of master, versus with
the patch below applied:

nr_objects   master       patch
2^20         0m04.317s    0m5.109s
2^21         0m10.204s    0m9.702s
2^22         0m27.159s    0m17.911s
2^23         1m19.038s    0m35.080s
2^24         4m18.766s    1m10.233s

The curve on master is quadratic-ish (each line has double the number of
objects of the previous one; the times don't multiply by 4, but that's
because the hash table is only part of the work we're doing). With my
patch, it's pretty linear.
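
(A loop along these lines should reproduce the table; "gen.pl" here is
just a stand-in name for the perl snippet above:)

  for bits in 20 21 22 23 24
  do
          rm -rf repo
          git init -q repo
          cd repo
          time (perl ../gen.pl "$bits" | git fast-import)
          cd ..
  done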

But I'm still disappointed that the smallest case is actually _slower_
with the patch. The existing hash table is so simple I can imagine using
khash has a little overhead. But I'm surprised it would be so much (or
that the existing hash table does OK at 2^20; it only has 2^16 buckets).
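
(Back-of-the-envelope: 2^20 entries spread over 2^16 buckets is an
average chain length of 16 by the end, so each insert ends up scanning
a dozen or so entries -- apparently still cheap enough at that size.)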

Maybe this email will nerd-snipe René into poking at it.

The patch I tested is below (it's slightly different than what I showed
before, in that it handles duplicate insertions). Maybe using hashmap.c
would be better?

---
diff --git a/fast-import.c b/fast-import.c
index 202dda11a6..6ebac665a0 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -39,12 +39,25 @@
 
 struct object_entry {
 	struct pack_idx_entry idx;
-	struct object_entry *next;
 	uint32_t type : TYPE_BITS,
 		pack_id : PACK_ID_BITS,
 		depth : DEPTH_BITS;
 };
 
+static inline unsigned int object_entry_hash(struct object_entry *oe)
+{
+	return oidhash(&oe->idx.oid);
+}
+
+static inline int object_entry_equal(struct object_entry *a,
+				     struct object_entry *b)
+{
+	return oideq(&a->idx.oid, &b->idx.oid);
+}
+
+KHASH_INIT(object_entry_set, struct object_entry *, int, 0,
+	   object_entry_hash, object_entry_equal);
+
 struct object_entry_pool {
 	struct object_entry_pool *next_pool;
 	struct object_entry *next_free;
@@ -178,7 +191,7 @@ static off_t pack_size;
 /* Table of objects we've written. */
 static unsigned int object_entry_alloc = 5000;
 static struct object_entry_pool *blocks;
-static struct object_entry *object_table[1 << 16];
+static kh_object_entry_set_t object_table;
 static struct mark_set *marks;
 static const char *export_marks_file;
 static const char *import_marks_file;
@@ -455,44 +468,45 @@ static struct object_entry *new_object(struct object_id *oid)
 
 static struct object_entry *find_object(struct object_id *oid)
 {
-	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
-	struct object_entry *e;
-	for (e = object_table[h]; e; e = e->next)
-		if (oideq(oid, &e->idx.oid))
-			return e;
+	/*
+	 * this cast works because we only look at the oid part of the entry,
+	 * and it comes first in the struct
+	 */
+	khiter_t pos = kh_get_object_entry_set(&object_table,
+					       (struct object_entry *)oid);
+	if (pos != kh_end(&object_table))
+		return kh_key(&object_table, pos);
 	return NULL;
 }
 
 static struct object_entry *insert_object(struct object_id *oid)
 {
-	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
-	struct object_entry *e = object_table[h];
+	struct object_entry *e;
+	int was_empty;
+	khiter_t pos;
 
-	while (e) {
-		if (oideq(oid, &e->idx.oid))
-			return e;
-		e = e->next;
-	}
+	pos = kh_put_object_entry_set(&object_table, (struct object_entry *)oid, &was_empty);
+	if (!was_empty)
+		return kh_key(&object_table, pos);
 
 	e = new_object(oid);
-	e->next = object_table[h];
 	e->idx.offset = 0;
-	object_table[h] = e;
+	kh_key(&object_table, pos) = e;
 	return e;
 }
 
 static void invalidate_pack_id(unsigned int id)
 {
-	unsigned int h;
 	unsigned long lu;
 	struct tag *t;
+	khiter_t iter;
 
-	for (h = 0; h < ARRAY_SIZE(object_table); h++) {
-		struct object_entry *e;
-
-		for (e = object_table[h]; e; e = e->next)
+	for (iter = kh_begin(&object_table); iter != kh_end(&object_table); iter++) {
+		if (kh_exist(&object_table, iter)) {
+			struct object_entry *e = kh_key(&object_table, iter);
 			if (e->pack_id == id)
 				e->pack_id = MAX_PACK_ID;
+		}
 	}
 
 	for (lu = 0; lu < branch_table_sz; lu++) {


* Re: fast-import's hash table is slow
  2020-03-31  9:45 fast-import's hash table is slow Jeff King
@ 2020-03-31 19:14 ` René Scharfe
  2020-03-31 23:21   ` René Scharfe
  2020-04-01 10:35   ` Jeff King
  0 siblings, 2 replies; 12+ messages in thread
From: René Scharfe @ 2020-03-31 19:14 UTC (permalink / raw)
  To: Jeff King, git

Am 31.03.20 um 11:45 schrieb Jeff King:
> [breaking thread, since this is really an independent topic]
>
> On Mon, Mar 30, 2020 at 10:09:30AM -0400, Jeff King wrote:
>
>> So I arrived at this fast-import solution, which was...not super fast.
>> Profiling showed that we were spending 80% of the time inserting into
>> our custom hashtable, which is fixed at 2^16 entries and then chains
>> beyond that. Swapping it out for a khash proved much faster, but I'm not
>> sure if the memory games are too gross (see the comment in find_object
>> below).
>>
>> I also didn't look into whether we could get rid of the extra allocating
>> pool (and store the structs right in the hash), or if it's necessary for
>> their pointers to be stable.
>
> I briefly tried to get rid of the pool. I _think_ it should be possible,
> but I did see some test failures. It's entirely possible I screwed it
> up. However, I did generate a few interesting measurements showing how
> the current hash table behaves on this test:
>
>   git init repo
>   cd repo
>   perl -e '
>       my $bits = shift;
>       my $nr = 2**$bits;
>
>       for (my $i = 0; $i < $nr; $i++) {
>               print "blob\n";
>               print "data 4\n";
>               print pack("N", $i);
>       }
>   ' "$@" | git fast-import
>
> Here are wall-clock timings for the current tip of master, versus with
> the patch below applied:
>
> nr_objects   master       patch
> 2^20         0m04.317s    0m5.109s
> 2^21         0m10.204s    0m9.702s
> 2^22         0m27.159s    0m17.911s
> 2^23         1m19.038s    0m35.080s
> 2^24         4m18.766s    1m10.233s

I get similar numbers.

Pre-sizing by putting this near the top of cmd_main() gets the time
for 1M down to 4 seconds:

	kh_resize_object_entry_set(&object_table, 1 << 18);

The more fair 1 << 16 does not cut it; the totally unfair 1 << 20 gives
a small extra boost.

>
> The curve on master is quadratic-ish (each line has double the number of
> objects of the previous one; the times don't multiply by 4, but that's
> because the hash table is only part of the work we're doing). With my
> patch, it's pretty linear.
>
> But I'm still disappointed that the smallest case is actually _slower_
> with the patch. The existing hash table is so simple I can imagine using
> khash has a little overhead. But I'm surprised it would be so much (or
> that the existing hash table does OK at 2^20; it only has 2^16 buckets).
>
> Maybe this email will nerd-snipe René into poking at it.
>
> The patch I tested is below (it's slightly different than what I showed
> before, in that it handles duplicate insertions). Maybe using hashmap.c
> would be better?
>
> ---
> diff --git a/fast-import.c b/fast-import.c
> index 202dda11a6..6ebac665a0 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -39,12 +39,25 @@
>
>  struct object_entry {
>  	struct pack_idx_entry idx;
> -	struct object_entry *next;
>  	uint32_t type : TYPE_BITS,
>  		pack_id : PACK_ID_BITS,
>  		depth : DEPTH_BITS;
>  };
>
> +static inline unsigned int object_entry_hash(struct object_entry *oe)
> +{
> +	return oidhash(&oe->idx.oid);
> +}
> +
> +static inline int object_entry_equal(struct object_entry *a,
> +				     struct object_entry *b)
> +{
> +	return oideq(&a->idx.oid, &b->idx.oid);
> +}
> +
> +KHASH_INIT(object_entry_set, struct object_entry *, int, 0,
> +	   object_entry_hash, object_entry_equal);
> +
>  struct object_entry_pool {
>  	struct object_entry_pool *next_pool;
>  	struct object_entry *next_free;
> @@ -178,7 +191,7 @@ static off_t pack_size;
>  /* Table of objects we've written. */
>  static unsigned int object_entry_alloc = 5000;
>  static struct object_entry_pool *blocks;
> -static struct object_entry *object_table[1 << 16];
> +static kh_object_entry_set_t object_table;
>  static struct mark_set *marks;
>  static const char *export_marks_file;
>  static const char *import_marks_file;
> @@ -455,44 +468,45 @@ static struct object_entry *new_object(struct object_id *oid)
>
>  static struct object_entry *find_object(struct object_id *oid)
>  {
> -	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
> -	struct object_entry *e;
> -	for (e = object_table[h]; e; e = e->next)
> -		if (oideq(oid, &e->idx.oid))
> -			return e;
> +	/*
> +	 * this cast works because we only look at the oid part of the entry,
> +	 * and it comes first in the struct
> +	 */
> +	khiter_t pos = kh_get_object_entry_set(&object_table,
> +					       (struct object_entry *)oid);

Dirty, but I can believe the comment.


> +	if (pos != kh_end(&object_table))
> +		return kh_key(&object_table, pos);
>  	return NULL;
>  }
>
>  static struct object_entry *insert_object(struct object_id *oid)
>  {
> -	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
> -	struct object_entry *e = object_table[h];
> +	struct object_entry *e;
> +	int was_empty;
> +	khiter_t pos;
>
> -	while (e) {
> -		if (oideq(oid, &e->idx.oid))
> -			return e;
> -		e = e->next;
> -	}
> +	pos = kh_put_object_entry_set(&object_table, (struct object_entry *)oid, &was_empty);

Now this looks illegal.  khash is surely reading a full object_entry from oid,
which only is a mere object_id, no?

> +	if (!was_empty)
> +		return kh_key(&object_table, pos);
>
>  	e = new_object(oid);
> -	e->next = object_table[h];
>  	e->idx.offset = 0;
> -	object_table[h] = e;
> +	kh_key(&object_table, pos) = e;
>  	return e;
>  }
>
>  static void invalidate_pack_id(unsigned int id)
>  {
> -	unsigned int h;
>  	unsigned long lu;
>  	struct tag *t;
> +	khiter_t iter;
>
> -	for (h = 0; h < ARRAY_SIZE(object_table); h++) {
> -		struct object_entry *e;
> -
> -		for (e = object_table[h]; e; e = e->next)
> +	for (iter = kh_begin(&object_table); iter != kh_end(&object_table); iter++) {
> +		if (kh_exist(&object_table, iter)) {
> +			struct object_entry *e = kh_key(&object_table, iter);
>  			if (e->pack_id == id)
>  				e->pack_id = MAX_PACK_ID;
> +		}
>  	}

Is this really the best way to handle that, independently of the hashmap
that's used?  I wonder how an extra hashmap or set of valid pack_id
values (or set of invalidated pack_id values?) would fare against having
to touch all object entries here.
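
Something like this flag array is the kind of thing I mean -- just a
sketch, names invented; pack_id fits in PACK_ID_BITS, so the places
that currently compare against MAX_PACK_ID would consult it instead:

	/* one flag per possible pack id */
	static unsigned char pack_id_invalidated[MAX_PACK_ID + 1];

	static void invalidate_pack_id(unsigned int id)
	{
		pack_id_invalidated[id] = 1;
	}

	static int pack_id_is_valid(unsigned int id)
	{
		return !pack_id_invalidated[id];
	}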

>
>  	for (lu = 0; lu < branch_table_sz; lu++) {
>



* Re: fast-import's hash table is slow
  2020-03-31 19:14 ` René Scharfe
@ 2020-03-31 23:21   ` René Scharfe
  2020-04-01 10:24     ` Jeff King
  2020-04-01 10:35   ` Jeff King
  1 sibling, 1 reply; 12+ messages in thread
From: René Scharfe @ 2020-03-31 23:21 UTC (permalink / raw)
  To: Jeff King, git

Am 31.03.20 um 21:14 schrieb René Scharfe:
> Am 31.03.20 um 11:45 schrieb Jeff King:
>> diff --git a/fast-import.c b/fast-import.c
>> index 202dda11a6..6ebac665a0 100644
>> --- a/fast-import.c
>> +++ b/fast-import.c
>> @@ -39,12 +39,25 @@
>>
>>  struct object_entry {
>>  	struct pack_idx_entry idx;
>> -	struct object_entry *next;
>>  	uint32_t type : TYPE_BITS,
>>  		pack_id : PACK_ID_BITS,
>>  		depth : DEPTH_BITS;
>>  };
>>
>> +static inline unsigned int object_entry_hash(struct object_entry *oe)
>> +{
>> +	return oidhash(&oe->idx.oid);
>> +}
>> +
>> +static inline int object_entry_equal(struct object_entry *a,
>> +				     struct object_entry *b)
>> +{
>> +	return oideq(&a->idx.oid, &b->idx.oid);
>> +}
>> +
>> +KHASH_INIT(object_entry_set, struct object_entry *, int, 0,
>> +	   object_entry_hash, object_entry_equal);
>> +
>>  struct object_entry_pool {
>>  	struct object_entry_pool *next_pool;
>>  	struct object_entry *next_free;
>> @@ -178,7 +191,7 @@ static off_t pack_size;
>>  /* Table of objects we've written. */
>>  static unsigned int object_entry_alloc = 5000;
>>  static struct object_entry_pool *blocks;
>> -static struct object_entry *object_table[1 << 16];
>> +static kh_object_entry_set_t object_table;
>>  static struct mark_set *marks;
>>  static const char *export_marks_file;
>>  static const char *import_marks_file;
>> @@ -455,44 +468,45 @@ static struct object_entry *new_object(struct object_id *oid)
>>
>>  static struct object_entry *find_object(struct object_id *oid)
>>  {
>> -	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
>> -	struct object_entry *e;
>> -	for (e = object_table[h]; e; e = e->next)
>> -		if (oideq(oid, &e->idx.oid))
>> -			return e;
>> +	/*
>> +	 * this cast works because we only look at the oid part of the entry,
>> +	 * and it comes first in the struct
>> +	 */
>> +	khiter_t pos = kh_get_object_entry_set(&object_table,
>> +					       (struct object_entry *)oid);
>
> Dirty, but I can believe the comment.
>
>
>> +	if (pos != kh_end(&object_table))
>> +		return kh_key(&object_table, pos);
>>  	return NULL;
>>  }
>>
>>  static struct object_entry *insert_object(struct object_id *oid)
>>  {
>> -	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
>> -	struct object_entry *e = object_table[h];
>> +	struct object_entry *e;
>> +	int was_empty;
>> +	khiter_t pos;
>>
>> -	while (e) {
>> -		if (oideq(oid, &e->idx.oid))
>> -			return e;
>> -		e = e->next;
>> -	}
>> +	pos = kh_put_object_entry_set(&object_table, (struct object_entry *)oid, &was_empty);
>
> Now this looks illegal.  khash is surely reading a full object_entry from oid,
> which only is a mere object_id, no?

No, it's a set of pointers, and khash only accesses the referenced objects
via the hash and comparison functions.

Storing the objects directly in the set and getting rid of new_object()
could improve performance due to reduced dereferencing overhead -- or
make it slower because more data has to be copied when the hashmap needs
to grow.  Worth a shot.  Later.

René


* Re: fast-import's hash table is slow
  2020-03-31 23:21   ` René Scharfe
@ 2020-04-01 10:24     ` Jeff King
  2020-04-02 18:40       ` René Scharfe
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff King @ 2020-04-01 10:24 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

On Wed, Apr 01, 2020 at 01:21:08AM +0200, René Scharfe wrote:

> >> +	pos = kh_put_object_entry_set(&object_table, (struct object_entry *)oid, &was_empty);
> >
> > Now this looks illegal.  khash is surely reading a full object_entry from oid,
> > which only is a mere object_id, no?
> 
> No, it's a set of pointers, and khash only accesses the referenced objects
> via the hash and comparison functions.
> 
> Storing the objects directly in the set and getting rid of new_object()
> could improve performance due to reduced dereferencing overhead -- or
> make it slower because more data has to be copied when the hashmap needs
> to grow.  Worth a shot.  Later.

Yeah. I tried that, too, but it caused tons of test failures. Quite
possibly just a bug in my patch, which I did as quickly as possible. But
I think it performed about the same. Here it is for reference (though
you may be better off to start from scratch).

Note the "this is OK to cast from oid to object_entry" comment is
leftover from the earlier attempt, but it probably _isn't_ OK. Even
though we don't do anything actionable on the non-oid bytes, they do get
passed by value, which could mean reading random bytes.

---
diff --git a/fast-import.c b/fast-import.c
index 202dda11a6..5a1b451971 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -39,18 +39,24 @@
 
 struct object_entry {
 	struct pack_idx_entry idx;
-	struct object_entry *next;
 	uint32_t type : TYPE_BITS,
 		pack_id : PACK_ID_BITS,
 		depth : DEPTH_BITS;
 };
 
-struct object_entry_pool {
-	struct object_entry_pool *next_pool;
-	struct object_entry *next_free;
-	struct object_entry *end;
-	struct object_entry entries[FLEX_ARRAY]; /* more */
-};
+static inline unsigned int object_entry_hash(struct object_entry oe)
+{
+	return oidhash(&oe.idx.oid);
+}
+
+static inline int object_entry_equal(struct object_entry a,
+				     struct object_entry b)
+{
+	return oideq(&a.idx.oid, &b.idx.oid);
+}
+
+KHASH_INIT(object_entry_set, struct object_entry, int, 0,
+	   object_entry_hash, object_entry_equal);
 
 struct mark_set {
 	union {
@@ -176,9 +182,7 @@ static struct packed_git **all_packs;
 static off_t pack_size;
 
 /* Table of objects we've written. */
-static unsigned int object_entry_alloc = 5000;
-static struct object_entry_pool *blocks;
-static struct object_entry *object_table[1 << 16];
+static kh_object_entry_set_t object_table;
 static struct mark_set *marks;
 static const char *export_marks_file;
 static const char *import_marks_file;
@@ -428,71 +432,44 @@ static void set_checkpoint_signal(void)
 
 #endif
 
-static void alloc_objects(unsigned int cnt)
-{
-	struct object_entry_pool *b;
-
-	b = xmalloc(sizeof(struct object_entry_pool)
-		+ cnt * sizeof(struct object_entry));
-	b->next_pool = blocks;
-	b->next_free = b->entries;
-	b->end = b->entries + cnt;
-	blocks = b;
-	alloc_count += cnt;
-}
-
-static struct object_entry *new_object(struct object_id *oid)
-{
-	struct object_entry *e;
-
-	if (blocks->next_free == blocks->end)
-		alloc_objects(object_entry_alloc);
-
-	e = blocks->next_free++;
-	oidcpy(&e->idx.oid, oid);
-	return e;
-}
-
 static struct object_entry *find_object(struct object_id *oid)
 {
-	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
-	struct object_entry *e;
-	for (e = object_table[h]; e; e = e->next)
-		if (oideq(oid, &e->idx.oid))
-			return e;
+	/*
+	 * this cast works because we only look at the oid part of the entry,
+	 * and it comes first in the struct
+	 */
+	khiter_t pos = kh_get_object_entry_set(&object_table,
+					       *(struct object_entry *)oid);
+	if (pos != kh_end(&object_table))
+		return &kh_key(&object_table, pos);
 	return NULL;
 }
 
 static struct object_entry *insert_object(struct object_id *oid)
 {
-	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
-	struct object_entry *e = object_table[h];
+	struct object_entry e;
+	int was_empty;
+	khiter_t pos;
 
-	while (e) {
-		if (oideq(oid, &e->idx.oid))
-			return e;
-		e = e->next;
-	}
+	oidcpy(&e.idx.oid, oid);
+	e.idx.offset = 0;
+	pos = kh_put_object_entry_set(&object_table, e, &was_empty);
 
-	e = new_object(oid);
-	e->next = object_table[h];
-	e->idx.offset = 0;
-	object_table[h] = e;
-	return e;
+	return &kh_key(&object_table, pos);
 }
 
 static void invalidate_pack_id(unsigned int id)
 {
-	unsigned int h;
 	unsigned long lu;
 	struct tag *t;
+	khiter_t iter;
 
-	for (h = 0; h < ARRAY_SIZE(object_table); h++) {
-		struct object_entry *e;
-
-		for (e = object_table[h]; e; e = e->next)
+	for (iter = kh_begin(&object_table); iter != kh_end(&object_table); iter++) {
+		if (kh_exist(&object_table, iter)) {
+			struct object_entry *e = &kh_key(&object_table, iter);
 			if (e->pack_id == id)
 				e->pack_id = MAX_PACK_ID;
+		}
 	}
 
 	for (lu = 0; lu < branch_table_sz; lu++) {
@@ -766,15 +743,18 @@ static const char *create_index(void)
 	const char *tmpfile;
 	struct pack_idx_entry **idx, **c, **last;
 	struct object_entry *e;
-	struct object_entry_pool *o;
+	khiter_t iter;
 
 	/* Build the table of object IDs. */
 	ALLOC_ARRAY(idx, object_count);
 	c = idx;
-	for (o = blocks; o; o = o->next_pool)
-		for (e = o->next_free; e-- != o->entries;)
+	for (iter = kh_begin(&object_table); iter != kh_end(&object_table); iter++) {
+		if (kh_exist(&object_table, iter)) {
+			e = &kh_key(&object_table, iter);
 			if (pack_id == e->pack_id)
 				*c++ = &e->idx;
+		}
+	}
 	last = idx + object_count;
 	if (c != last)
 		die("internal consistency error creating the index");
@@ -3504,7 +3484,6 @@ int cmd_main(int argc, const char **argv)
 	reset_pack_idx_option(&pack_idx_opts);
 	git_pack_config();
 
-	alloc_objects(object_entry_alloc);
 	strbuf_init(&command_buf, 0);
 	atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*));
 	branch_table = xcalloc(branch_table_sz, sizeof(struct branch*));


* Re: fast-import's hash table is slow
  2020-03-31 19:14 ` René Scharfe
  2020-03-31 23:21   ` René Scharfe
@ 2020-04-01 10:35   ` Jeff King
  2020-04-01 11:16     ` Jeff King
  1 sibling, 1 reply; 12+ messages in thread
From: Jeff King @ 2020-04-01 10:35 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

On Tue, Mar 31, 2020 at 09:14:58PM +0200, René Scharfe wrote:

> > nr_objects   master       patch
> > 2^20         0m04.317s    0m5.109s
> > 2^21         0m10.204s    0m9.702s
> > 2^22         0m27.159s    0m17.911s
> > 2^23         1m19.038s    0m35.080s
> > 2^24         4m18.766s    1m10.233s
> 
> I get similar numbers.
> 
> Pre-sizing by putting this near the top of cmd_main() gets the time
> for 1M down to 4 seconds:
> 
> 	kh_resize_object_entry_set(&object_table, 1 << 18);
> 
> The more fair 1 << 16 does not cut it; the totally unfair 1 << 20 gives
> a small extra boost.

Good call. I can reproduce those results, too ("1 << 20" gives me a 12%
overall speedup). I'm surprised the growth isn't aggressive enough for
this early expansion to get lost in the noise.

> > +	/*
> > +	 * this cast works because we only look at the oid part of the entry,
> > +	 * and it comes first in the struct
> > +	 */
> > +	khiter_t pos = kh_get_object_entry_set(&object_table,
> > +					       (struct object_entry *)oid);
> 
> Dirty, but I can believe the comment.

Our hashmap.c implementation gets around this by letting the equality
function take an extra parameter. It's annoying when you're writing
those functions, but it should allow this case without any casting (or
preemptively allocating a struct).

> > -	for (h = 0; h < ARRAY_SIZE(object_table); h++) {
> > -		struct object_entry *e;
> > -
> > -		for (e = object_table[h]; e; e = e->next)
> > +	for (iter = kh_begin(&object_table); iter != kh_end(&object_table); iter++) {
> > +		if (kh_exist(&object_table, iter)) {
> > +			struct object_entry *e = kh_key(&object_table, iter);
> >  			if (e->pack_id == id)
> >  				e->pack_id = MAX_PACK_ID;
> > +		}
> >  	}
> 
> Is this really the best way to handle that, independently of the hashmap
> that's used?  I wonder how an extra hashmap or set of valid pack_id
> values (or set of invalidated pack_id values?) would fare against having
> to touch all object entries here.

I think the invalidation is pretty infrequent. It only gets called by
end_packfile() when there are few enough objects to loosen them. So
usually that would only happen once per process. You can also trigger it
manually with a "checkpoint" command, but if you're checkpointing often
enough to dump loose objects, I suspect you have other performance
problems.

-Peff


* Re: fast-import's hash table is slow
  2020-04-01 10:35   ` Jeff King
@ 2020-04-01 11:16     ` Jeff King
  2020-04-02 18:40       ` René Scharfe
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff King @ 2020-04-01 11:16 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

On Wed, Apr 01, 2020 at 06:35:23AM -0400, Jeff King wrote:

> > > +	/*
> > > +	 * this cast works because we only look at the oid part of the entry,
> > > +	 * and it comes first in the struct
> > > +	 */
> > > +	khiter_t pos = kh_get_object_entry_set(&object_table,
> > > +					       (struct object_entry *)oid);
> > 
> > Dirty, but I can believe the comment.
> 
> Our hashmap.c implementation gets around this by letting the equality
> function take an extra parameter. It's annoying when you're writing
> those functions, but it should allow this case without any casting (or
> preemptively allocating a struct).

And here's a patch trying that. Much to my surprise, it outperforms
khash, which has generally been faster in previous tests.

Here are the numbers I get:

nr_objects   master       khash      hashmap
2^20         0m4.317s     0m5.109s   0m3.890s
2^21         0m10.204s    0m9.702s   0m7.933s
2^22         0m27.159s    0m17.911s  0m16.751s
2^23         1m19.038s    0m35.080s  0m31.963s
2^24         4m18.766s    1m10.233s  1m6.793s

And I didn't even have to pre-size the table. This really makes me
wonder if there's some silly inefficiency in khash which we could be
addressing. Or maybe open-addressing really does lose to chaining here,
but I think we keep the load factor low enough that it should be a win.

---
diff --git a/fast-import.c b/fast-import.c
index 202dda11a6..0ef6defc10 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -39,12 +39,28 @@
 
 struct object_entry {
 	struct pack_idx_entry idx;
-	struct object_entry *next;
+	struct hashmap_entry ent;
 	uint32_t type : TYPE_BITS,
 		pack_id : PACK_ID_BITS,
 		depth : DEPTH_BITS;
 };
 
+static int object_entry_hashcmp(const void *map_data,
+				const struct hashmap_entry *eptr,
+				const struct hashmap_entry *entry_or_key,
+				const void *keydata)
+{
+	const struct object_id *oid = keydata;
+	const struct object_entry *e1, *e2;
+
+	e1 = container_of(eptr, const struct object_entry, ent);
+	if (oid)
+		return oidcmp(&e1->idx.oid, oid);
+
+	e2 = container_of(entry_or_key, const struct object_entry, ent);
+	return oidcmp(&e1->idx.oid, &e2->idx.oid);
+}
+
 struct object_entry_pool {
 	struct object_entry_pool *next_pool;
 	struct object_entry *next_free;
@@ -178,7 +194,7 @@ static off_t pack_size;
 /* Table of objects we've written. */
 static unsigned int object_entry_alloc = 5000;
 static struct object_entry_pool *blocks;
-static struct object_entry *object_table[1 << 16];
+static struct hashmap object_table;
 static struct mark_set *marks;
 static const char *export_marks_file;
 static const char *import_marks_file;
@@ -455,44 +471,42 @@ static struct object_entry *new_object(struct object_id *oid)
 
 static struct object_entry *find_object(struct object_id *oid)
 {
-	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
-	struct object_entry *e;
-	for (e = object_table[h]; e; e = e->next)
-		if (oideq(oid, &e->idx.oid))
-			return e;
+	struct hashmap_entry lookup_entry, *e;
+
+	hashmap_entry_init(&lookup_entry, oidhash(oid));
+	e = hashmap_get(&object_table, &lookup_entry, oid);
+	if (e)
+		return container_of(e, struct object_entry, ent);
 	return NULL;
 }
 
 static struct object_entry *insert_object(struct object_id *oid)
 {
-	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
-	struct object_entry *e = object_table[h];
+	struct object_entry *e;
+	struct hashmap_entry lookup_entry, *hashent;
 
-	while (e) {
-		if (oideq(oid, &e->idx.oid))
-			return e;
-		e = e->next;
-	}
+	hashmap_entry_init(&lookup_entry, oidhash(oid));
+	hashent = hashmap_get(&object_table, &lookup_entry, oid);
+	if (hashent)
+		return container_of(hashent, struct object_entry, ent);
 
 	e = new_object(oid);
-	e->next = object_table[h];
 	e->idx.offset = 0;
-	object_table[h] = e;
+	e->ent.hash = lookup_entry.hash;
+	hashmap_add(&object_table, &e->ent);
 	return e;
 }
 
 static void invalidate_pack_id(unsigned int id)
 {
-	unsigned int h;
 	unsigned long lu;
 	struct tag *t;
+	struct hashmap_iter iter;
+	struct object_entry *e;
 
-	for (h = 0; h < ARRAY_SIZE(object_table); h++) {
-		struct object_entry *e;
-
-		for (e = object_table[h]; e; e = e->next)
-			if (e->pack_id == id)
-				e->pack_id = MAX_PACK_ID;
+	hashmap_for_each_entry(&object_table, &iter, e, ent) {
+		if (e->pack_id == id)
+			e->pack_id = MAX_PACK_ID;
 	}
 
 	for (lu = 0; lu < branch_table_sz; lu++) {
@@ -3511,6 +3525,8 @@ int cmd_main(int argc, const char **argv)
 	avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*));
 	marks = mem_pool_calloc(&fi_mem_pool, 1, sizeof(struct mark_set));
 
+	hashmap_init(&object_table, object_entry_hashcmp, NULL, 0);
+
 	/*
 	 * We don't parse most options until after we've seen the set of
 	 * "feature" lines at the start of the stream (which allows the command


* Re: fast-import's hash table is slow
  2020-04-01 11:16     ` Jeff King
@ 2020-04-02 18:40       ` René Scharfe
  2020-04-03 12:12         ` Jeff King
  0 siblings, 1 reply; 12+ messages in thread
From: René Scharfe @ 2020-04-02 18:40 UTC (permalink / raw)
  To: Jeff King; +Cc: git

Am 01.04.20 um 13:16 schrieb Jeff King:
> On Wed, Apr 01, 2020 at 06:35:23AM -0400, Jeff King wrote:
>
>>>> +	/*
>>>> +	 * this cast works because we only look at the oid part of the entry,
>>>> +	 * and it comes first in the struct
>>>> +	 */
>>>> +	khiter_t pos = kh_get_object_entry_set(&object_table,
>>>> +					       (struct object_entry *)oid);
>>>
>>> Dirty, but I can believe the comment.
>>
>> Our hashmap.c implementation gets around this by letting the equality
>> function take an extra parameter. It's annoying when you're writing
>> those functions, but it should allow this case without any casting (or
>> preemptively allocating a struct).
>
> And here's a patch trying that. Much to my surprise, it outperforms
> khash, which has generally been faster in previous tests.
>
> Here are the numbers I get:
>
> nr_objects   master       khash      hashmap
> 2^20         0m4.317s     0m5.109s   0m3.890s
> 2^21         0m10.204s    0m9.702s   0m7.933s
> 2^22         0m27.159s    0m17.911s  0m16.751s
> 2^23         1m19.038s    0m35.080s  0m31.963s
> 2^24         4m18.766s    1m10.233s  1m6.793s
>
> And I didn't even have to pre-size the table. This really makes me
> wonder if there's some silly inefficiency in khash which we could be
> addressing. Or maybe open-addressing really does lose to chaining here,
> but I think we keep the load factor low enough that it should be a win.

Or we're just unlucky.  I tried to find the difference between khash
with and without presizing using callgrind, but came up empty.  It did
reveal that fast-import spends 70% of its cycles in a million memset()
calls issued (indirectly) by git_deflate_init(), which in turn is called
by store_object(), which is called from parse_and_store_blob(), though.

Why is the second gained when handling 1M objects not showing up in its
output?  I suspect it's because callgrind uses its own custom allocator
to gather its data.  So I ran the test with jemalloc2 preloaded:

nr_objects   master       khash       khash+preload
2^20         0m5.812s     0m5.600s    0m5.604s
2^21         0m12.913s    0m10.884s   0m10.357s
2^22         0m31.257s    0m21.461s   0m21.031s
2^23         1m20.904s    0m40.181s   0m42.607s
2^24         3m59.201s    1m21.104s   1m23.814s

My measurements are noisy, but my point is simply that with a different
allocator you'd not even have seen any slowdown when switching to khash.
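
(For concreteness, the preload is just something like this; the
libjemalloc path depends on the system, and "gen.pl" stands in for the
generator snippet from the first message:)

  export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
  perl gen.pl 24 | git fast-import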

>
> ---
> diff --git a/fast-import.c b/fast-import.c
> index 202dda11a6..0ef6defc10 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -39,12 +39,28 @@
>
>  struct object_entry {
>  	struct pack_idx_entry idx;
> -	struct object_entry *next;
> +	struct hashmap_entry ent;

That uses 16 bytes more memory per entry on x64 than khash would.
That's 256MB for 2^24 objects -- not ideal, but bearable, I guess.

>  	uint32_t type : TYPE_BITS,
>  		pack_id : PACK_ID_BITS,
>  		depth : DEPTH_BITS;
>  };
>
> +static int object_entry_hashcmp(const void *map_data,
> +				const struct hashmap_entry *eptr,
> +				const struct hashmap_entry *entry_or_key,
> +				const void *keydata)
> +{
> +	const struct object_id *oid = keydata;
> +	const struct object_entry *e1, *e2;
> +
> +	e1 = container_of(eptr, const struct object_entry, ent);

That's nicer than the pointer alchemy in the khash conversion for sure.

But why const?  Can const change the layout of a structure?  Scary.

René


* Re: fast-import's hash table is slow
  2020-04-01 10:24     ` Jeff King
@ 2020-04-02 18:40       ` René Scharfe
  2020-04-03 12:14         ` Jeff King
  0 siblings, 1 reply; 12+ messages in thread
From: René Scharfe @ 2020-04-02 18:40 UTC (permalink / raw)
  To: Jeff King; +Cc: git

Am 01.04.20 um 12:24 schrieb Jeff King:
> On Wed, Apr 01, 2020 at 01:21:08AM +0200, René Scharfe wrote:
>
>>>> +	pos = kh_put_object_entry_set(&object_table, (struct object_entry *)oid, &was_empty);
>>>
>>> Now this looks illegal.  khash is surely reading a full object_entry from oid,
>>> which only is a mere object_id, no?
>>
>> No, it's a set of pointers, and khash only accesses the referenced objects
>> via the hash and comparison functions.
>>
>> Storing the objects directly in the set and getting rid of new_object()
>> could improve performance due to reduced dereferencing overhead -- or
>> make it slower because more data has to be copied when the hashmap needs
>> to grow.  Worth a shot.  Later.
>
> Yeah. I tried that, too, but it caused tons of test failures. Quite
> possibly just a bug in my patch, which I did as quickly as possible. But
> I think it performed about the same. Here it is for reference (though
> you may be better off to start from scratch).

Tried that earlier, ran into failures as well.  I think the pointer
returned from insert_object() is stored somewhere and still used after
the next call, which could have invalidated it by a rehash.  E.g.
insert_mark() seems to do that.
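
I.e. the hazard would look something like this (illustrative only,
names made up):

	static void example(struct object_id *a, struct object_id *b)
	{
		struct object_entry *e = insert_object(a);
		insert_object(b);	/* may grow and rehash the table... */
		e->idx.offset = 42;	/* ...so "e" can now be dangling */
	}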

> Note the "this is OK to cast from oid to object_entry" comment is
> leftover from the earlier attempt, but it probably _isn't_ OK. Even
> though we don't do anything actionable on the non-oid bytes, they do get
> passed by value, which could mean reading random bytes.

"Value" meaning pointer value when KHASH_INIT is give a pointer type,
as was the case in the patch with that comment (unlike in the patch
below).  So it should be OK there.

>
> ---
> diff --git a/fast-import.c b/fast-import.c
> index 202dda11a6..5a1b451971 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -39,18 +39,24 @@
>
>  struct object_entry {
>  	struct pack_idx_entry idx;
> -	struct object_entry *next;
>  	uint32_t type : TYPE_BITS,
>  		pack_id : PACK_ID_BITS,
>  		depth : DEPTH_BITS;
>  };
>
> -struct object_entry_pool {
> -	struct object_entry_pool *next_pool;
> -	struct object_entry *next_free;
> -	struct object_entry *end;
> -	struct object_entry entries[FLEX_ARRAY]; /* more */
> -};
> +static inline unsigned int object_entry_hash(struct object_entry oe)
> +{
> +	return oidhash(&oe.idx.oid);
> +}
> +
> +static inline int object_entry_equal(struct object_entry a,
> +				     struct object_entry b)
> +{
> +	return oideq(&a.idx.oid, &b.idx.oid);
> +}
> +
> +KHASH_INIT(object_entry_set, struct object_entry, int, 0,
> +	   object_entry_hash, object_entry_equal);
>
>  struct mark_set {
>  	union {
> @@ -176,9 +182,7 @@ static struct packed_git **all_packs;
>  static off_t pack_size;
>
>  /* Table of objects we've written. */
> -static unsigned int object_entry_alloc = 5000;
> -static struct object_entry_pool *blocks;
> -static struct object_entry *object_table[1 << 16];
> +static kh_object_entry_set_t object_table;
>  static struct mark_set *marks;
>  static const char *export_marks_file;
>  static const char *import_marks_file;
> @@ -428,71 +432,44 @@ static void set_checkpoint_signal(void)
>
>  #endif
>
> -static void alloc_objects(unsigned int cnt)
> -{
> -	struct object_entry_pool *b;
> -
> -	b = xmalloc(sizeof(struct object_entry_pool)
> -		+ cnt * sizeof(struct object_entry));
> -	b->next_pool = blocks;
> -	b->next_free = b->entries;
> -	b->end = b->entries + cnt;
> -	blocks = b;
> -	alloc_count += cnt;
> -}
> -
> -static struct object_entry *new_object(struct object_id *oid)
> -{
> -	struct object_entry *e;
> -
> -	if (blocks->next_free == blocks->end)
> -		alloc_objects(object_entry_alloc);
> -
> -	e = blocks->next_free++;
> -	oidcpy(&e->idx.oid, oid);
> -	return e;
> -}
> -
>  static struct object_entry *find_object(struct object_id *oid)
>  {
> -	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
> -	struct object_entry *e;
> -	for (e = object_table[h]; e; e = e->next)
> -		if (oideq(oid, &e->idx.oid))
> -			return e;
> +	/*
> +	 * this cast works because we only look at the oid part of the entry,
> +	 * and it comes first in the struct
> +	 */
> +	khiter_t pos = kh_get_object_entry_set(&object_table,
> +					       *(struct object_entry *)oid);

Yeah, this one here is not OK.  We'd need to build a dummy entry.
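
E.g. something like this, untested:

	struct object_entry key = { 0 };
	khiter_t pos;

	oidcpy(&key.idx.oid, oid);
	pos = kh_get_object_entry_set(&object_table, key);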

> +	if (pos != kh_end(&object_table))
> +		return &kh_key(&object_table, pos);
>  	return NULL;
>  }
>
>  static struct object_entry *insert_object(struct object_id *oid)
>  {
> -	unsigned int h = oid->hash[0] << 8 | oid->hash[1];
> -	struct object_entry *e = object_table[h];
> +	struct object_entry e;
> +	int was_empty;
> +	khiter_t pos;
>
> -	while (e) {
> -		if (oideq(oid, &e->idx.oid))
> -			return e;
> -		e = e->next;
> -	}
> +	oidcpy(&e.idx.oid, oid);
> +	e.idx.offset = 0;
> +	pos = kh_put_object_entry_set(&object_table, e, &was_empty);
>
> -	e = new_object(oid);
> -	e->next = object_table[h];
> -	e->idx.offset = 0;
> -	object_table[h] = e;
> -	return e;
> +	return &kh_key(&object_table, pos);
>  }
>
>  static void invalidate_pack_id(unsigned int id)
>  {
> -	unsigned int h;
>  	unsigned long lu;
>  	struct tag *t;
> +	khiter_t iter;
>
> -	for (h = 0; h < ARRAY_SIZE(object_table); h++) {
> -		struct object_entry *e;
> -
> -		for (e = object_table[h]; e; e = e->next)
> +	for (iter = kh_begin(&object_table); iter != kh_end(&object_table); iter++) {
> +		if (kh_exist(&object_table, iter)) {
> +			struct object_entry *e = &kh_key(&object_table, iter);
>  			if (e->pack_id == id)
>  				e->pack_id = MAX_PACK_ID;
> +		}
>  	}
>
>  	for (lu = 0; lu < branch_table_sz; lu++) {
> @@ -766,15 +743,18 @@ static const char *create_index(void)
>  	const char *tmpfile;
>  	struct pack_idx_entry **idx, **c, **last;
>  	struct object_entry *e;
> -	struct object_entry_pool *o;
> +	khiter_t iter;
>
>  	/* Build the table of object IDs. */
>  	ALLOC_ARRAY(idx, object_count);
>  	c = idx;
> -	for (o = blocks; o; o = o->next_pool)
> -		for (e = o->next_free; e-- != o->entries;)
> +	for (iter = kh_begin(&object_table); iter != kh_end(&object_table); iter++) {
> +		if (kh_exist(&object_table, iter)) {
> +			e = &kh_key(&object_table, iter);
>  			if (pack_id == e->pack_id)
>  				*c++ = &e->idx;
> +		}
> +	}

The original code writes the objects in reverse order of their creation
(LIFO), right?  Is that relevant?

But anyway, we need stable object locations if their addresses are
stored in other structs, as mentioned above.  Those pointers would have to
be replaced by object_ids and pointer derefs by hashmap lookups.  Not a
promising direction.

René


* Re: fast-import's hash table is slow
  2020-04-02 18:40       ` René Scharfe
@ 2020-04-03 12:12         ` Jeff King
  2020-04-03 18:53           ` René Scharfe
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff King @ 2020-04-03 12:12 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

On Thu, Apr 02, 2020 at 08:40:35PM +0200, René Scharfe wrote:

> > And I didn't even have to pre-size the table. This really makes me
> > wonder if there's some silly inefficiency in khash which we could be
> > addressing. Or maybe open-addressing really does lose to chaining here,
> > but I think we keep the load factor low enough that it should be a win.
> 
> Or we're just unlucky.  I tried to find the difference between khash
> with and without presizing using callgrind, but came up empty.  It did
> reveal that fast-import spends 70% of its cycles in a million memset()
> calls issued (indirectly) by git_deflate_init() which in turn is called
> by store_object() which is called from parse_and_store_blob(), though.

I think that 70% is outsized in this case because we're dumping millions
of 4-byte blobs. In a real repo you'd have larger blobs, as well as
actual trees and commits pulling them together.

> Why is the won second when handling 1M objects not showing in its
> output?  I suspect it's because it uses its custom allocator to gather
> its data.  So I ran the test with jemalloc2 preloaded:
> 
> nr_objects   master       khash       khash+preload
> 2^20         0m5.812s     0m5.600s    0m5.604s
> 2^21         0m12.913s    0m10.884s   0m10.357s
> 2^22         0m31.257s    0m21.461s   0m21.031s
> 2^23         1m20.904s    0m40.181s   0m42.607s
> 2^24         3m59.201s    1m21.104s   1m23.814s
> 
> My measurements are noisy, but my point is simply that with a different
> allocator you'd not even have seen any slowdown when switching to khash.

Yeah, that makes sense. I still prefer the hashmap solution for its lack
of pointer hackery, given that it seems to perform as well or better.
I'll send a cleaned-up patch in a moment.

> >  struct object_entry {
> >  	struct pack_idx_entry idx;
> > -	struct object_entry *next;
> > +	struct hashmap_entry ent;
> 
> That uses 16 bytes more memory per entry on x64 than khash would.
> That's 256MB for 2^24 objects -- not ideal, but bearable, I guess.

Isn't it 8? We're dropping the old pointer and replacing it with the
"next" pointer in hashmap_entry, plus our 4-byte hash code (which likely
gets padded to 8).
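
(For reference, if I'm reading hashmap.h right, the embedded part is
just:)

	struct hashmap_entry {
		struct hashmap_entry *next;
		unsigned int hash;
	};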

I think it's probably OK in practice.

> > +static int object_entry_hashcmp(const void *map_data,
> > +				const struct hashmap_entry *eptr,
> > +				const struct hashmap_entry *entry_or_key,
> > +				const void *keydata)
> > +{
> > +	const struct object_id *oid = keydata;
> > +	const struct object_entry *e1, *e2;
> > +
> > +	e1 = container_of(eptr, const struct object_entry, ent);
> 
> That's nicer than the pointer alchemy in the khash conversion for sure.
> 
> But why const?  Can const change the layout of a structure?  Scary.

No, I don't think it can. I mostly copied the "const" from the other
container_of() hashmap sites. I don't think it matters in practice,
because we're assigning the result to a const pointer anyway. But it
seems a little cleaner not to momentarily cast away the constness even
inside the macro.

-Peff


* Re: fast-import's hash table is slow
  2020-04-02 18:40       ` René Scharfe
@ 2020-04-03 12:14         ` Jeff King
  0 siblings, 0 replies; 12+ messages in thread
From: Jeff King @ 2020-04-03 12:14 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

On Thu, Apr 02, 2020 at 08:40:58PM +0200, René Scharfe wrote:

> >> Storing the objects directly in the set and getting rid of new_object()
> >> could improve performance due to reduced dereferencing overhead -- or
> >> make it slower because more data has to be copied when the hashmap needs
> >> to grow.  Worth a shot.  Later.
> >
> > Yeah. I tried that, too, but it caused tons of test failures. Quite
> > possibly just a bug in my patch, which I did as quickly as possible. But
> > I think it performed about the same. Here it is for reference (though
> > you may be better off to start from scratch).
> 
> Tried that earlier, ran into failures as well.  I think the pointer
> returned from insert_object() is stored somewhere and still used after
> the next call, which could have invalidated it by a rehash.  E.g.
> insert_mark() seems to do that.

That doesn't surprise me. I didn't look very hard, but mostly just did
the minimum to see if it would work (and it didn't).

It could perhaps be overcome, and I certainly don't mind if you want to
dig further, but I'm happy enough with the hashmap solution.

> > Note the "this is OK to cast from oid to object_entry" comment is
> > leftover from the earlier attempt, but it probably _isn't_ OK. Even
> > though we don't do anything actionable on the non-oid bytes, they do get
> > passed by value, which could mean reading random bytes.
> 
> "Value" meaning pointer value when KHASH_INIT is give a pointer type,
> as was the case in the patch with that comment (unlike in the patch
> below).  So it should be OK there.

Right, I meant the comment no longer applies in the patch below, because
we're not using a pointer type anymore.

> > -	for (o = blocks; o; o = o->next_pool)
> > -		for (e = o->next_free; e-- != o->entries;)
> > +	for (iter = kh_begin(&object_table); iter != kh_end(&object_table); iter++) {
> > +		if (kh_exist(&object_table, iter)) {
> > +			e = &kh_key(&object_table, iter);
> >  			if (pack_id == e->pack_id)
> >  				*c++ = &e->idx;
> > +		}
> > +	}
> 
> The original code writes the objects in reverse order of their creation
> (LIFO), right?  Is that relevant?

Yeah, agreed this is another weakness of this approach.

> But anyway,  we need stable object locations anyway if their addresses are
> stored in other structs, as mentioned above.  Those pointers would have to
> be replaced by object_ids and pointer derefs by hashmap lookups.  Not a
> promising direction.

Agreed.

-Peff


* Re: fast-import's hash table is slow
  2020-04-03 12:12         ` Jeff King
@ 2020-04-03 18:53           ` René Scharfe
  2020-04-03 19:01             ` Jeff King
  0 siblings, 1 reply; 12+ messages in thread
From: René Scharfe @ 2020-04-03 18:53 UTC (permalink / raw)
  To: Jeff King; +Cc: git

Am 03.04.20 um 14:12 schrieb Jeff King:
> On Thu, Apr 02, 2020 at 08:40:35PM +0200, René Scharfe wrote:
>
>>>  struct object_entry {
>>>  	struct pack_idx_entry idx;
>>> -	struct object_entry *next;
>>> +	struct hashmap_entry ent;
>>
>> That uses 16 bytes more memory per entry on x64 than khash would.
>> That's 256MB for 2^24 objects -- not ideal, but bearable, I guess.
>
> Isn't it 8? We're dropping the old pointer and replacing it with the
> "next" pointer in hashmap_entry, plus our 4-byte hash code (which likely
> gets padded to 8).

That's right, so the difference to your khash version is 16, as the
latter removes the pointer without any replacement.

See https://www.godbolt.org/z/xs6CLL for a comparison.

>>> +static int object_entry_hashcmp(const void *map_data,
>>> +				const struct hashmap_entry *eptr,
>>> +				const struct hashmap_entry *entry_or_key,
>>> +				const void *keydata)
>>> +{
>>> +	const struct object_id *oid = keydata;
>>> +	const struct object_entry *e1, *e2;
>>> +
>>> +	e1 = container_of(eptr, const struct object_entry, ent);
>>
>> That's nicer than the pointer alchemy in the khash conversion for sure.
>>
>> But why const?  Can const change the layout of a structure?  Scary.
>
> No, I don't think it can. I mostly copied the "const" from the other
> container_of() hashmap sites. I don't think it matters in practice,
> because we're assigning the result to a const pointer anyway. But it
> seems a little cleaner not to momentarily cast away the constness even
> inside the macro.

Makes sense.  I disregarded the final cast in container_of when I
wrote the above.  Silly me.

René


* Re: fast-import's hash table is slow
  2020-04-03 18:53           ` René Scharfe
@ 2020-04-03 19:01             ` Jeff King
  0 siblings, 0 replies; 12+ messages in thread
From: Jeff King @ 2020-04-03 19:01 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

On Fri, Apr 03, 2020 at 08:53:23PM +0200, René Scharfe wrote:

> Am 03.04.20 um 14:12 schrieb Jeff King:
> > On Thu, Apr 02, 2020 at 08:40:35PM +0200, René Scharfe wrote:
> >
> >>>  struct object_entry {
> >>>  	struct pack_idx_entry idx;
> >>> -	struct object_entry *next;
> >>> +	struct hashmap_entry ent;
> >>
> >> That uses 16 bytes more memory per entry on x64 than khash would.
> >> That's 256MB for 2^24 objects -- not ideal, but bearable, I guess.
> >
> > Isn't it 8? We're dropping the old pointer and replacing it with the
> > "next" pointer in hashmap_entry, plus our 4-byte hash code (which likely
> > gets padded to 8).
> 
> That's right, so the difference to your khash version is 16, as the
> latter removes the pointer without any replacement.

Ah, OK. We are on the same page, then.

The khash version removes the pointer, but it presumably it would use a
larger number of buckets to keep the load factor down. I doubt it's
worth spending time running real-world heap-profiling experiments
(especially not on the silly synthetic test I showed).

-Peff

