git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
* [PATCH] fetch: replace string-list used as a look-up table with a hashmap
@ 2018-09-26 21:28 Junio C Hamano
  2018-09-26 22:59 ` Stefan Beller
  2018-09-27  5:34 ` Jeff King
  0 siblings, 2 replies; 11+ messages in thread
From: Junio C Hamano @ 2018-09-26 21:28 UTC (permalink / raw)
  To: git

In find_non_local_tags() helper function (used to implement the
"follow tags"), we use string_list_has_string() on two string lists
as a way to see if a refname has already been processed, etc.

All this code predates more modern in-core lookup API like hashmap;
replace them with two hashmaps and one string list---the hashmaps
are used for look-ups and the string list is to keep the order of
items in the returned result stable (which is the only single thing
hashmap does worse than lookups on string-list).

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---

 * Just to remind ourselves that we talked about getting rid of
   string-list used as look-up tables by replacing them with hashmap
   but haven't seen much effort expended on it.  I think this is my
   first semi-serious use of hashmap, and the code is expected to be
   full of newbie mistakes, inefficiencies and ignorance of idioms;
   pointing out any of which is much appreciated.

 builtin/fetch.c | 116 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 90 insertions(+), 26 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 0696abfc2a..5d1fd8dd49 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -246,16 +246,73 @@ static int will_fetch(struct ref **head, const unsigned char *sha1)
 	return 0;
 }
 
+struct refname_hash_entry {
+	struct hashmap_entry ent; /* must be the first member */
+	struct object_id oid;
+	char refname[FLEX_ARRAY];
+};
+
+static int refname_hash_entry_cmp(const void *hashmap_cmp_fn_data,
+				  const struct refname_hash_entry *e1,
+				  const struct refname_hash_entry *e2,
+				  const void *keydata)
+{
+	return strcmp(e1->refname, keydata ? keydata : e2->refname);
+}
+
+static struct refname_hash_entry *refname_hash_add(struct hashmap *map,
+						   const char *refname,
+						   const struct object_id *oid)
+{
+	struct refname_hash_entry *ent;
+	size_t len = strlen(refname);
+
+	FLEX_ALLOC_MEM(ent, refname, refname, len);
+	hashmap_entry_init(ent, memhash(refname, len));
+	oidcpy(&ent->oid, oid);
+	hashmap_add(map, ent);
+	return ent;
+}
+
+static int add_one_refname(const char *refname,
+			   const struct object_id *oid,
+			   int flag, void *cbdata)
+{
+	struct hashmap *refname_map = cbdata;
+
+	(void) refname_hash_add(refname_map, refname, oid);
+	return 0;
+}
+
+static void refname_hash_init(struct hashmap *map)
+{
+	hashmap_init(map, (hashmap_cmp_fn)refname_hash_entry_cmp, NULL, 0);
+}
+
+static int refname_hash_exists(struct hashmap *map, const char *refname)
+{
+	struct hashmap_entry key;
+	size_t len = strlen(refname);
+	hashmap_entry_init(&key, memhash(refname, len));
+
+	return !!hashmap_get(map, &key, refname);
+}
+
 static void find_non_local_tags(const struct ref *refs,
 				struct ref **head,
 				struct ref ***tail)
 {
-	struct string_list existing_refs = STRING_LIST_INIT_DUP;
-	struct string_list remote_refs = STRING_LIST_INIT_NODUP;
+	struct hashmap existing_refs;
+	struct hashmap remote_refs;
+	struct string_list remote_refs_list = STRING_LIST_INIT_NODUP;
+	struct string_list_item *remote_ref_item;
 	const struct ref *ref;
-	struct string_list_item *item = NULL;
+	struct refname_hash_entry *item = NULL;
 
-	for_each_ref(add_existing, &existing_refs);
+	refname_hash_init(&existing_refs);
+	refname_hash_init(&remote_refs);
+
+	for_each_ref(add_one_refname, &existing_refs);
 	for (ref = refs; ref; ref = ref->next) {
 		if (!starts_with(ref->name, "refs/tags/"))
 			continue;
@@ -271,10 +328,10 @@ static void find_non_local_tags(const struct ref *refs,
 			    !has_object_file_with_flags(&ref->old_oid,
 							OBJECT_INFO_QUICK) &&
 			    !will_fetch(head, ref->old_oid.hash) &&
-			    !has_sha1_file_with_flags(item->util,
+			    !has_sha1_file_with_flags(item->oid.hash,
 						      OBJECT_INFO_QUICK) &&
-			    !will_fetch(head, item->util))
-				item->util = NULL;
+			    !will_fetch(head, item->oid.hash))
+				oidclr(&item->oid);
 			item = NULL;
 			continue;
 		}
@@ -286,47 +343,54 @@ static void find_non_local_tags(const struct ref *refs,
 		 * fetch.
 		 */
 		if (item &&
-		    !has_sha1_file_with_flags(item->util, OBJECT_INFO_QUICK) &&
-		    !will_fetch(head, item->util))
-			item->util = NULL;
+		    !has_sha1_file_with_flags(item->oid.hash, OBJECT_INFO_QUICK) &&
+		    !will_fetch(head, item->oid.hash))
+			oidclr(&item->oid);
 
 		item = NULL;
 
 		/* skip duplicates and refs that we already have */
-		if (string_list_has_string(&remote_refs, ref->name) ||
-		    string_list_has_string(&existing_refs, ref->name))
+		if (refname_hash_exists(&remote_refs, ref->name) ||
+		    refname_hash_exists(&existing_refs, ref->name))
 			continue;
 
-		item = string_list_insert(&remote_refs, ref->name);
-		item->util = (void *)&ref->old_oid;
+		item = refname_hash_add(&remote_refs, ref->name, &ref->old_oid);
+		string_list_insert(&remote_refs_list, ref->name);
 	}
-	string_list_clear(&existing_refs, 1);
+	hashmap_free(&existing_refs, 1);
 
 	/*
 	 * We may have a final lightweight tag that needs to be
 	 * checked to see if it needs fetching.
 	 */
 	if (item &&
-	    !has_sha1_file_with_flags(item->util, OBJECT_INFO_QUICK) &&
-	    !will_fetch(head, item->util))
-		item->util = NULL;
+	    !has_sha1_file_with_flags(item->oid.hash, OBJECT_INFO_QUICK) &&
+	    !will_fetch(head, item->oid.hash))
+		oidclr(&item->oid);
 
 	/*
-	 * For all the tags in the remote_refs string list,
+	 * For all the tags in the remote_refs_list,
 	 * add them to the list of refs to be fetched
 	 */
-	for_each_string_list_item(item, &remote_refs) {
+	for_each_string_list_item(remote_ref_item, &remote_refs_list) {
+		const char *refname = remote_ref_item->string;
+		struct hashmap_entry key;
+
+		hashmap_entry_init(&key, memhash(refname, strlen(refname)));
+		item = hashmap_get(&remote_refs, &key, refname);
+		if (!item)
+			continue; /* can this happen??? */
+
 		/* Unless we have already decided to ignore this item... */
-		if (item->util)
-		{
-			struct ref *rm = alloc_ref(item->string);
-			rm->peer_ref = alloc_ref(item->string);
-			oidcpy(&rm->old_oid, item->util);
+		if (!is_null_oid(&item->oid)) {
+			struct ref *rm = alloc_ref(item->refname);
+			rm->peer_ref = alloc_ref(item->refname);
+			oidcpy(&rm->old_oid, &item->oid);
 			**tail = rm;
 			*tail = &rm->next;
 		}
 	}
-
+	hashmap_free(&remote_refs, 1);
 	string_list_clear(&remote_refs, 0);
 }
 
-- 
2.19.0-271-gfe8321ec05


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] fetch: replace string-list used as a look-up table with a hashmap
  2018-09-26 21:28 [PATCH] fetch: replace string-list used as a look-up table with a hashmap Junio C Hamano
@ 2018-09-26 22:59 ` Stefan Beller
  2018-09-27  5:54   ` Jeff King
  2018-09-27  5:34 ` Jeff King
  1 sibling, 1 reply; 11+ messages in thread
From: Stefan Beller @ 2018-09-26 22:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Wed, Sep 26, 2018 at 2:28 PM Junio C Hamano <gitster@pobox.com> wrote:

>
> +struct refname_hash_entry {
> +       struct hashmap_entry ent; /* must be the first member */

$ git grep "struct hashmap_entry" reveals that we have another
convention that we follow but not document, which is to stress
the importance of putting the hashmap_entry first. ;-)

> +       struct object_id oid;
> +       char refname[FLEX_ARRAY];
> +};
> +
> +static int refname_hash_entry_cmp(const void *hashmap_cmp_fn_data,
> +                                 const struct refname_hash_entry *e1,
> +                                 const struct refname_hash_entry *e2,
> +                                 const void *keydata)
> +{
> +       return strcmp(e1->refname, keydata ? keydata : e2->refname);
> +}

and later

    hashmap_init(...  (hashmap_cmp_fn)refname_hash_entry_cmp, ...);

I wonder if we'd want to stick to this style, as that seems to be the easiest
to do, and drop the efforts started in 55c965f3a2 (Merge branch
'sb/hashmap-cleanup', 2017-08-11), that avoids the cast in the init,
but puts the (implicit) casts in the _cmp function as we'd take
void pointers as the function signature and recast to a local variable.

> +
> +static struct refname_hash_entry *refname_hash_add(struct hashmap *map,
> +                                                  const char *refname,
> +                                                  const struct object_id *oid)
> +{
> +       struct refname_hash_entry *ent;
> +       size_t len = strlen(refname);
> +
> +       FLEX_ALLOC_MEM(ent, refname, refname, len);
> +       hashmap_entry_init(ent, memhash(refname, len));

memhash is preferred over strhash as we already know the length.
For a second I wondered why we use it instead of strhash

> +static int refname_hash_exists(struct hashmap *map, const char *refname)
> +{
> +       struct hashmap_entry key;
> +       size_t len = strlen(refname);
> +       hashmap_entry_init(&key, memhash(refname, len));

Here we could use strhash, but as they could change to
differing implementation, we keep using memhash.

>         /*
> -        * For all the tags in the remote_refs string list,
> +        * For all the tags in the remote_refs_list,
>          * add them to the list of refs to be fetched
>          */
> -       for_each_string_list_item(item, &remote_refs) {
> +       for_each_string_list_item(remote_ref_item, &remote_refs_list) {
> +               const char *refname = remote_ref_item->string;
> +               struct hashmap_entry key;
> +
> +               hashmap_entry_init(&key, memhash(refname, strlen(refname)));
> +               item = hashmap_get(&remote_refs, &key, refname);

> +               if (!item)
> +                       continue; /* can this happen??? */

I thought this could happen when we have hash collisions,
I convinced myself otherwise, as we pass the refname to hashmap_get
as the 'keydata', so when there is a hash collision we keep comparing
to the real value.

And as whenever we have an insert to remote_refs_list, we also
have a refname_hash_add to the remote_refs hashmap,
I think the return of NULL is not possible, and we could switch
it to BUG(...);


> All this code predates more modern in-core lookup API like hashmap;
> replace them with two hashmaps and one string list---the hashmaps
> are used for look-ups and the string list is to keep the order of
> items in the returned result stable (which is the only single thing
> hashmap does worse than lookups on string-list).
>
> Signed-off-by: Junio C Hamano <gitster@pobox.com>
> ---

I wonder if we want to add an API to the hashmap that allows for ordered
iteration (as hashmap_iter_* relies on the hashing function for its output)

I think it may be too early to do so, but there are 2 thoughts on how to do it:
* Each element keeps (prev/next) pointers to the previously inserted
  element and updates the next pointer when the next element is inserted.
  When removing elements we'd have to update the next and prev pointer
  of the adjacent elements.
  Then iteration can be done in insertion-order.
  Probably this would go into its own file ordered-hashmap.{c, h}
* provide an order function to hashmap_iter_init, which then
  orders according to that function before the first call of
  hashmap_iter_next returns.

>  * Just to remind ourselves that we talked about getting rid of
>    string-list used as look-up tables by replacing them with hashmap
>    but haven't seen much effort expended on it.  I think this is my
>    first semi-serious use of hashmap, and the code is expected to be
>    full of newbie mistakes, inefficiencies and ignorance of idioms;
>    pointing out any of which is much appreciated.

I could not find newbie mistakes, but gave my thoughts anyway.

Stefan

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] fetch: replace string-list used as a look-up table with a hashmap
  2018-09-26 21:28 [PATCH] fetch: replace string-list used as a look-up table with a hashmap Junio C Hamano
  2018-09-26 22:59 ` Stefan Beller
@ 2018-09-27  5:34 ` Jeff King
  2018-09-30  2:11   ` Junio C Hamano
  1 sibling, 1 reply; 11+ messages in thread
From: Jeff King @ 2018-09-27  5:34 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Wed, Sep 26, 2018 at 02:28:28PM -0700, Junio C Hamano wrote:

> In find_non_local_tags() helper function (used to implement the
> "follow tags"), we use string_list_has_string() on two string lists
> as a way to see if a refname has already been processed, etc.
> 
> All this code predates more modern in-core lookup API like hashmap;
> replace them with two hashmaps and one string list---the hashmaps
> are used for look-ups and the string list is to keep the order of
> items in the returned result stable (which is the only single thing
> hashmap does worse than lookups on string-list).
> 
> Signed-off-by: Junio C Hamano <gitster@pobox.com>
> ---
> 
>  * Just to remind ourselves that we talked about getting rid of
>    string-list used as look-up tables by replacing them with hashmap
>    but haven't seen much effort expended on it.  I think this is my
>    first semi-serious use of hashmap, and the code is expected to be
>    full of newbie mistakes, inefficiencies and ignorance of idioms;
>    pointing out any of which is much appreciated.

As you probably noticed, there's a bit of boilerplate required to use a
hashmap. I had figured we could replace most of these with a single
"strmap" API to map a string to a void pointer (which is basically
what string-list gives you).

That would save on the boilerplate. But your solution here replaces a
"void *" pointer with an actual "struct object_id" member, which
improves type-safety. It also removes questions about memory lifetimes
(at the minor cost of copying the oids). So this path is probably better
if we don't mind a little extra code.

I do note that your struct just has the same information as "struct
ref":

> +struct refname_hash_entry {
> +	struct hashmap_entry ent; /* must be the first member */
> +	struct object_id oid;
> +	char refname[FLEX_ARRAY];
> +};

So yet another alternative here is to just define a single hashmap that
stores void pointers. That also throws away some type safety, but is
maybe conceptually simpler. I dunno. It's actually a pain to do that
with "struct hashmap" because it requires the caller to handle
allocation. An open-addressed hash table, as we use elsewhere (and in
khash.h) is a bit simpler, since it doesn't need to do any per-entry
malloc.

To be clear, I'm perfectly happy with the approach in your patch here.
I'm just musing on what might might be the least painful thing for doing
more of them.

-Peff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] fetch: replace string-list used as a look-up table with a hashmap
  2018-09-26 22:59 ` Stefan Beller
@ 2018-09-27  5:54   ` Jeff King
  2018-09-29 18:31     ` Junio C Hamano
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff King @ 2018-09-27  5:54 UTC (permalink / raw)
  To: Stefan Beller; +Cc: Junio C Hamano, git

On Wed, Sep 26, 2018 at 03:59:08PM -0700, Stefan Beller wrote:

> > +struct refname_hash_entry {
> > +       struct hashmap_entry ent; /* must be the first member */
> 
> $ git grep "struct hashmap_entry" reveals that we have another
> convention that we follow but not document, which is to stress
> the importance of putting the hashmap_entry first. ;-)

One thing I've liked about the list.h implementation is that you can
store the list pointer anywhere in the struct, or even have the same
struct in multiple lists.

The only funny thing is that you have to "dereference" the iterator like
this:

  struct list_head *pos;
  struct actual_thing *item;
  ...
  item = list_entry(pos, struct actual_thing, list_member);

which is a minor pain, but it's reasonably hard to get it wrong.

I wonder if we could do the same here. The hashmap would only ever see
the "struct hashmap_entry", and then the caller would convert that back
to the actual type. I think we could even get away with not converting
existing callers; if the hashmap _is_ at the front, then that
list_entry() really just devolves to a cast. So as long as the struct
definition and the users of the struct agree, it would just work.

> > +static int refname_hash_entry_cmp(const void *hashmap_cmp_fn_data,
> > +                                 const struct refname_hash_entry *e1,
> > +                                 const struct refname_hash_entry *e2,
> > +                                 const void *keydata)
> > +{
> > +       return strcmp(e1->refname, keydata ? keydata : e2->refname);
> > +}
> 
> and later
> 
>     hashmap_init(...  (hashmap_cmp_fn)refname_hash_entry_cmp, ...);
> 
> I wonder if we'd want to stick to this style, as that seems to be the easiest
> to do, and drop the efforts started in 55c965f3a2 (Merge branch
> 'sb/hashmap-cleanup', 2017-08-11), that avoids the cast in the init,
> but puts the (implicit) casts in the _cmp function as we'd take
> void pointers as the function signature and recast to a local variable.

I think that casting the function pointer technically breaks the C
standard, though probably not in a way that is a problem on any real
systems.

The other thing about the "cast inside the cmp function" approach from
sb/hashmap-cleanup is that it is less likely to go stale. If we change
the definition of hashmap_cmp_fn, then any functions which are manually
cast will suppress the compiler errors. When the function takes void
pointers, the same can happen if "struct refname_hash_entry" is swapped
out for another struct, but IMHO that's a less likely error to make.

I admit that's just a gut feeling, though. It would be nice if we could
avoid casting altogether, but I think that puts us into macro territory
(which I'm not altogether opposed to, but it has its own drawbacks).

-Peff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] fetch: replace string-list used as a look-up table with a hashmap
  2018-09-27  5:54   ` Jeff King
@ 2018-09-29 18:31     ` Junio C Hamano
  2018-09-30  4:57       ` Jeff King
  0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2018-09-29 18:31 UTC (permalink / raw)
  To: Jeff King; +Cc: Stefan Beller, git

Jeff King <peff@peff.net> writes:

> On Wed, Sep 26, 2018 at 03:59:08PM -0700, Stefan Beller wrote:
>
>> > +struct refname_hash_entry {
>> > +       struct hashmap_entry ent; /* must be the first member */
>> 
>> $ git grep "struct hashmap_entry" reveals that we have another
>> convention that we follow but not document, which is to stress
>> the importance of putting the hashmap_entry first. ;-)
>
> One thing I've liked about the list.h implementation is that you can
> store the list pointer anywhere in the struct, or even have the same
> struct in multiple lists.
>
> The only funny thing is that you have to "dereference" the iterator like
> this:
>
>   struct list_head *pos;
>   struct actual_thing *item;
>   ...
>   item = list_entry(pos, struct actual_thing, list_member);
>
> which is a minor pain, but it's reasonably hard to get it wrong.
>
> I wonder if we could do the same here. The hashmap would only ever see
> the "struct hashmap_entry", and then the caller would convert that back
> to the actual type.

Hmph, how would hashmap_cmp_fn look like with that scheme?  It would
get one entry, another entry (or just the skeleton of it) and
optionally a separate keydata (if the second one is skeleton), and
the first two points at the embedded hashmap struct, not the
surrounding one.  The callback function is now responsible for
calling a hashmap_entry() macro that adjusts for the negative offset
like list_entry() does?

> I think we could even get away with not converting
> existing callers; if the hashmap _is_ at the front, then that
> list_entry() really just devolves to a cast.

Yes.

> So as long as the struct
> definition and the users of the struct agree, it would just work.

Yes, too.

Was it ever a consideration, when allowing struct list-head anywhere
in the enclosing struct, that it would allow an element to be on
more than one list?  Would it benefit us to be able to place an
element in multiple hashmaps because we do not have to have the
embedded hashmap_entry always at the beginning if we did this
change?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] fetch: replace string-list used as a look-up table with a hashmap
  2018-09-27  5:34 ` Jeff King
@ 2018-09-30  2:11   ` Junio C Hamano
  2018-10-19  3:48     ` [PATCH v3] " Junio C Hamano
  0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2018-09-30  2:11 UTC (permalink / raw)
  To: Jeff King; +Cc: git, Stefan Beller, Ramsay Jones

Jeff King <peff@peff.net> writes:

> So yet another alternative here is to just define a single hashmap that
> stores void pointers. That also throws away some type safety, but is
> maybe conceptually simpler. I dunno.

I was tempted to try that route, which would essentially give us "if
you are abusing string-list as a look-up table, and if either you do
not need to iterate over all the elements, or you do not care about
the order in which such an interation is made, then use this
instead" API that can more easily substitute string-list without
boilerplate.

But I am not sure if that is worth it.  Perhaps we may end up doing
so when we find the second thing to move from string-list to hashmap,
but not with this patch (and yes, there is another string-list user
in the same source file, which I think can reuse what I added with
this patch).

In any case, I expect to be offline more than half of the next week,
so before I forget, here is an updated patch.  Two changes since the
version posted earlier are:

 - remote_refs_list (not remote_refs which is a hashmap) is passed
   to string_list_clear() at the end.  This is a fix for an outright
   bug Ramsay noticed.

 - The callback function now takes (const void *) and casts its
   parameters to correct type before they are used, instead of
   casting the pointer to a function whose signature is wrong in the
   call to hashmap_init().  This was noticed by Stefan and I agree
   that doing it this way makes more sense.

-- >8 --
Subject: [PATCH v2] fetch: replace string-list used as a look-up table with a hashmap

In find_non_local_tags() helper function (used to implement the
"follow tags"), we use string_list_has_string() on two string lists
as a way to see if a refname has already been processed, etc.

All this code predates more modern in-core lookup API like hashmap;
replace them with two hashmaps and one string list---the hashmaps
are used for look-ups and the string list is to keep the order of
items in the returned result stable (which is the only single thing
hashmap does worse than lookups on string-list).

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/fetch.c | 121 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 94 insertions(+), 27 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 0696abfc2a..489531ec93 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -246,16 +246,76 @@ static int will_fetch(struct ref **head, const unsigned char *sha1)
 	return 0;
 }
 
+struct refname_hash_entry {
+	struct hashmap_entry ent; /* must be the first member */
+	struct object_id oid;
+	char refname[FLEX_ARRAY];
+};
+
+static int refname_hash_entry_cmp(const void *hashmap_cmp_fn_data,
+				  const void *e1_,
+				  const void *e2_,
+				  const void *keydata)
+{
+	const struct refname_hash_entry *e1 = e1_;
+	const struct refname_hash_entry *e2 = e2_;
+
+	return strcmp(e1->refname, keydata ? keydata : e2->refname);
+}
+
+static struct refname_hash_entry *refname_hash_add(struct hashmap *map,
+						   const char *refname,
+						   const struct object_id *oid)
+{
+	struct refname_hash_entry *ent;
+	size_t len = strlen(refname);
+
+	FLEX_ALLOC_MEM(ent, refname, refname, len);
+	hashmap_entry_init(ent, memhash(refname, len));
+	oidcpy(&ent->oid, oid);
+	hashmap_add(map, ent);
+	return ent;
+}
+
+static int add_one_refname(const char *refname,
+			   const struct object_id *oid,
+			   int flag, void *cbdata)
+{
+	struct hashmap *refname_map = cbdata;
+
+	(void) refname_hash_add(refname_map, refname, oid);
+	return 0;
+}
+
+static void refname_hash_init(struct hashmap *map)
+{
+	hashmap_init(map, refname_hash_entry_cmp, NULL, 0);
+}
+
+static int refname_hash_exists(struct hashmap *map, const char *refname)
+{
+	struct hashmap_entry key;
+	size_t len = strlen(refname);
+	hashmap_entry_init(&key, memhash(refname, len));
+
+	return !!hashmap_get(map, &key, refname);
+}
+
 static void find_non_local_tags(const struct ref *refs,
 				struct ref **head,
 				struct ref ***tail)
 {
-	struct string_list existing_refs = STRING_LIST_INIT_DUP;
-	struct string_list remote_refs = STRING_LIST_INIT_NODUP;
+	struct hashmap existing_refs;
+	struct hashmap remote_refs;
+	struct string_list remote_refs_list = STRING_LIST_INIT_NODUP;
+	struct string_list_item *remote_ref_item;
 	const struct ref *ref;
-	struct string_list_item *item = NULL;
+	struct refname_hash_entry *item = NULL;
 
-	for_each_ref(add_existing, &existing_refs);
+	refname_hash_init(&existing_refs);
+	refname_hash_init(&remote_refs);
+
+	for_each_ref(add_one_refname, &existing_refs);
 	for (ref = refs; ref; ref = ref->next) {
 		if (!starts_with(ref->name, "refs/tags/"))
 			continue;
@@ -271,10 +331,10 @@ static void find_non_local_tags(const struct ref *refs,
 			    !has_object_file_with_flags(&ref->old_oid,
 							OBJECT_INFO_QUICK) &&
 			    !will_fetch(head, ref->old_oid.hash) &&
-			    !has_sha1_file_with_flags(item->util,
+			    !has_sha1_file_with_flags(item->oid.hash,
 						      OBJECT_INFO_QUICK) &&
-			    !will_fetch(head, item->util))
-				item->util = NULL;
+			    !will_fetch(head, item->oid.hash))
+				oidclr(&item->oid);
 			item = NULL;
 			continue;
 		}
@@ -286,48 +346,55 @@ static void find_non_local_tags(const struct ref *refs,
 		 * fetch.
 		 */
 		if (item &&
-		    !has_sha1_file_with_flags(item->util, OBJECT_INFO_QUICK) &&
-		    !will_fetch(head, item->util))
-			item->util = NULL;
+		    !has_sha1_file_with_flags(item->oid.hash, OBJECT_INFO_QUICK) &&
+		    !will_fetch(head, item->oid.hash))
+			oidclr(&item->oid);
 
 		item = NULL;
 
 		/* skip duplicates and refs that we already have */
-		if (string_list_has_string(&remote_refs, ref->name) ||
-		    string_list_has_string(&existing_refs, ref->name))
+		if (refname_hash_exists(&remote_refs, ref->name) ||
+		    refname_hash_exists(&existing_refs, ref->name))
 			continue;
 
-		item = string_list_insert(&remote_refs, ref->name);
-		item->util = (void *)&ref->old_oid;
+		item = refname_hash_add(&remote_refs, ref->name, &ref->old_oid);
+		string_list_insert(&remote_refs_list, ref->name);
 	}
-	string_list_clear(&existing_refs, 1);
+	hashmap_free(&existing_refs, 1);
 
 	/*
 	 * We may have a final lightweight tag that needs to be
 	 * checked to see if it needs fetching.
 	 */
 	if (item &&
-	    !has_sha1_file_with_flags(item->util, OBJECT_INFO_QUICK) &&
-	    !will_fetch(head, item->util))
-		item->util = NULL;
+	    !has_sha1_file_with_flags(item->oid.hash, OBJECT_INFO_QUICK) &&
+	    !will_fetch(head, item->oid.hash))
+		oidclr(&item->oid);
 
 	/*
-	 * For all the tags in the remote_refs string list,
+	 * For all the tags in the remote_refs_list,
 	 * add them to the list of refs to be fetched
 	 */
-	for_each_string_list_item(item, &remote_refs) {
+	for_each_string_list_item(remote_ref_item, &remote_refs_list) {
+		const char *refname = remote_ref_item->string;
+		struct hashmap_entry key;
+
+		hashmap_entry_init(&key, memhash(refname, strlen(refname)));
+		item = hashmap_get(&remote_refs, &key, refname);
+		if (!item)
+			continue; /* can this happen??? */
+
 		/* Unless we have already decided to ignore this item... */
-		if (item->util)
-		{
-			struct ref *rm = alloc_ref(item->string);
-			rm->peer_ref = alloc_ref(item->string);
-			oidcpy(&rm->old_oid, item->util);
+		if (!is_null_oid(&item->oid)) {
+			struct ref *rm = alloc_ref(item->refname);
+			rm->peer_ref = alloc_ref(item->refname);
+			oidcpy(&rm->old_oid, &item->oid);
 			**tail = rm;
 			*tail = &rm->next;
 		}
 	}
-
-	string_list_clear(&remote_refs, 0);
+	hashmap_free(&remote_refs, 1);
+	string_list_clear(&remote_refs_list, 0);
 }
 
 static struct ref *get_ref_map(struct remote *remote,
-- 
2.19.0-271-gfe8321ec05



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] fetch: replace string-list used as a look-up table with a hashmap
  2018-09-29 18:31     ` Junio C Hamano
@ 2018-09-30  4:57       ` Jeff King
  0 siblings, 0 replies; 11+ messages in thread
From: Jeff King @ 2018-09-30  4:57 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Stefan Beller, git

On Sat, Sep 29, 2018 at 11:31:08AM -0700, Junio C Hamano wrote:

> > The only funny thing is that you have to "dereference" the iterator like
> > this:
> >
> >   struct list_head *pos;
> >   struct actual_thing *item;
> >   ...
> >   item = list_entry(pos, struct actual_thing, list_member);
> >
> > which is a minor pain, but it's reasonably hard to get it wrong.
> >
> > I wonder if we could do the same here. The hashmap would only ever see
> > the "struct hashmap_entry", and then the caller would convert that back
> > to the actual type.
> 
> Hmph, how would hashmap_cmp_fn look like with that scheme?  It would
> get one entry, another entry (or just the skeleton of it) and
> optionally a separate keydata (if the second one is skeleton), and
> the first two points at the embedded hashmap struct, not the
> surrounding one.  The callback function is now responsible for
> calling a hashmap_entry() macro that adjusts for the negative offset
> like list_entry() does?

Exactly. The comparison functions currently look something like this:

  int foo_hash_cmp(const void *data,
		   const void *e1, const void *e2,
		   const void *keydata)
  {
	const struct foo *f1 = (const struct foo *)e1;
	const struct foo *f2 = (const struct foo *)e2;
	return strcmp(foo->field, keydata ? keydata : foo->field);
  }

(this is using the correct function signature; you can drop the casts if
you violate that, but IMHO this style is more maintainable in the long
run). With the hashmap_entry at an arbitrary position, the body is:

  const struct foo *f1 = hash_entry(e1, struct foo, hash_field);
  const struct foo *f2 = hash_entry(e2, struct foo, hash_field);
  return strcmp(foo->field, keydata ? keydata : foo->field);

where hash_field is the member of "struct foo" that holds the "struct
hashmap_entry". So that's not so different, but there is an interesting
implication here. The comparison callback has to know which
hashmap_entry name to use! So if you have a struct that can be in two
hashes, like:

  struct foo {
    char *name;
    struct hashmap_entry h1;
    struct hashmap_entry h2;
  };

then you cannot use a single foo_hash_cmp() function. You have to use a
different one for each field. Which seems kind of nasty and error-prone.

So I dunno. Maybe this is not a good direction. I have to admit that I'm
not wild about our hashmap implementation in general:

  - the way it handles allocations is often awkward, or causes you to
    write confusing boilerplate

  - it's not type-safe

  - it seems slower than open-addressed alternatives (e.g., over in [1]
    we determined that oidset can be made almost twice as fast using
    khash)

  - the chained implementation in hashmap.c is probably better for cases
    where there's a lot of deletions, because removing from the hashmap
    truly removes everything (whereas with open-addressed schemes, we
    either didn't implement deletion at all, or it sets a marker which
    allows the item to be dropped next time the hash is resized)

[1] https://public-inbox.org/git/20180814015842.GA27055@sigill.intra.peff.net/

> Was it ever a consideration, when allowing struct list-head anywhere
> in the enclosing struct, that it would allow an element to be on
> more than one list?

I don't know the history of that list code in that kernel, but I always
assumed that was one of the goals. We don't use it yet, but there are
places where we could (e.g., packed_git is part of the regular packed
list, as well as the mru; the former is implemented as a singly-linked
list, but that's mostly historical).

It also allows us to have an item in a list and a hashmap (since if they
both needed to be at the start of the struct, that wouldn't work). We do
use that ability for delta_base_cache_entry.

> Would it benefit us to be able to place an
> element in multiple hashmaps because we do not have to have the
> embedded hashmap_entry always at the beginning if we did this
> change?

I'm not sure. I think it would allow us to more aggressively stick the
hashmap_entry inside existing structs. For instance, in the patch from
this thread, could we actually shove the hashmap_entry structs into
"struct ref", and save having to allocate the extra refname_hash struct?

I guess that pollutes "struct ref", though. Unless we generically say
"it can be a part of up to 3 hashes" and provide struct hashmap_entry
h1, struct hashmap_entry h2, etc. And then your new code does:

  struct hashmap existing_refs;
  struct hashmap remote_refs;

  /*
   * These need to be different compare functions to access the
   * h1 and h2 fields in "struct ref"!
   */
  hashmap_init(&existing_refs, ref_hash_cmp_h1);
  hashmap_init(&remote_refs, ref_hash_cmp_h2);

  ...

  hashmap_add(&existing_refs, &some_ref->h1);

But that seems pretty error-prone (not just confusing h1 and h2 here,
but how do we know that some other part of the code isn't using "h1" for
its own hash already?)

So again, maybe this is just a bad direction.

At least with an open-addressed scheme, the hash table itself contains
everything, and you can just store pointers to the existing refs.

-Peff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v3] fetch: replace string-list used as a look-up table with a hashmap
  2018-09-30  2:11   ` Junio C Hamano
@ 2018-10-19  3:48     ` Junio C Hamano
  2018-10-22  9:57       ` Johannes Schindelin
  0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2018-10-19  3:48 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Stefan Beller, Ramsay Jones

In find_non_local_tags() helper function (used to implement the
"follow tags"), we use string_list_has_string() on two string lists
as a way to see if a refname has already been processed, etc.

All this code predates more modern in-core lookup API like hashmap;
replace them with two hashmaps and one string list---the hashmaps
are used for look-ups and the string list is to keep the order of
items in the returned result stable (which is the only single thing
hashmap does worse than lookups on string-list).

Similarly, get_ref_map() uses another string-list as a look-up table
for existing refs.  Replace it with a hashmap.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---

 * This converts another string-list user in the same file.  For now
   I am done with this topic; I'm willing to fix bugs in this patch,
   but do not intend to spend time killing more string-lists used as
   look-up tables.

 builtin/fetch.c | 152 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 106 insertions(+), 46 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 0696abfc2a..0f8e333022 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -223,18 +223,6 @@ static void add_merge_config(struct ref **head,
 	}
 }
 
-static int add_existing(const char *refname, const struct object_id *oid,
-			int flag, void *cbdata)
-{
-	struct string_list *list = (struct string_list *)cbdata;
-	struct string_list_item *item = string_list_insert(list, refname);
-	struct object_id *old_oid = xmalloc(sizeof(*old_oid));
-
-	oidcpy(old_oid, oid);
-	item->util = old_oid;
-	return 0;
-}
-
 static int will_fetch(struct ref **head, const unsigned char *sha1)
 {
 	struct ref *rm = *head;
@@ -246,16 +234,76 @@ static int will_fetch(struct ref **head, const unsigned char *sha1)
 	return 0;
 }
 
+struct refname_hash_entry {
+	struct hashmap_entry ent; /* must be the first member */
+	struct object_id oid;
+	char refname[FLEX_ARRAY];
+};
+
+static int refname_hash_entry_cmp(const void *hashmap_cmp_fn_data,
+				  const void *e1_,
+				  const void *e2_,
+				  const void *keydata)
+{
+	const struct refname_hash_entry *e1 = e1_;
+	const struct refname_hash_entry *e2 = e2_;
+
+	return strcmp(e1->refname, keydata ? keydata : e2->refname);
+}
+
+static struct refname_hash_entry *refname_hash_add(struct hashmap *map,
+						   const char *refname,
+						   const struct object_id *oid)
+{
+	struct refname_hash_entry *ent;
+	size_t len = strlen(refname);
+
+	FLEX_ALLOC_MEM(ent, refname, refname, len);
+	hashmap_entry_init(ent, memhash(refname, len));
+	oidcpy(&ent->oid, oid);
+	hashmap_add(map, ent);
+	return ent;
+}
+
+static int add_one_refname(const char *refname,
+			   const struct object_id *oid,
+			   int flag, void *cbdata)
+{
+	struct hashmap *refname_map = cbdata;
+
+	(void) refname_hash_add(refname_map, refname, oid);
+	return 0;
+}
+
+static void refname_hash_init(struct hashmap *map)
+{
+	hashmap_init(map, refname_hash_entry_cmp, NULL, 0);
+}
+
+static int refname_hash_exists(struct hashmap *map, const char *refname)
+{
+	struct hashmap_entry key;
+	size_t len = strlen(refname);
+	hashmap_entry_init(&key, memhash(refname, len));
+
+	return !!hashmap_get(map, &key, refname);
+}
+
 static void find_non_local_tags(const struct ref *refs,
 				struct ref **head,
 				struct ref ***tail)
 {
-	struct string_list existing_refs = STRING_LIST_INIT_DUP;
-	struct string_list remote_refs = STRING_LIST_INIT_NODUP;
+	struct hashmap existing_refs;
+	struct hashmap remote_refs;
+	struct string_list remote_refs_list = STRING_LIST_INIT_NODUP;
+	struct string_list_item *remote_ref_item;
 	const struct ref *ref;
-	struct string_list_item *item = NULL;
+	struct refname_hash_entry *item = NULL;
 
-	for_each_ref(add_existing, &existing_refs);
+	refname_hash_init(&existing_refs);
+	refname_hash_init(&remote_refs);
+
+	for_each_ref(add_one_refname, &existing_refs);
 	for (ref = refs; ref; ref = ref->next) {
 		if (!starts_with(ref->name, "refs/tags/"))
 			continue;
@@ -271,10 +319,10 @@ static void find_non_local_tags(const struct ref *refs,
 			    !has_object_file_with_flags(&ref->old_oid,
 							OBJECT_INFO_QUICK) &&
 			    !will_fetch(head, ref->old_oid.hash) &&
-			    !has_sha1_file_with_flags(item->util,
+			    !has_sha1_file_with_flags(item->oid.hash,
 						      OBJECT_INFO_QUICK) &&
-			    !will_fetch(head, item->util))
-				item->util = NULL;
+			    !will_fetch(head, item->oid.hash))
+				oidclr(&item->oid);
 			item = NULL;
 			continue;
 		}
@@ -286,48 +334,55 @@ static void find_non_local_tags(const struct ref *refs,
 		 * fetch.
 		 */
 		if (item &&
-		    !has_sha1_file_with_flags(item->util, OBJECT_INFO_QUICK) &&
-		    !will_fetch(head, item->util))
-			item->util = NULL;
+		    !has_sha1_file_with_flags(item->oid.hash, OBJECT_INFO_QUICK) &&
+		    !will_fetch(head, item->oid.hash))
+			oidclr(&item->oid);
 
 		item = NULL;
 
 		/* skip duplicates and refs that we already have */
-		if (string_list_has_string(&remote_refs, ref->name) ||
-		    string_list_has_string(&existing_refs, ref->name))
+		if (refname_hash_exists(&remote_refs, ref->name) ||
+		    refname_hash_exists(&existing_refs, ref->name))
 			continue;
 
-		item = string_list_insert(&remote_refs, ref->name);
-		item->util = (void *)&ref->old_oid;
+		item = refname_hash_add(&remote_refs, ref->name, &ref->old_oid);
+		string_list_insert(&remote_refs_list, ref->name);
 	}
-	string_list_clear(&existing_refs, 1);
+	hashmap_free(&existing_refs, 1);
 
 	/*
 	 * We may have a final lightweight tag that needs to be
 	 * checked to see if it needs fetching.
 	 */
 	if (item &&
-	    !has_sha1_file_with_flags(item->util, OBJECT_INFO_QUICK) &&
-	    !will_fetch(head, item->util))
-		item->util = NULL;
+	    !has_sha1_file_with_flags(item->oid.hash, OBJECT_INFO_QUICK) &&
+	    !will_fetch(head, item->oid.hash))
+		oidclr(&item->oid);
 
 	/*
-	 * For all the tags in the remote_refs string list,
+	 * For all the tags in the remote_refs_list,
 	 * add them to the list of refs to be fetched
 	 */
-	for_each_string_list_item(item, &remote_refs) {
+	for_each_string_list_item(remote_ref_item, &remote_refs_list) {
+		const char *refname = remote_ref_item->string;
+		struct hashmap_entry key;
+
+		hashmap_entry_init(&key, memhash(refname, strlen(refname)));
+		item = hashmap_get(&remote_refs, &key, refname);
+		if (!item)
+			continue; /* can this happen??? */
+
 		/* Unless we have already decided to ignore this item... */
-		if (item->util)
-		{
-			struct ref *rm = alloc_ref(item->string);
-			rm->peer_ref = alloc_ref(item->string);
-			oidcpy(&rm->old_oid, item->util);
+		if (!is_null_oid(&item->oid)) {
+			struct ref *rm = alloc_ref(item->refname);
+			rm->peer_ref = alloc_ref(item->refname);
+			oidcpy(&rm->old_oid, &item->oid);
 			**tail = rm;
 			*tail = &rm->next;
 		}
 	}
-
-	string_list_clear(&remote_refs, 0);
+	hashmap_free(&remote_refs, 1);
+	string_list_clear(&remote_refs_list, 0);
 }
 
 static struct ref *get_ref_map(struct remote *remote,
@@ -343,7 +398,7 @@ static struct ref *get_ref_map(struct remote *remote,
 	/* opportunistically-updated references: */
 	struct ref *orefs = NULL, **oref_tail = &orefs;
 
-	struct string_list existing_refs = STRING_LIST_INIT_DUP;
+	struct hashmap existing_refs;
 
 	if (rs->nr) {
 		struct refspec *fetch_refspec;
@@ -437,19 +492,24 @@ static struct ref *get_ref_map(struct remote *remote,
 
 	ref_map = ref_remove_duplicates(ref_map);
 
-	for_each_ref(add_existing, &existing_refs);
+	refname_hash_init(&existing_refs);
+	for_each_ref(add_one_refname, &existing_refs);
+
 	for (rm = ref_map; rm; rm = rm->next) {
 		if (rm->peer_ref) {
-			struct string_list_item *peer_item =
-				string_list_lookup(&existing_refs,
-						   rm->peer_ref->name);
+			struct hashmap_entry key;
+			const char *refname = rm->peer_ref->name;
+			struct refname_hash_entry *peer_item;
+
+			hashmap_entry_init(&key, memhash(refname, strlen(refname)));
+			peer_item = hashmap_get(&existing_refs, &key, refname);
 			if (peer_item) {
-				struct object_id *old_oid = peer_item->util;
+				struct object_id *old_oid = &peer_item->oid;
 				oidcpy(&rm->peer_ref->old_oid, old_oid);
 			}
 		}
 	}
-	string_list_clear(&existing_refs, 1);
+	hashmap_free(&existing_refs, 1);
 
 	return ref_map;
 }
-- 
2.19.1-450-ga4b8ab5363


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] fetch: replace string-list used as a look-up table with a hashmap
  2018-10-19  3:48     ` [PATCH v3] " Junio C Hamano
@ 2018-10-22  9:57       ` Johannes Schindelin
  2018-10-27  6:47         ` Re*: " Junio C Hamano
  0 siblings, 1 reply; 11+ messages in thread
From: Johannes Schindelin @ 2018-10-22  9:57 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Jeff King, Stefan Beller, Ramsay Jones

Hi Junio,

On Fri, 19 Oct 2018, Junio C Hamano wrote:

> In find_non_local_tags() helper function (used to implement the
> "follow tags"), we use string_list_has_string() on two string lists
> as a way to see if a refname has already been processed, etc.
> 
> All this code predates more modern in-core lookup API like hashmap;
> replace them with two hashmaps and one string list---the hashmaps
> are used for look-ups and the string list is to keep the order of
> items in the returned result stable (which is the only single thing
> hashmap does worse than lookups on string-list).

This sounds as if you would need a linked hash map, i.e. a hash map whose
iterator remembers the order in which items were inserted.

I am fine, however, with your patch as a first step in that direction, and
only implementing a linked hash map when we realize that we need it
somewhere else, too.

Just one thing^W^Wa couple of things:

> Similarly, get_ref_map() uses another string-list as a look-up table
> for existing refs.  Replace it with a hashmap.
> 
> Signed-off-by: Junio C Hamano <gitster@pobox.com>
> ---
> 
>  * This converts another string-list user in the same file.  For now
>    I am done with this topic; I'm willing to fix bugs in this patch,
>    but do not intend to spend time killing more string-lists used as
>    look-up tables.
> 
>  builtin/fetch.c | 152 +++++++++++++++++++++++++++++++++---------------
>  1 file changed, 106 insertions(+), 46 deletions(-)
> 
> diff --git a/builtin/fetch.c b/builtin/fetch.c
> index 0696abfc2a..0f8e333022 100644
> --- a/builtin/fetch.c
> +++ b/builtin/fetch.c
> @@ -223,18 +223,6 @@ static void add_merge_config(struct ref **head,
>  	}
>  }
>  
> -static int add_existing(const char *refname, const struct object_id *oid,
> -			int flag, void *cbdata)
> -{
> -	struct string_list *list = (struct string_list *)cbdata;
> -	struct string_list_item *item = string_list_insert(list, refname);
> -	struct object_id *old_oid = xmalloc(sizeof(*old_oid));
> -
> -	oidcpy(old_oid, oid);
> -	item->util = old_oid;
> -	return 0;
> -}
> -
>  static int will_fetch(struct ref **head, const unsigned char *sha1)
>  {
>  	struct ref *rm = *head;
> @@ -246,16 +234,76 @@ static int will_fetch(struct ref **head, const unsigned char *sha1)
>  	return 0;
>  }
>  
> +struct refname_hash_entry {
> +	struct hashmap_entry ent; /* must be the first member */
> +	struct object_id oid;
> +	char refname[FLEX_ARRAY];
> +};
> +
> +static int refname_hash_entry_cmp(const void *hashmap_cmp_fn_data,
> +				  const void *e1_,
> +				  const void *e2_,
> +				  const void *keydata)
> +{
> +	const struct refname_hash_entry *e1 = e1_;
> +	const struct refname_hash_entry *e2 = e2_;
> +
> +	return strcmp(e1->refname, keydata ? keydata : e2->refname);
> +}
> +
> +static struct refname_hash_entry *refname_hash_add(struct hashmap *map,
> +						   const char *refname,
> +						   const struct object_id *oid)
> +{
> +	struct refname_hash_entry *ent;
> +	size_t len = strlen(refname);
> +
> +	FLEX_ALLOC_MEM(ent, refname, refname, len);
> +	hashmap_entry_init(ent, memhash(refname, len));
> +	oidcpy(&ent->oid, oid);
> +	hashmap_add(map, ent);
> +	return ent;
> +}
> +
> +static int add_one_refname(const char *refname,
> +			   const struct object_id *oid,
> +			   int flag, void *cbdata)
> +{
> +	struct hashmap *refname_map = cbdata;
> +
> +	(void) refname_hash_add(refname_map, refname, oid);
> +	return 0;
> +}
> +
> +static void refname_hash_init(struct hashmap *map)
> +{
> +	hashmap_init(map, refname_hash_entry_cmp, NULL, 0);
> +}
> +
> +static int refname_hash_exists(struct hashmap *map, const char *refname)
> +{
> +	struct hashmap_entry key;
> +	size_t len = strlen(refname);
> +	hashmap_entry_init(&key, memhash(refname, len));
> +
> +	return !!hashmap_get(map, &key, refname);

It would probably make more sense to `hashmap_get_from_hash()` and
`strhash()` here (and `strhash()` should probably be used everywhere
instead of `memhash(str, strlen(str))`).

> +}
> +
>  static void find_non_local_tags(const struct ref *refs,
>  				struct ref **head,
>  				struct ref ***tail)
>  {
> -	struct string_list existing_refs = STRING_LIST_INIT_DUP;
> -	struct string_list remote_refs = STRING_LIST_INIT_NODUP;
> +	struct hashmap existing_refs;
> +	struct hashmap remote_refs;
> +	struct string_list remote_refs_list = STRING_LIST_INIT_NODUP;
> +	struct string_list_item *remote_ref_item;
>  	const struct ref *ref;
> -	struct string_list_item *item = NULL;
> +	struct refname_hash_entry *item = NULL;
>  
> -	for_each_ref(add_existing, &existing_refs);
> +	refname_hash_init(&existing_refs);
> +	refname_hash_init(&remote_refs);
> +
> +	for_each_ref(add_one_refname, &existing_refs);
>  	for (ref = refs; ref; ref = ref->next) {
>  		if (!starts_with(ref->name, "refs/tags/"))
>  			continue;
> @@ -271,10 +319,10 @@ static void find_non_local_tags(const struct ref *refs,
>  			    !has_object_file_with_flags(&ref->old_oid,
>  							OBJECT_INFO_QUICK) &&
>  			    !will_fetch(head, ref->old_oid.hash) &&
> -			    !has_sha1_file_with_flags(item->util,
> +			    !has_sha1_file_with_flags(item->oid.hash,

I am not sure that we need to test for null OIDs here, given that...

>  						      OBJECT_INFO_QUICK) &&
> -			    !will_fetch(head, item->util))
> -				item->util = NULL;
> +			    !will_fetch(head, item->oid.hash))
> +				oidclr(&item->oid);

we no longer can set `util` to `NULL`, but have to use a null OID to
indicate that condition now.

Of course, `has_sha1_file_with_flags()` is supposed to return `false` for
null OIDs, I guess.

>  			item = NULL;
>  			continue;
>  		}
> @@ -286,48 +334,55 @@ static void find_non_local_tags(const struct ref *refs,
>  		 * fetch.
>  		 */
>  		if (item &&
> -		    !has_sha1_file_with_flags(item->util, OBJECT_INFO_QUICK) &&
> -		    !will_fetch(head, item->util))
> -			item->util = NULL;
> +		    !has_sha1_file_with_flags(item->oid.hash, OBJECT_INFO_QUICK) &&
> +		    !will_fetch(head, item->oid.hash))
> +			oidclr(&item->oid);
>  
>  		item = NULL;
>  
>  		/* skip duplicates and refs that we already have */
> -		if (string_list_has_string(&remote_refs, ref->name) ||
> -		    string_list_has_string(&existing_refs, ref->name))
> +		if (refname_hash_exists(&remote_refs, ref->name) ||
> +		    refname_hash_exists(&existing_refs, ref->name))
>  			continue;
>  
> -		item = string_list_insert(&remote_refs, ref->name);
> -		item->util = (void *)&ref->old_oid;
> +		item = refname_hash_add(&remote_refs, ref->name, &ref->old_oid);
> +		string_list_insert(&remote_refs_list, ref->name);
>  	}
> -	string_list_clear(&existing_refs, 1);
> +	hashmap_free(&existing_refs, 1);
>  
>  	/*
>  	 * We may have a final lightweight tag that needs to be
>  	 * checked to see if it needs fetching.
>  	 */
>  	if (item &&
> -	    !has_sha1_file_with_flags(item->util, OBJECT_INFO_QUICK) &&
> -	    !will_fetch(head, item->util))
> -		item->util = NULL;
> +	    !has_sha1_file_with_flags(item->oid.hash, OBJECT_INFO_QUICK) &&
> +	    !will_fetch(head, item->oid.hash))
> +		oidclr(&item->oid);
>  
>  	/*
> -	 * For all the tags in the remote_refs string list,
> +	 * For all the tags in the remote_refs_list,
>  	 * add them to the list of refs to be fetched
>  	 */
> -	for_each_string_list_item(item, &remote_refs) {
> +	for_each_string_list_item(remote_ref_item, &remote_refs_list) {
> +		const char *refname = remote_ref_item->string;
> +		struct hashmap_entry key;
> +
> +		hashmap_entry_init(&key, memhash(refname, strlen(refname)));
> +		item = hashmap_get(&remote_refs, &key, refname);
> +		if (!item)
> +			continue; /* can this happen??? */

This would indicate a BUG, no?

Thank you for working on this,
Dscho

> +
>  		/* Unless we have already decided to ignore this item... */
> -		if (item->util)
> -		{
> -			struct ref *rm = alloc_ref(item->string);
> -			rm->peer_ref = alloc_ref(item->string);
> -			oidcpy(&rm->old_oid, item->util);
> +		if (!is_null_oid(&item->oid)) {
> +			struct ref *rm = alloc_ref(item->refname);
> +			rm->peer_ref = alloc_ref(item->refname);
> +			oidcpy(&rm->old_oid, &item->oid);
>  			**tail = rm;
>  			*tail = &rm->next;
>  		}
>  	}
> -
> -	string_list_clear(&remote_refs, 0);
> +	hashmap_free(&remote_refs, 1);
> +	string_list_clear(&remote_refs_list, 0);
>  }
>  
>  static struct ref *get_ref_map(struct remote *remote,
> @@ -343,7 +398,7 @@ static struct ref *get_ref_map(struct remote *remote,
>  	/* opportunistically-updated references: */
>  	struct ref *orefs = NULL, **oref_tail = &orefs;
>  
> -	struct string_list existing_refs = STRING_LIST_INIT_DUP;
> +	struct hashmap existing_refs;
>  
>  	if (rs->nr) {
>  		struct refspec *fetch_refspec;
> @@ -437,19 +492,24 @@ static struct ref *get_ref_map(struct remote *remote,
>  
>  	ref_map = ref_remove_duplicates(ref_map);
>  
> -	for_each_ref(add_existing, &existing_refs);
> +	refname_hash_init(&existing_refs);
> +	for_each_ref(add_one_refname, &existing_refs);
> +
>  	for (rm = ref_map; rm; rm = rm->next) {
>  		if (rm->peer_ref) {
> -			struct string_list_item *peer_item =
> -				string_list_lookup(&existing_refs,
> -						   rm->peer_ref->name);
> +			struct hashmap_entry key;
> +			const char *refname = rm->peer_ref->name;
> +			struct refname_hash_entry *peer_item;
> +
> +			hashmap_entry_init(&key, memhash(refname, strlen(refname)));
> +			peer_item = hashmap_get(&existing_refs, &key, refname);
>  			if (peer_item) {
> -				struct object_id *old_oid = peer_item->util;
> +				struct object_id *old_oid = &peer_item->oid;
>  				oidcpy(&rm->peer_ref->old_oid, old_oid);
>  			}
>  		}
>  	}
> -	string_list_clear(&existing_refs, 1);
> +	hashmap_free(&existing_refs, 1);
>  
>  	return ref_map;
>  }
> -- 
> 2.19.1-450-ga4b8ab5363
> 
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re*: [PATCH v3] fetch: replace string-list used as a look-up table with a hashmap
  2018-10-22  9:57       ` Johannes Schindelin
@ 2018-10-27  6:47         ` Junio C Hamano
  2018-10-31 14:50           ` Johannes Schindelin
  0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2018-10-27  6:47 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git, Jeff King, Stefan Beller, Ramsay Jones

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Just one thing^W^Wa couple of things:
>
> It would probably make more sense to `hashmap_get_from_hash()` and
> `strhash()` here (and `strhash()` should probably be used everywhere
> instead of `memhash(str, strlen(str))`).

hashmap_get_from_hash() certainly is much better suited for simpler
usage pattern like these callsites, and the ones in sequencer.c.  It
is a shame that a more complex variant takes the shorter-and-sweeter
name hashmap_get().

I wish we named the latter hashmap_get_fullblown_feature_rich() and
called the _from_hash() thing a simple hashmap_get() from day one,
but it is way too late.

I looked briefly the users of the _get() variant, and some of their
uses are legitimately not-simple and cannot be reduced to use the
simpler _get_from_hash variant, it seems.  But others like those in
builtin/difftool.c should be straight-forward to convert to use the
simpler get_from_hash variant.  It could be a low-hanging fruit left
for later clean-up, perhaps.

>> @@ -271,10 +319,10 @@ static void find_non_local_tags(const struct ref *refs,
>>  			    !has_object_file_with_flags(&ref->old_oid,
>>  							OBJECT_INFO_QUICK) &&
>>  			    !will_fetch(head, ref->old_oid.hash) &&
>> -			    !has_sha1_file_with_flags(item->util,
>> +			    !has_sha1_file_with_flags(item->oid.hash,
>
> I am not sure that we need to test for null OIDs here, given that...
> ...
> Of course, `has_sha1_file_with_flags()` is supposed to return `false` for
> null OIDs, I guess.

Yup.  An alternative is to make item->oid a pointer to oid, not an
oid object itself, so that we can express "no OID for this ref" in a
more explicit way, but is_null_oid() is already used as "no OID" in
many other codepaths, so...

>> +	for_each_string_list_item(remote_ref_item, &remote_refs_list) {
>> +		const char *refname = remote_ref_item->string;
>> +		struct hashmap_entry key;
>> +
>> +		hashmap_entry_init(&key, memhash(refname, strlen(refname)));
>> +		item = hashmap_get(&remote_refs, &key, refname);
>> +		if (!item)
>> +			continue; /* can this happen??? */
>
> This would indicate a BUG, no?

Possibly.  Alternatively, we can just use item without checking and
let the runtime segfault.

Here is an incremental on top that can be squashed in to turn v3
into v4.

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 0f8e333022..aee1d9bf21 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -259,7 +259,7 @@ static struct refname_hash_entry *refname_hash_add(struct hashmap *map,
 	size_t len = strlen(refname);
 
 	FLEX_ALLOC_MEM(ent, refname, refname, len);
-	hashmap_entry_init(ent, memhash(refname, len));
+	hashmap_entry_init(ent, strhash(refname));
 	oidcpy(&ent->oid, oid);
 	hashmap_add(map, ent);
 	return ent;
@@ -282,11 +282,7 @@ static void refname_hash_init(struct hashmap *map)
 
 static int refname_hash_exists(struct hashmap *map, const char *refname)
 {
-	struct hashmap_entry key;
-	size_t len = strlen(refname);
-	hashmap_entry_init(&key, memhash(refname, len));
-
-	return !!hashmap_get(map, &key, refname);
+	return !!hashmap_get_from_hash(map, strhash(refname), refname);
 }
 
 static void find_non_local_tags(const struct ref *refs,
@@ -365,12 +361,10 @@ static void find_non_local_tags(const struct ref *refs,
 	 */
 	for_each_string_list_item(remote_ref_item, &remote_refs_list) {
 		const char *refname = remote_ref_item->string;
-		struct hashmap_entry key;
 
-		hashmap_entry_init(&key, memhash(refname, strlen(refname)));
-		item = hashmap_get(&remote_refs, &key, refname);
+		item = hashmap_get_from_hash(&remote_refs, strhash(refname), refname);
 		if (!item)
-			continue; /* can this happen??? */
+			BUG("unseen remote ref?");
 
 		/* Unless we have already decided to ignore this item... */
 		if (!is_null_oid(&item->oid)) {
@@ -497,12 +491,12 @@ static struct ref *get_ref_map(struct remote *remote,
 
 	for (rm = ref_map; rm; rm = rm->next) {
 		if (rm->peer_ref) {
-			struct hashmap_entry key;
 			const char *refname = rm->peer_ref->name;
 			struct refname_hash_entry *peer_item;
 
-			hashmap_entry_init(&key, memhash(refname, strlen(refname)));
-			peer_item = hashmap_get(&existing_refs, &key, refname);
+			peer_item = hashmap_get_from_hash(&existing_refs,
+							  strhash(refname),
+							  refname);
 			if (peer_item) {
 				struct object_id *old_oid = &peer_item->oid;
 				oidcpy(&rm->peer_ref->old_oid, old_oid);

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: Re*: [PATCH v3] fetch: replace string-list used as a look-up table with a hashmap
  2018-10-27  6:47         ` Re*: " Junio C Hamano
@ 2018-10-31 14:50           ` Johannes Schindelin
  0 siblings, 0 replies; 11+ messages in thread
From: Johannes Schindelin @ 2018-10-31 14:50 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Jeff King, Stefan Beller, Ramsay Jones

Hi Junio,

On Sat, 27 Oct 2018, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> 
> > Just one thing^W^Wa couple of things:
> >
> > It would probably make more sense to `hashmap_get_from_hash()` and
> > `strhash()` here (and `strhash()` should probably be used everywhere
> > instead of `memhash(str, strlen(str))`).
> 
> hashmap_get_from_hash() certainly is much better suited for simpler
> usage pattern like these callsites, and the ones in sequencer.c.  It
> is a shame that a more complex variant takes the shorter-and-sweeter
> name hashmap_get().

I agree, at least in part.

From what I understand, hashmap_get_from_hash() needs a little assistance
from the comparison function with which the hashmap is configured, see
e.g. this function in the sequencer:

	static int labels_cmp(const void *fndata, const struct labels_entry *a,
			      const struct labels_entry *b, const void *key)
	{
		return key ? strcmp(a->label, key) : strcmp(a->label, b->label);
	}

See how that first tests whether `key` is non-`NULL`, and then takes a
shortcut, not even looking at `b`? This is important, because `b` does not
refer to a complete `labels_entry` when we call `hashmap_get_from_hash()`.
It only refers to a `hashmap_entry`. Looking at `b->label` would access
some random memory, and do most certainly the wrong thing.

> I wish we named the latter hashmap_get_fullblown_feature_rich() and
> called the _from_hash() thing a simple hashmap_get() from day one,
> but it is way too late.
> 
> I looked briefly the users of the _get() variant, and some of their
> uses are legitimately not-simple and cannot be reduced to use the
> simpler _get_from_hash variant, it seems.  But others like those in
> builtin/difftool.c should be straight-forward to convert to use the
> simpler get_from_hash variant.  It could be a low-hanging fruit left
> for later clean-up, perhaps.

Right. #leftoverbits

> >> @@ -271,10 +319,10 @@ static void find_non_local_tags(const struct ref *refs,
> >>  			    !has_object_file_with_flags(&ref->old_oid,
> >>  							OBJECT_INFO_QUICK) &&
> >>  			    !will_fetch(head, ref->old_oid.hash) &&
> >> -			    !has_sha1_file_with_flags(item->util,
> >> +			    !has_sha1_file_with_flags(item->oid.hash,
> >
> > I am not sure that we need to test for null OIDs here, given that...
> > ...
> > Of course, `has_sha1_file_with_flags()` is supposed to return `false` for
> > null OIDs, I guess.
> 
> Yup.  An alternative is to make item->oid a pointer to oid, not an
> oid object itself, so that we can express "no OID for this ref" in a
> more explicit way, but is_null_oid() is already used as "no OID" in
> many other codepaths, so...

Right, and it would complicate the code. So I am fine with your version of
it.

> >> +	for_each_string_list_item(remote_ref_item, &remote_refs_list) {
> >> +		const char *refname = remote_ref_item->string;
> >> +		struct hashmap_entry key;
> >> +
> >> +		hashmap_entry_init(&key, memhash(refname, strlen(refname)));
> >> +		item = hashmap_get(&remote_refs, &key, refname);
> >> +		if (!item)
> >> +			continue; /* can this happen??? */
> >
> > This would indicate a BUG, no?
> 
> Possibly.  Alternatively, we can just use item without checking and
> let the runtime segfault.

Hahaha! Yep. We could also cause a crash. I do prefer the BUG() call.

> Here is an incremental on top that can be squashed in to turn v3
> into v4.

Nice.

Thanks!
Dscho

> 
> diff --git a/builtin/fetch.c b/builtin/fetch.c
> index 0f8e333022..aee1d9bf21 100644
> --- a/builtin/fetch.c
> +++ b/builtin/fetch.c
> @@ -259,7 +259,7 @@ static struct refname_hash_entry *refname_hash_add(struct hashmap *map,
>  	size_t len = strlen(refname);
>  
>  	FLEX_ALLOC_MEM(ent, refname, refname, len);
> -	hashmap_entry_init(ent, memhash(refname, len));
> +	hashmap_entry_init(ent, strhash(refname));
>  	oidcpy(&ent->oid, oid);
>  	hashmap_add(map, ent);
>  	return ent;
> @@ -282,11 +282,7 @@ static void refname_hash_init(struct hashmap *map)
>  
>  static int refname_hash_exists(struct hashmap *map, const char *refname)
>  {
> -	struct hashmap_entry key;
> -	size_t len = strlen(refname);
> -	hashmap_entry_init(&key, memhash(refname, len));
> -
> -	return !!hashmap_get(map, &key, refname);
> +	return !!hashmap_get_from_hash(map, strhash(refname), refname);
>  }
>  
>  static void find_non_local_tags(const struct ref *refs,
> @@ -365,12 +361,10 @@ static void find_non_local_tags(const struct ref *refs,
>  	 */
>  	for_each_string_list_item(remote_ref_item, &remote_refs_list) {
>  		const char *refname = remote_ref_item->string;
> -		struct hashmap_entry key;
>  
> -		hashmap_entry_init(&key, memhash(refname, strlen(refname)));
> -		item = hashmap_get(&remote_refs, &key, refname);
> +		item = hashmap_get_from_hash(&remote_refs, strhash(refname), refname);
>  		if (!item)
> -			continue; /* can this happen??? */
> +			BUG("unseen remote ref?");
>  
>  		/* Unless we have already decided to ignore this item... */
>  		if (!is_null_oid(&item->oid)) {
> @@ -497,12 +491,12 @@ static struct ref *get_ref_map(struct remote *remote,
>  
>  	for (rm = ref_map; rm; rm = rm->next) {
>  		if (rm->peer_ref) {
> -			struct hashmap_entry key;
>  			const char *refname = rm->peer_ref->name;
>  			struct refname_hash_entry *peer_item;
>  
> -			hashmap_entry_init(&key, memhash(refname, strlen(refname)));
> -			peer_item = hashmap_get(&existing_refs, &key, refname);
> +			peer_item = hashmap_get_from_hash(&existing_refs,
> +							  strhash(refname),
> +							  refname);
>  			if (peer_item) {
>  				struct object_id *old_oid = &peer_item->oid;
>  				oidcpy(&rm->peer_ref->old_oid, old_oid);
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2018-10-31 14:50 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-26 21:28 [PATCH] fetch: replace string-list used as a look-up table with a hashmap Junio C Hamano
2018-09-26 22:59 ` Stefan Beller
2018-09-27  5:54   ` Jeff King
2018-09-29 18:31     ` Junio C Hamano
2018-09-30  4:57       ` Jeff King
2018-09-27  5:34 ` Jeff King
2018-09-30  2:11   ` Junio C Hamano
2018-10-19  3:48     ` [PATCH v3] " Junio C Hamano
2018-10-22  9:57       ` Johannes Schindelin
2018-10-27  6:47         ` Re*: " Junio C Hamano
2018-10-31 14:50           ` Johannes Schindelin

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).