[RFC/WIP PATCH] object store classification

git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed

* [RFC/WIP PATCH] object store classification
@ 2017-07-06 20:27 Stefan Beller
  2017-07-07  2:33 ` Junio C Hamano
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Beller @ 2017-07-06 20:27 UTC (permalink / raw)
  To: git; +Cc: peff, Stefan Beller

This continues the efforts of bw/repo-object, making our code base
more object oriented by adding the object store to the the repository struct.

Long term goal of the series that would evolve from this discussion:
* Easier to implement submodule features as it can be in the same process.

Short term goal:
* get rid of 'add_submodule_odb' in submodule.c
  When fetching or pushing we need to determine if a submodule needs processing
  by finding out if certain commits are present.  To do this we add the submodule
  objects as an alternate object database and do the processing in the same
  process. In case of multiple submodules, this would pollute the object store
  hence being slower and increasing memory usage, too.

This patch only changes the object store code to be object oriented based
on the repository struct.

To go for the short term goal we'd need to convert a few more places, mostly
all the construction (blob.c, tree.c, commit.c)

This is marked RFC as I'd want to gather feedback if the approach, presented
in the header files is sound.

Thanks!

Signed-off-by: Stefan Beller <sbeller@google.com>
---

  Peff, this area of the code seems performance sensitive for a variety of
  use cases, specifically yours. So I cc'd you here. :)

 object.c     | 77 +++++++++++++++++++++++++++++++++++-------------------------
 object.h     | 51 ++++++++++++++++++++++++++++++++--------
 repository.h |  6 +++++
 3 files changed, 92 insertions(+), 42 deletions(-)

diff --git a/object.c b/object.c
index 06ba3a11d8..b5ec0bb2f9 100644
--- a/object.c
+++ b/object.c
@@ -5,17 +5,15 @@
 #include "commit.h"
 #include "tag.h"
 
-static struct object **obj_hash;
-static int nr_objs, obj_hash_size;
-
-unsigned int get_max_object_index(void)
+unsigned int object_store_get_max_index(struct repository *r)
 {
-	return obj_hash_size;
+	return r->objects.obj_hash_size;
 }
 
-struct object *get_indexed_object(unsigned int idx)
+struct object *object_store_get_indexed(struct repository *r,
+					unsigned int idx)
 {
-	return obj_hash[idx];
+	return r->objects.obj_hash[idx];
 }
 
 static const char *object_type_strings[] = {
@@ -82,11 +80,15 @@ static void insert_obj_hash(struct object *obj, struct object **hash, unsigned i
  * Look up the record for the given sha1 in the hash map stored in
  * obj_hash.  Return NULL if it was not found.
  */
-struct object *lookup_object(const unsigned char *sha1)
+struct object *object_store_lookup(struct repository *r,
+				   const unsigned char *sha1)
 {
 	unsigned int i, first;
 	struct object *obj;
 
+	struct object **obj_hash = r->objects.obj_hash;
+	int obj_hash_size = r->objects.obj_hash_size;
+
 	if (!obj_hash)
 		return NULL;
 
@@ -114,29 +116,31 @@ struct object *lookup_object(const unsigned char *sha1)
  * power of 2 (but at least 32).  Copy the existing values to the new
  * hash map.
  */
-static void grow_object_hash(void)
+static void grow_object_hash(struct repository *r)
 {
 	int i;
 	/*
 	 * Note that this size must always be power-of-2 to match hash_obj
 	 * above.
 	 */
-	int new_hash_size = obj_hash_size < 32 ? 32 : 2 * obj_hash_size;
+	int new_hash_size = r->objects.obj_hash_size < 32 ?
+			32 : 2 * r->objects.obj_hash_size;
 	struct object **new_hash;
 
 	new_hash = xcalloc(new_hash_size, sizeof(struct object *));
-	for (i = 0; i < obj_hash_size; i++) {
-		struct object *obj = obj_hash[i];
+	for (i = 0; i < r->objects.obj_hash_size; i++) {
+		struct object *obj = r->objects.obj_hash[i];
 		if (!obj)
 			continue;
 		insert_obj_hash(obj, new_hash, new_hash_size);
 	}
-	free(obj_hash);
-	obj_hash = new_hash;
-	obj_hash_size = new_hash_size;
+	free(r->objects.obj_hash);
+	r->objects.obj_hash = new_hash;
+	r->objects.obj_hash_size = new_hash_size;
 }
 
-void *create_object(const unsigned char *sha1, void *o)
+void *object_store_create(struct repository *r,
+			  const unsigned char *sha1, void *o)
 {
 	struct object *obj = o;
 
@@ -145,15 +149,17 @@ void *create_object(const unsigned char *sha1, void *o)
 	obj->flags = 0;
 	hashcpy(obj->oid.hash, sha1);
 
-	if (obj_hash_size - 1 <= nr_objs * 2)
-		grow_object_hash();
+	if (r->objects.obj_hash_size - 1 <= r->objects.nr_objs * 2)
+		grow_object_hash(r);
 
-	insert_obj_hash(obj, obj_hash, obj_hash_size);
-	nr_objs++;
+	insert_obj_hash(obj, r->objects.obj_hash, r->objects.obj_hash_size);
+	r->objects.nr_objs++;
 	return obj;
 }
 
-void *object_as_type(struct object *obj, enum object_type type, int quiet)
+void *object_as_type(struct object *obj,
+		     enum object_type type,
+		     int quiet)
 {
 	if (obj->type == type)
 		return obj;
@@ -172,15 +178,20 @@ void *object_as_type(struct object *obj, enum object_type type, int quiet)
 	}
 }
 
-struct object *lookup_unknown_object(const unsigned char *sha1)
+struct object *object_store_lookup_unknown_object(struct repository *r,
+						  const unsigned char *sha1)
 {
-	struct object *obj = lookup_object(sha1);
+	struct object *obj = object_store_lookup(r, sha1);
 	if (!obj)
-		obj = create_object(sha1, alloc_object_node());
+		obj = object_store_create(r, sha1, alloc_object_node());
 	return obj;
 }
 
-struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p)
+struct object *object_store_parse_object_buffer(struct repository *r,
+						const struct object_id *oid,
+						enum object_type type,
+						unsigned long size,
+						void *buffer, int *eaten_p)
 {
 	struct object *obj;
 	*eaten_p = 0;
@@ -230,17 +241,19 @@ struct object *parse_object_buffer(const struct object_id *oid, enum object_type
 	return obj;
 }
 
-struct object *parse_object_or_die(const struct object_id *oid,
-				   const char *name)
+struct object *object_store_parse_object_or_die(struct repository *r,
+						const struct object_id *oid,
+						const char *name)
 {
-	struct object *o = parse_object(oid);
+	struct object *o = object_store_parse_object(r, oid);
 	if (o)
 		return o;
 
 	die(_("unable to parse object: %s"), name ? name : oid_to_hex(oid));
 }
 
-struct object *parse_object(const struct object_id *oid)
+struct object *object_store_parse_object(struct repository *r,
+					 const struct object_id *oid)
 {
 	unsigned long size;
 	enum object_type type;
@@ -413,12 +426,12 @@ void object_array_remove_duplicates(struct object_array *array)
 	}
 }
 
-void clear_object_flags(unsigned flags)
+void object_store_clear_object_flags(struct repository *r, unsigned flags)
 {
 	int i;
 
-	for (i=0; i < obj_hash_size; i++) {
-		struct object *obj = obj_hash[i];
+	for (i = 0; i < r->objects.obj_hash_size; i++) {
+		struct object *obj = r->objects.obj_hash[i];
 		if (obj)
 			obj->flags &= ~flags;
 	}
diff --git a/object.h b/object.h
index 33e5cc9943..1b2ab8ee88 100644
--- a/object.h
+++ b/object.h
@@ -1,6 +1,8 @@
 #ifndef OBJECT_H
 #define OBJECT_H
 
+#include "repository.h"
+
 struct object_list {
 	struct object *item;
 	struct object_list *next;
@@ -59,12 +61,15 @@ extern int type_from_string_gently(const char *str, ssize_t, int gentle);
 /*
  * Return the current number of buckets in the object hashmap.
  */
-extern unsigned int get_max_object_index(void);
+extern unsigned int object_store_get_max_index(struct repository *r);
+#define get_max_object_index() object_store_get_max_index(the_repository)
 
 /*
  * Return the object from the specified bucket in the object hashmap.
  */
-extern struct object *get_indexed_object(unsigned int);
+extern struct object *object_store_get_indexed(struct repository *r,
+					       unsigned int);
+#define get_indexed_object(i) object_store_get_indexed(the_repository, i)
 
 /*
  * This can be used to see if we have heard of the object before, but
@@ -78,34 +83,59 @@ extern struct object *get_indexed_object(unsigned int);
  * half-initialised objects, the caller is expected to initialize them
  * by calling parse_object() on them.
  */
-struct object *lookup_object(const unsigned char *sha1);
+extern struct object *object_store_lookup(struct repository *r,
+					  const unsigned char *sha1);
+#define lookup_object(hash) object_store_lookup(the_repository, hash)
 
-extern void *create_object(const unsigned char *sha1, void *obj);
+extern void *object_store_create(struct repository *r,
+				 const unsigned char *sha1,
+				 void *obj);
+#define create_object(sha1, obj) object_store_create(the_repository, sha1, obj)
 
-void *object_as_type(struct object *obj, enum object_type type, int quiet);
+extern void *object_as_type(struct object *obj,
+			    enum object_type type,
+			    int quiet);
 
 /*
  * Returns the object, having parsed it to find out what it is.
  *
  * Returns NULL if the object is missing or corrupt.
  */
-struct object *parse_object(const struct object_id *oid);
+extern struct object *object_store_parse_object(struct repository *r,
+						const struct object_id *oid);
+#define parse_object(sha1) \
+	object_store_parse_object(the_repository, sha1)
 
 /*
  * Like parse_object, but will die() instead of returning NULL. If the
  * "name" parameter is not NULL, it is included in the error message
  * (otherwise, the hex object ID is given).
  */
-struct object *parse_object_or_die(const struct object_id *oid, const char *name);
+extern struct object *object_store_parse_object_or_die(struct repository *r,
+						       const struct object_id *oid,
+						       const char *name);
+#define parse_object_or_die(oid, name) \
+	object_store_parse_object_or_die(the_repository, oid, name)
 
 /* Given the result of read_sha1_file(), returns the object after
  * parsing it.  eaten_p indicates if the object has a borrowed copy
  * of buffer and the caller should not free() it.
  */
-struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p);
+struct object *object_store_parse_object_buffer(struct repository *r,
+						const struct object_id *oid,
+						enum object_type type,
+						unsigned long size,
+						void *buffer,
+						int *eaten_p);
+#define parse_object_buffer(oid, type, size, buffer, eaten_p) \
+	object_store_parse_object_buffer(the_repository, oid, type, \
+					 size, buffer, eaten_p)
 
 /** Returns the object, with potentially excess memory allocated. **/
-struct object *lookup_unknown_object(const unsigned  char *sha1);
+struct object *object_store_lookup_unknown_object(struct repository *r,
+						  const unsigned char *sha1);
+#define lookup_unknown_object(sha1) \
+	object_store_lookup_unknown_object(the_repository, sha1)
 
 struct object_list *object_list_insert(struct object *item,
 				       struct object_list **list_p);
@@ -138,6 +168,7 @@ void object_array_remove_duplicates(struct object_array *array);
  */
 void object_array_clear(struct object_array *array);
 
-void clear_object_flags(unsigned flags);
+extern void object_store_clear_object_flags(struct repository *r, unsigned flags);
+#define clear_object_flags(flags) object_store_clear_object_flags(the_repository, flags);
 
 #endif /* OBJECT_H */
diff --git a/repository.h b/repository.h
index 417787f3ef..aeb50a19bc 100644
--- a/repository.h
+++ b/repository.h
@@ -1,6 +1,8 @@
 #ifndef REPOSITORY_H
 #define REPOSITORY_H
 
+#include "object.h"
+
 struct config_set;
 struct index_state;
 struct submodule_cache;
@@ -24,6 +26,10 @@ struct repository {
 	 * Cannot be NULL after initialization.
 	 */
 	char *objectdir;
+	struct object_store {
+		struct object **obj_hash;
+		int nr_objs, obj_hash_size;
+	} objects;
 
 	/*
 	 * Path to the repository's graft file.
-- 
2.13.2.559.g3a0a3146e4


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC/WIP PATCH] object store classification
  2017-07-06 20:27 [RFC/WIP PATCH] object store classification Stefan Beller
@ 2017-07-07  2:33 ` Junio C Hamano
  2017-07-07 14:56   ` Ben Peart
  0 siblings, 1 reply; 9+ messages in thread
From: Junio C Hamano @ 2017-07-07  2:33 UTC (permalink / raw)
  To: Stefan Beller; +Cc: git, peff

Stefan Beller <sbeller@google.com> writes:

> Subject: Re: [RFC/WIP PATCH] object store classification

I thought you are writing different object-store backends and
classifying them into many categories (e.g. local, networked,
telepathic, etc.)

It is a logical next step to put per-repository things into
the_repository.  I skimmed the patch and the changes smelled sane,
but I didn't read it really carefully with fine toothed comb.

Thanks.  The remainder kept for reference, but no additional
comments in there.

> This continues the efforts of bw/repo-object, making our code base
> more object oriented by adding the object store to the the repository struct.
>
> Long term goal of the series that would evolve from this discussion:
> * Easier to implement submodule features as it can be in the same process.
>
> Short term goal:
> * get rid of 'add_submodule_odb' in submodule.c
>   When fetching or pushing we need to determine if a submodule needs processing
>   by finding out if certain commits are present.  To do this we add the submodule
>   objects as an alternate object database and do the processing in the same
>   process. In case of multiple submodules, this would pollute the object store
>   hence being slower and increasing memory usage, too.
>
> This patch only changes the object store code to be object oriented based
> on the repository struct.
>
> To go for the short term goal we'd need to convert a few more places, mostly
> all the construction (blob.c, tree.c, commit.c)
>
> This is marked RFC as I'd want to gather feedback if the approach, presented
> in the header files is sound.
>
> Thanks!
>
> Signed-off-by: Stefan Beller <sbeller@google.com>
> ---
>
>   Peff, this area of the code seems performance sensitive for a variety of
>   use cases, specifically yours. So I cc'd you here. :)
>
>  object.c     | 77 +++++++++++++++++++++++++++++++++++-------------------------
>  object.h     | 51 ++++++++++++++++++++++++++++++++--------
>  repository.h |  6 +++++
>  3 files changed, 92 insertions(+), 42 deletions(-)
>
> diff --git a/object.c b/object.c
> index 06ba3a11d8..b5ec0bb2f9 100644
> --- a/object.c
> +++ b/object.c
> @@ -5,17 +5,15 @@
>  #include "commit.h"
>  #include "tag.h"
>  
> -static struct object **obj_hash;
> -static int nr_objs, obj_hash_size;
> -
> -unsigned int get_max_object_index(void)
> +unsigned int object_store_get_max_index(struct repository *r)
>  {
> -	return obj_hash_size;
> +	return r->objects.obj_hash_size;
>  }
>  
> -struct object *get_indexed_object(unsigned int idx)
> +struct object *object_store_get_indexed(struct repository *r,
> +					unsigned int idx)
>  {
> -	return obj_hash[idx];
> +	return r->objects.obj_hash[idx];
>  }
>  
>  static const char *object_type_strings[] = {
> @@ -82,11 +80,15 @@ static void insert_obj_hash(struct object *obj, struct object **hash, unsigned i
>   * Look up the record for the given sha1 in the hash map stored in
>   * obj_hash.  Return NULL if it was not found.
>   */
> -struct object *lookup_object(const unsigned char *sha1)
> +struct object *object_store_lookup(struct repository *r,
> +				   const unsigned char *sha1)
>  {
>  	unsigned int i, first;
>  	struct object *obj;
>  
> +	struct object **obj_hash = r->objects.obj_hash;
> +	int obj_hash_size = r->objects.obj_hash_size;
> +
>  	if (!obj_hash)
>  		return NULL;
>  
> @@ -114,29 +116,31 @@ struct object *lookup_object(const unsigned char *sha1)
>   * power of 2 (but at least 32).  Copy the existing values to the new
>   * hash map.
>   */
> -static void grow_object_hash(void)
> +static void grow_object_hash(struct repository *r)
>  {
>  	int i;
>  	/*
>  	 * Note that this size must always be power-of-2 to match hash_obj
>  	 * above.
>  	 */
> -	int new_hash_size = obj_hash_size < 32 ? 32 : 2 * obj_hash_size;
> +	int new_hash_size = r->objects.obj_hash_size < 32 ?
> +			32 : 2 * r->objects.obj_hash_size;
>  	struct object **new_hash;
>  
>  	new_hash = xcalloc(new_hash_size, sizeof(struct object *));
> -	for (i = 0; i < obj_hash_size; i++) {
> -		struct object *obj = obj_hash[i];
> +	for (i = 0; i < r->objects.obj_hash_size; i++) {
> +		struct object *obj = r->objects.obj_hash[i];
>  		if (!obj)
>  			continue;
>  		insert_obj_hash(obj, new_hash, new_hash_size);
>  	}
> -	free(obj_hash);
> -	obj_hash = new_hash;
> -	obj_hash_size = new_hash_size;
> +	free(r->objects.obj_hash);
> +	r->objects.obj_hash = new_hash;
> +	r->objects.obj_hash_size = new_hash_size;
>  }
>  
> -void *create_object(const unsigned char *sha1, void *o)
> +void *object_store_create(struct repository *r,
> +			  const unsigned char *sha1, void *o)
>  {
>  	struct object *obj = o;
>  
> @@ -145,15 +149,17 @@ void *create_object(const unsigned char *sha1, void *o)
>  	obj->flags = 0;
>  	hashcpy(obj->oid.hash, sha1);
>  
> -	if (obj_hash_size - 1 <= nr_objs * 2)
> -		grow_object_hash();
> +	if (r->objects.obj_hash_size - 1 <= r->objects.nr_objs * 2)
> +		grow_object_hash(r);
>  
> -	insert_obj_hash(obj, obj_hash, obj_hash_size);
> -	nr_objs++;
> +	insert_obj_hash(obj, r->objects.obj_hash, r->objects.obj_hash_size);
> +	r->objects.nr_objs++;
>  	return obj;
>  }
>  
> -void *object_as_type(struct object *obj, enum object_type type, int quiet)
> +void *object_as_type(struct object *obj,
> +		     enum object_type type,
> +		     int quiet)
>  {
>  	if (obj->type == type)
>  		return obj;
> @@ -172,15 +178,20 @@ void *object_as_type(struct object *obj, enum object_type type, int quiet)
>  	}
>  }
>  
> -struct object *lookup_unknown_object(const unsigned char *sha1)
> +struct object *object_store_lookup_unknown_object(struct repository *r,
> +						  const unsigned char *sha1)
>  {
> -	struct object *obj = lookup_object(sha1);
> +	struct object *obj = object_store_lookup(r, sha1);
>  	if (!obj)
> -		obj = create_object(sha1, alloc_object_node());
> +		obj = object_store_create(r, sha1, alloc_object_node());
>  	return obj;
>  }
>  
> -struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p)
> +struct object *object_store_parse_object_buffer(struct repository *r,
> +						const struct object_id *oid,
> +						enum object_type type,
> +						unsigned long size,
> +						void *buffer, int *eaten_p)
>  {
>  	struct object *obj;
>  	*eaten_p = 0;
> @@ -230,17 +241,19 @@ struct object *parse_object_buffer(const struct object_id *oid, enum object_type
>  	return obj;
>  }
>  
> -struct object *parse_object_or_die(const struct object_id *oid,
> -				   const char *name)
> +struct object *object_store_parse_object_or_die(struct repository *r,
> +						const struct object_id *oid,
> +						const char *name)
>  {
> -	struct object *o = parse_object(oid);
> +	struct object *o = object_store_parse_object(r, oid);
>  	if (o)
>  		return o;
>  
>  	die(_("unable to parse object: %s"), name ? name : oid_to_hex(oid));
>  }
>  
> -struct object *parse_object(const struct object_id *oid)
> +struct object *object_store_parse_object(struct repository *r,
> +					 const struct object_id *oid)
>  {
>  	unsigned long size;
>  	enum object_type type;
> @@ -413,12 +426,12 @@ void object_array_remove_duplicates(struct object_array *array)
>  	}
>  }
>  
> -void clear_object_flags(unsigned flags)
> +void object_store_clear_object_flags(struct repository *r, unsigned flags)
>  {
>  	int i;
>  
> -	for (i=0; i < obj_hash_size; i++) {
> -		struct object *obj = obj_hash[i];
> +	for (i = 0; i < r->objects.obj_hash_size; i++) {
> +		struct object *obj = r->objects.obj_hash[i];
>  		if (obj)
>  			obj->flags &= ~flags;
>  	}
> diff --git a/object.h b/object.h
> index 33e5cc9943..1b2ab8ee88 100644
> --- a/object.h
> +++ b/object.h
> @@ -1,6 +1,8 @@
>  #ifndef OBJECT_H
>  #define OBJECT_H
>  
> +#include "repository.h"
> +
>  struct object_list {
>  	struct object *item;
>  	struct object_list *next;
> @@ -59,12 +61,15 @@ extern int type_from_string_gently(const char *str, ssize_t, int gentle);
>  /*
>   * Return the current number of buckets in the object hashmap.
>   */
> -extern unsigned int get_max_object_index(void);
> +extern unsigned int object_store_get_max_index(struct repository *r);
> +#define get_max_object_index() object_store_get_max_index(the_repository)
>  
>  /*
>   * Return the object from the specified bucket in the object hashmap.
>   */
> -extern struct object *get_indexed_object(unsigned int);
> +extern struct object *object_store_get_indexed(struct repository *r,
> +					       unsigned int);
> +#define get_indexed_object(i) object_store_get_indexed(the_repository, i)
>  
>  /*
>   * This can be used to see if we have heard of the object before, but
> @@ -78,34 +83,59 @@ extern struct object *get_indexed_object(unsigned int);
>   * half-initialised objects, the caller is expected to initialize them
>   * by calling parse_object() on them.
>   */
> -struct object *lookup_object(const unsigned char *sha1);
> +extern struct object *object_store_lookup(struct repository *r,
> +					  const unsigned char *sha1);
> +#define lookup_object(hash) object_store_lookup(the_repository, hash)
>  
> -extern void *create_object(const unsigned char *sha1, void *obj);
> +extern void *object_store_create(struct repository *r,
> +				 const unsigned char *sha1,
> +				 void *obj);
> +#define create_object(sha1, obj) object_store_create(the_repository, sha1, obj)
>  
> -void *object_as_type(struct object *obj, enum object_type type, int quiet);
> +extern void *object_as_type(struct object *obj,
> +			    enum object_type type,
> +			    int quiet);
>  
>  /*
>   * Returns the object, having parsed it to find out what it is.
>   *
>   * Returns NULL if the object is missing or corrupt.
>   */
> -struct object *parse_object(const struct object_id *oid);
> +extern struct object *object_store_parse_object(struct repository *r,
> +						const struct object_id *oid);
> +#define parse_object(sha1) \
> +	object_store_parse_object(the_repository, sha1)
>  
>  /*
>   * Like parse_object, but will die() instead of returning NULL. If the
>   * "name" parameter is not NULL, it is included in the error message
>   * (otherwise, the hex object ID is given).
>   */
> -struct object *parse_object_or_die(const struct object_id *oid, const char *name);
> +extern struct object *object_store_parse_object_or_die(struct repository *r,
> +						       const struct object_id *oid,
> +						       const char *name);
> +#define parse_object_or_die(oid, name) \
> +	object_store_parse_object_or_die(the_repository, oid, name)
>  
>  /* Given the result of read_sha1_file(), returns the object after
>   * parsing it.  eaten_p indicates if the object has a borrowed copy
>   * of buffer and the caller should not free() it.
>   */
> -struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p);
> +struct object *object_store_parse_object_buffer(struct repository *r,
> +						const struct object_id *oid,
> +						enum object_type type,
> +						unsigned long size,
> +						void *buffer,
> +						int *eaten_p);
> +#define parse_object_buffer(oid, type, size, buffer, eaten_p) \
> +	object_store_parse_object_buffer(the_repository, oid, type, \
> +					 size, buffer, eaten_p)
>  
>  /** Returns the object, with potentially excess memory allocated. **/
> -struct object *lookup_unknown_object(const unsigned  char *sha1);
> +struct object *object_store_lookup_unknown_object(struct repository *r,
> +						  const unsigned char *sha1);
> +#define lookup_unknown_object(sha1) \
> +	object_store_lookup_unknown_object(the_repository, sha1)
>  
>  struct object_list *object_list_insert(struct object *item,
>  				       struct object_list **list_p);
> @@ -138,6 +168,7 @@ void object_array_remove_duplicates(struct object_array *array);
>   */
>  void object_array_clear(struct object_array *array);
>  
> -void clear_object_flags(unsigned flags);
> +extern void object_store_clear_object_flags(struct repository *r, unsigned flags);
> +#define clear_object_flags(flags) object_store_clear_object_flags(the_repository, flags);
>  
>  #endif /* OBJECT_H */
> diff --git a/repository.h b/repository.h
> index 417787f3ef..aeb50a19bc 100644
> --- a/repository.h
> +++ b/repository.h
> @@ -1,6 +1,8 @@
>  #ifndef REPOSITORY_H
>  #define REPOSITORY_H
>  
> +#include "object.h"
> +
>  struct config_set;
>  struct index_state;
>  struct submodule_cache;
> @@ -24,6 +26,10 @@ struct repository {
>  	 * Cannot be NULL after initialization.
>  	 */
>  	char *objectdir;
> +	struct object_store {
> +		struct object **obj_hash;
> +		int nr_objs, obj_hash_size;
> +	} objects;
>  
>  	/*
>  	 * Path to the repository's graft file.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC/WIP PATCH] object store classification
  2017-07-07  2:33 ` Junio C Hamano
@ 2017-07-07 14:56   ` Ben Peart
  2017-07-07 16:50     ` Junio C Hamano
  0 siblings, 1 reply; 9+ messages in thread
From: Ben Peart @ 2017-07-07 14:56 UTC (permalink / raw)
  To: Junio C Hamano, Stefan Beller; +Cc: git, peff



On 7/6/2017 10:33 PM, Junio C Hamano wrote:
> Stefan Beller <sbeller@google.com> writes:
> 
>> Subject: Re: [RFC/WIP PATCH] object store classification
> 
> I thought you are writing different object-store backends and
> classifying them into many categories (e.g. local, networked,
> telepathic, etc.)
> 
> It is a logical next step to put per-repository things into
> the_repository.  I skimmed the patch and the changes smelled sane,
> but I didn't read it really carefully with fine toothed comb.
> 
> Thanks.  The remainder kept for reference, but no additional
> comments in there.
> 
>> This continues the efforts of bw/repo-object, making our code base
>> more object oriented by adding the object store to the the repository struct.
>>

It's great to see this effort continuing.

>> Long term goal of the series that would evolve from this discussion:
>> * Easier to implement submodule features as it can be in the same process.
>>
>> Short term goal:
>> * get rid of 'add_submodule_odb' in submodule.c
>>    When fetching or pushing we need to determine if a submodule needs processing
>>    by finding out if certain commits are present.  To do this we add the submodule
>>    objects as an alternate object database and do the processing in the same
>>    process. In case of multiple submodules, this would pollute the object store
>>    hence being slower and increasing memory usage, too.
>>
>> This patch only changes the object store code to be object oriented based
>> on the repository struct.
>>
>> To go for the short term goal we'd need to convert a few more places, mostly
>> all the construction (blob.c, tree.c, commit.c)
>>

For more API/state design purity, I wonder if there should be an 
object_store structure that is passed to each of the object store APIs 
instead of passing the repository object. The repository object could 
then contain an instance of the object_store structure.

That said, I haven't take a close look at all the code in object.c to 
see if all the data needed can be cleanly abstracted into an 
object_store structure.

One concern I have is that the global state refactoring effort will just 
result in all the global state getting moved into a single (global) 
repository object thus limiting it's usefulness.

I'd also love to see this followed up with additional patches that would 
remove the back compat macros.  Should be a relatively straight forward 
"search and replace" series of patches.  That will eliminate the code 
readability/maintenance issues that comes along with macros in addition 
to the APIs themselves.

No additional comments below...

>> This is marked RFC as I'd want to gather feedback if the approach, presented
>> in the header files is sound.
>>
>> Thanks!
>>
>> Signed-off-by: Stefan Beller <sbeller@google.com>
>> ---
>>
>>    Peff, this area of the code seems performance sensitive for a variety of
>>    use cases, specifically yours. So I cc'd you here. :)
>>
>>   object.c     | 77 +++++++++++++++++++++++++++++++++++-------------------------
>>   object.h     | 51 ++++++++++++++++++++++++++++++++--------
>>   repository.h |  6 +++++
>>   3 files changed, 92 insertions(+), 42 deletions(-)
>>
>> diff --git a/object.c b/object.c
>> index 06ba3a11d8..b5ec0bb2f9 100644
>> --- a/object.c
>> +++ b/object.c
>> @@ -5,17 +5,15 @@
>>   #include "commit.h"
>>   #include "tag.h"
>>   
>> -static struct object **obj_hash;
>> -static int nr_objs, obj_hash_size;
>> -
>> -unsigned int get_max_object_index(void)
>> +unsigned int object_store_get_max_index(struct repository *r)
>>   {
>> -	return obj_hash_size;
>> +	return r->objects.obj_hash_size;
>>   }
>>   
>> -struct object *get_indexed_object(unsigned int idx)
>> +struct object *object_store_get_indexed(struct repository *r,
>> +					unsigned int idx)
>>   {
>> -	return obj_hash[idx];
>> +	return r->objects.obj_hash[idx];
>>   }
>>   
>>   static const char *object_type_strings[] = {
>> @@ -82,11 +80,15 @@ static void insert_obj_hash(struct object *obj, struct object **hash, unsigned i
>>    * Look up the record for the given sha1 in the hash map stored in
>>    * obj_hash.  Return NULL if it was not found.
>>    */
>> -struct object *lookup_object(const unsigned char *sha1)
>> +struct object *object_store_lookup(struct repository *r,
>> +				   const unsigned char *sha1)
>>   {
>>   	unsigned int i, first;
>>   	struct object *obj;
>>   
>> +	struct object **obj_hash = r->objects.obj_hash;
>> +	int obj_hash_size = r->objects.obj_hash_size;
>> +
>>   	if (!obj_hash)
>>   		return NULL;
>>   
>> @@ -114,29 +116,31 @@ struct object *lookup_object(const unsigned char *sha1)
>>    * power of 2 (but at least 32).  Copy the existing values to the new
>>    * hash map.
>>    */
>> -static void grow_object_hash(void)
>> +static void grow_object_hash(struct repository *r)
>>   {
>>   	int i;
>>   	/*
>>   	 * Note that this size must always be power-of-2 to match hash_obj
>>   	 * above.
>>   	 */
>> -	int new_hash_size = obj_hash_size < 32 ? 32 : 2 * obj_hash_size;
>> +	int new_hash_size = r->objects.obj_hash_size < 32 ?
>> +			32 : 2 * r->objects.obj_hash_size;
>>   	struct object **new_hash;
>>   
>>   	new_hash = xcalloc(new_hash_size, sizeof(struct object *));
>> -	for (i = 0; i < obj_hash_size; i++) {
>> -		struct object *obj = obj_hash[i];
>> +	for (i = 0; i < r->objects.obj_hash_size; i++) {
>> +		struct object *obj = r->objects.obj_hash[i];
>>   		if (!obj)
>>   			continue;
>>   		insert_obj_hash(obj, new_hash, new_hash_size);
>>   	}
>> -	free(obj_hash);
>> -	obj_hash = new_hash;
>> -	obj_hash_size = new_hash_size;
>> +	free(r->objects.obj_hash);
>> +	r->objects.obj_hash = new_hash;
>> +	r->objects.obj_hash_size = new_hash_size;
>>   }
>>   
>> -void *create_object(const unsigned char *sha1, void *o)
>> +void *object_store_create(struct repository *r,
>> +			  const unsigned char *sha1, void *o)
>>   {
>>   	struct object *obj = o;
>>   
>> @@ -145,15 +149,17 @@ void *create_object(const unsigned char *sha1, void *o)
>>   	obj->flags = 0;
>>   	hashcpy(obj->oid.hash, sha1);
>>   
>> -	if (obj_hash_size - 1 <= nr_objs * 2)
>> -		grow_object_hash();
>> +	if (r->objects.obj_hash_size - 1 <= r->objects.nr_objs * 2)
>> +		grow_object_hash(r);
>>   
>> -	insert_obj_hash(obj, obj_hash, obj_hash_size);
>> -	nr_objs++;
>> +	insert_obj_hash(obj, r->objects.obj_hash, r->objects.obj_hash_size);
>> +	r->objects.nr_objs++;
>>   	return obj;
>>   }
>>   
>> -void *object_as_type(struct object *obj, enum object_type type, int quiet)
>> +void *object_as_type(struct object *obj,
>> +		     enum object_type type,
>> +		     int quiet)
>>   {
>>   	if (obj->type == type)
>>   		return obj;
>> @@ -172,15 +178,20 @@ void *object_as_type(struct object *obj, enum object_type type, int quiet)
>>   	}
>>   }
>>   
>> -struct object *lookup_unknown_object(const unsigned char *sha1)
>> +struct object *object_store_lookup_unknown_object(struct repository *r,
>> +						  const unsigned char *sha1)
>>   {
>> -	struct object *obj = lookup_object(sha1);
>> +	struct object *obj = object_store_lookup(r, sha1);
>>   	if (!obj)
>> -		obj = create_object(sha1, alloc_object_node());
>> +		obj = object_store_create(r, sha1, alloc_object_node());
>>   	return obj;
>>   }
>>   
>> -struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p)
>> +struct object *object_store_parse_object_buffer(struct repository *r,
>> +						const struct object_id *oid,
>> +						enum object_type type,
>> +						unsigned long size,
>> +						void *buffer, int *eaten_p)
>>   {
>>   	struct object *obj;
>>   	*eaten_p = 0;
>> @@ -230,17 +241,19 @@ struct object *parse_object_buffer(const struct object_id *oid, enum object_type
>>   	return obj;
>>   }
>>   
>> -struct object *parse_object_or_die(const struct object_id *oid,
>> -				   const char *name)
>> +struct object *object_store_parse_object_or_die(struct repository *r,
>> +						const struct object_id *oid,
>> +						const char *name)
>>   {
>> -	struct object *o = parse_object(oid);
>> +	struct object *o = object_store_parse_object(r, oid);
>>   	if (o)
>>   		return o;
>>   
>>   	die(_("unable to parse object: %s"), name ? name : oid_to_hex(oid));
>>   }
>>   
>> -struct object *parse_object(const struct object_id *oid)
>> +struct object *object_store_parse_object(struct repository *r,
>> +					 const struct object_id *oid)
>>   {
>>   	unsigned long size;
>>   	enum object_type type;
>> @@ -413,12 +426,12 @@ void object_array_remove_duplicates(struct object_array *array)
>>   	}
>>   }
>>   
>> -void clear_object_flags(unsigned flags)
>> +void object_store_clear_object_flags(struct repository *r, unsigned flags)
>>   {
>>   	int i;
>>   
>> -	for (i=0; i < obj_hash_size; i++) {
>> -		struct object *obj = obj_hash[i];
>> +	for (i = 0; i < r->objects.obj_hash_size; i++) {
>> +		struct object *obj = r->objects.obj_hash[i];
>>   		if (obj)
>>   			obj->flags &= ~flags;
>>   	}
>> diff --git a/object.h b/object.h
>> index 33e5cc9943..1b2ab8ee88 100644
>> --- a/object.h
>> +++ b/object.h
>> @@ -1,6 +1,8 @@
>>   #ifndef OBJECT_H
>>   #define OBJECT_H
>>   
>> +#include "repository.h"
>> +
>>   struct object_list {
>>   	struct object *item;
>>   	struct object_list *next;
>> @@ -59,12 +61,15 @@ extern int type_from_string_gently(const char *str, ssize_t, int gentle);
>>   /*
>>    * Return the current number of buckets in the object hashmap.
>>    */
>> -extern unsigned int get_max_object_index(void);
>> +extern unsigned int object_store_get_max_index(struct repository *r);
>> +#define get_max_object_index() object_store_get_max_index(the_repository)
>>   
>>   /*
>>    * Return the object from the specified bucket in the object hashmap.
>>    */
>> -extern struct object *get_indexed_object(unsigned int);
>> +extern struct object *object_store_get_indexed(struct repository *r,
>> +					       unsigned int);
>> +#define get_indexed_object(i) object_store_get_indexed(the_repository, i)
>>   
>>   /*
>>    * This can be used to see if we have heard of the object before, but
>> @@ -78,34 +83,59 @@ extern struct object *get_indexed_object(unsigned int);
>>    * half-initialised objects, the caller is expected to initialize them
>>    * by calling parse_object() on them.
>>    */
>> -struct object *lookup_object(const unsigned char *sha1);
>> +extern struct object *object_store_lookup(struct repository *r,
>> +					  const unsigned char *sha1);
>> +#define lookup_object(hash) object_store_lookup(the_repository, hash)
>>   
>> -extern void *create_object(const unsigned char *sha1, void *obj);
>> +extern void *object_store_create(struct repository *r,
>> +				 const unsigned char *sha1,
>> +				 void *obj);
>> +#define create_object(sha1, obj) object_store_create(the_repository, sha1, obj)
>>   
>> -void *object_as_type(struct object *obj, enum object_type type, int quiet);
>> +extern void *object_as_type(struct object *obj,
>> +			    enum object_type type,
>> +			    int quiet);
>>   
>>   /*
>>    * Returns the object, having parsed it to find out what it is.
>>    *
>>    * Returns NULL if the object is missing or corrupt.
>>    */
>> -struct object *parse_object(const struct object_id *oid);
>> +extern struct object *object_store_parse_object(struct repository *r,
>> +						const struct object_id *oid);
>> +#define parse_object(sha1) \
>> +	object_store_parse_object(the_repository, sha1)
>>   
>>   /*
>>    * Like parse_object, but will die() instead of returning NULL. If the
>>    * "name" parameter is not NULL, it is included in the error message
>>    * (otherwise, the hex object ID is given).
>>    */
>> -struct object *parse_object_or_die(const struct object_id *oid, const char *name);
>> +extern struct object *object_store_parse_object_or_die(struct repository *r,
>> +						       const struct object_id *oid,
>> +						       const char *name);
>> +#define parse_object_or_die(oid, name) \
>> +	object_store_parse_object_or_die(the_repository, oid, name)
>>   
>>   /* Given the result of read_sha1_file(), returns the object after
>>    * parsing it.  eaten_p indicates if the object has a borrowed copy
>>    * of buffer and the caller should not free() it.
>>    */
>> -struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p);
>> +struct object *object_store_parse_object_buffer(struct repository *r,
>> +						const struct object_id *oid,
>> +						enum object_type type,
>> +						unsigned long size,
>> +						void *buffer,
>> +						int *eaten_p);
>> +#define parse_object_buffer(oid, type, size, buffer, eaten_p) \
>> +	object_store_parse_object_buffer(the_repository, oid, type, \
>> +					 size, buffer, eaten_p)
>>   
>>   /** Returns the object, with potentially excess memory allocated. **/
>> -struct object *lookup_unknown_object(const unsigned  char *sha1);
>> +struct object *object_store_lookup_unknown_object(struct repository *r,
>> +						  const unsigned char *sha1);
>> +#define lookup_unknown_object(sha1) \
>> +	object_store_lookup_unknown_object(the_repository, sha1)
>>   
>>   struct object_list *object_list_insert(struct object *item,
>>   				       struct object_list **list_p);
>> @@ -138,6 +168,7 @@ void object_array_remove_duplicates(struct object_array *array);
>>    */
>>   void object_array_clear(struct object_array *array);
>>   
>> -void clear_object_flags(unsigned flags);
>> +extern void object_store_clear_object_flags(struct repository *r, unsigned flags);
>> +#define clear_object_flags(flags) object_store_clear_object_flags(the_repository, flags);
>>   
>>   #endif /* OBJECT_H */
>> diff --git a/repository.h b/repository.h
>> index 417787f3ef..aeb50a19bc 100644
>> --- a/repository.h
>> +++ b/repository.h
>> @@ -1,6 +1,8 @@
>>   #ifndef REPOSITORY_H
>>   #define REPOSITORY_H
>>   
>> +#include "object.h"
>> +
>>   struct config_set;
>>   struct index_state;
>>   struct submodule_cache;
>> @@ -24,6 +26,10 @@ struct repository {
>>   	 * Cannot be NULL after initialization.
>>   	 */
>>   	char *objectdir;
>> +	struct object_store {
>> +		struct object **obj_hash;
>> +		int nr_objs, obj_hash_size;
>> +	} objects;
>>   
>>   	/*
>>   	 * Path to the repository's graft file.
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC/WIP PATCH] object store classification
  2017-07-07 14:56   ` Ben Peart
@ 2017-07-07 16:50     ` Junio C Hamano
  2017-07-11  1:17       ` Stefan Beller
  0 siblings, 1 reply; 9+ messages in thread
From: Junio C Hamano @ 2017-07-07 16:50 UTC (permalink / raw)
  To: Ben Peart; +Cc: Stefan Beller, git, peff

Ben Peart <peartben@gmail.com> writes:

> For more API/state design purity, I wonder if there should be an
> object_store structure that is passed to each of the object store APIs
> instead of passing the repository object. The repository object could
> then contain an instance of the object_store structure.
>
> That said, I haven't take a close look at all the code in object.c to
> see if all the data needed can be cleanly abstracted into an
> object_store structure.

My gut feeling was it is just the large hashtable that keeps track of
objects we have seen, but the object replacement/grafts and other
things may also want to become per-repository.

> One concern I have is that the global state refactoring effort will
> just result in all the global state getting moved into a single
> (global) repository object thus limiting it's usefulness.

I actually am not worried about it that much, and I say this with
the background of having done the same "grouping a set of global
state variables into a single structure and turning them into a
single default instance" for the_index.  Whether you like it or not,
the majority of operations do work on the default instance---that
was why the operations could live with just "a set of global state
variables" in the first place, and the convenience compatibility
macros that allow you to operate on the fields of the default
instance as if they were separate variables have been a huge
typesaver that also reduces the cognitive burden.  I'd expect that
the same will hold for the new "repository" and the "object_store"
abstractions.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC/WIP PATCH] object store classification
  2017-07-07 16:50     ` Junio C Hamano
@ 2017-07-11  1:17       ` Stefan Beller
  2017-07-11 18:01         ` Brandon Williams
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Beller @ 2017-07-11  1:17 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben Peart, git@vger.kernel.org, Jeff King

On Fri, Jul 7, 2017 at 9:50 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Ben Peart <peartben@gmail.com> writes:
>
>> For more API/state design purity, I wonder if there should be an
>> object_store structure that is passed to each of the object store APIs
>> instead of passing the repository object. The repository object could
>> then contain an instance of the object_store structure.
>>
>> That said, I haven't take a close look at all the code in object.c to
>> see if all the data needed can be cleanly abstracted into an
>> object_store structure.
>
> My gut feeling was it is just the large hashtable that keeps track of
> objects we have seen, but the object replacement/grafts and other
> things may also want to become per-repository.

This is similar to the_index which is referenced by the_repository.
But as we do not have anything like the_object_store already, we are
free to design it, as the required work that needs to be put in is the
same.

With the object replacements/grafts coming up as well as alternates,
we definitely want that to be per repository, the question is if we rather
want

  the_repository -> many object_stores (one for each, alternate, grafts,
      and the usual place at $GIT_DIR/objects
  where the object_store is a hashmap, maybe an additional describing
  string or path.

or

  the_repository -> the_object_store
  but the object store is a complex beast having different hash tables
  for the different alternates.

or

  the_repository -> the_object_store_hash_map
  which is this patch that would try to put any object related to this
  repository into the same hashmap and the hashmap is not special
  for each of the different object locations.


>
>> One concern I have is that the global state refactoring effort will
>> just result in all the global state getting moved into a single
>> (global) repository object thus limiting it's usefulness.
>
> I actually am not worried about it that much, and I say this with
> the background of having done the same "grouping a set of global
> state variables into a single structure and turning them into a
> single default instance" for the_index.  Whether you like it or not,
> the majority of operations do work on the default instance---that
> was why the operations could live with just "a set of global state
> variables" in the first place, and the convenience compatibility
> macros that allow you to operate on the fields of the default
> instance as if they were separate variables have been a huge
> typesaver that also reduces the cognitive burden.  I'd expect that
> the same will hold for the new "repository" and the "object_store"
> abstractions.

Sounds reasonable to expect.

Thanks,
Stefan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC/WIP PATCH] object store classification
  2017-07-11  1:17       ` Stefan Beller
@ 2017-07-11 18:01         ` Brandon Williams
  2017-07-11 18:46           ` Junio C Hamano
  0 siblings, 1 reply; 9+ messages in thread
From: Brandon Williams @ 2017-07-11 18:01 UTC (permalink / raw)
  To: Stefan Beller; +Cc: Junio C Hamano, Ben Peart, git@vger.kernel.org, Jeff King

On 07/10, Stefan Beller wrote:
> On Fri, Jul 7, 2017 at 9:50 AM, Junio C Hamano <gitster@pobox.com> wrote:
> > Ben Peart <peartben@gmail.com> writes:
> >
> >> For more API/state design purity, I wonder if there should be an
> >> object_store structure that is passed to each of the object store APIs
> >> instead of passing the repository object. The repository object could
> >> then contain an instance of the object_store structure.
> >>
> >> That said, I haven't take a close look at all the code in object.c to
> >> see if all the data needed can be cleanly abstracted into an
> >> object_store structure.
> >
> > My gut feeling was it is just the large hashtable that keeps track of
> > objects we have seen, but the object replacement/grafts and other
> > things may also want to become per-repository.
> 
> This is similar to the_index which is referenced by the_repository.
> But as we do not have anything like the_object_store already, we are
> free to design it, as the required work that needs to be put in is the
> same.
> 
> With the object replacements/grafts coming up as well as alternates,
> we definitely want that to be per repository, the question is if we rather
> want
> 
>   the_repository -> many object_stores (one for each, alternate, grafts,
>       and the usual place at $GIT_DIR/objects
>   where the object_store is a hashmap, maybe an additional describing
>   string or path.
> 
> or
> 
>   the_repository -> the_object_store
>   but the object store is a complex beast having different hash tables
>   for the different alternates.

After looking at the patch and some of the comments here I think that
this is probably the best approach with a few tweaks (which may be
completely unfounded because I'm not familiar with all the object store
code).

In an OO world I would envision a single object (let's say 'struct
object_store') which is responsible for managing a repository's objects
no matter where the individual objects came from (main object store or
an alternate for that repository).  And if I understand correctly the
single hash table that exists now caches objects like this.

I also think that such a 'struct object_store' should probably be an
opaque type to a majority of the code base.  This means that it probably
shouldn't have its definition in 'repository.h'.

As far as API, I think it should be similar to the new repo_config (old
one too, though it was implicit) API in that the code base doesn't need
to know about 'struct configset', it just passes a pointer to the
repository and then the 'struct configset' which is stored in the
repository is operated on under the hood.  This way the code base would
just query for an object using the repository as a handle like:

  get_object(repo, OID);

  and not:

  get_object(repo->object_store, OID);

Of course under the hood it would be preferable to have the functions
operate on the object_store struct explicitly.

> 
> or
> 
>   the_repository -> the_object_store_hash_map
>   which is this patch that would try to put any object related to this
>   repository into the same hashmap and the hashmap is not special
>   for each of the different object locations.
> 
> 
> >
> >> One concern I have is that the global state refactoring effort will
> >> just result in all the global state getting moved into a single
> >> (global) repository object thus limiting it's usefulness.

I think we do need to think about this, but it shouldn't be too much of
a concern right now.  The first step is to get enough of the object
store object oriented such that you can have two object stores
corresponding to two different repositories working in parallel.

> >
> > I actually am not worried about it that much, and I say this with
> > the background of having done the same "grouping a set of global
> > state variables into a single structure and turning them into a
> > single default instance" for the_index.  Whether you like it or not,
> > the majority of operations do work on the default instance---that
> > was why the operations could live with just "a set of global state
> > variables" in the first place, and the convenience compatibility
> > macros that allow you to operate on the fields of the default
> > instance as if they were separate variables have been a huge
> > typesaver that also reduces the cognitive burden.  I'd expect that
> > the same will hold for the new "repository" and the "object_store"
> > abstractions.
> 
> Sounds reasonable to expect.
> 
> Thanks,
> Stefan

-- 
Brandon Williams

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC/WIP PATCH] object store classification
  2017-07-11 18:01         ` Brandon Williams
@ 2017-07-11 18:46           ` Junio C Hamano
  2017-07-11 19:49             ` Stefan Beller
  0 siblings, 1 reply; 9+ messages in thread
From: Junio C Hamano @ 2017-07-11 18:46 UTC (permalink / raw)
  To: Brandon Williams; +Cc: Stefan Beller, Ben Peart, git@vger.kernel.org, Jeff King

Brandon Williams <bmwill@google.com> writes:

>>   the_repository -> the_object_store
>>   but the object store is a complex beast having different hash tables
>>   for the different alternates.
>
> After looking at the patch and some of the comments here I think that
> this is probably the best approach with a few tweaks (which may be
> completely unfounded because I'm not familiar with all the object store
> code).
>
> In an OO world I would envision a single object (let's say 'struct
> object_store') which is responsible for managing a repository's objects
> no matter where the individual objects came from (main object store or
> an alternate for that repository).  And if I understand correctly the
> single hash table that exists now caches objects like this.

I would say that conceptually an object-store object has an
interface to answer "give me info on the object I can refer to with
this object name".  At the implementation level, it should have a
hashtable (lazily populated) for all the objects in a single
$GIT_OBJECT_DIRECTORY, grafts/replace info, and a set of pointers to
other object-store instances that are its alternate object stores.
You'd need to have a mechanism to avoid creating cycles in these
pointers, of course.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC/WIP PATCH] object store classification
  2017-07-11 18:46           ` Junio C Hamano
@ 2017-07-11 19:49             ` Stefan Beller
  2017-07-11 20:27               ` Junio C Hamano
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Beller @ 2017-07-11 19:49 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Brandon Williams, Ben Peart, git@vger.kernel.org, Jeff King

On Tue, Jul 11, 2017 at 11:46 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Brandon Williams <bmwill@google.com> writes:
>
>>>   the_repository -> the_object_store
>>>   but the object store is a complex beast having different hash tables
>>>   for the different alternates.
>>
>> After looking at the patch and some of the comments here I think that
>> this is probably the best approach with a few tweaks (which may be
>> completely unfounded because I'm not familiar with all the object store
>> code).
>>
>> In an OO world I would envision a single object (let's say 'struct
>> object_store') which is responsible for managing a repository's objects
>> no matter where the individual objects came from (main object store or
>> an alternate for that repository).  And if I understand correctly the
>> single hash table that exists now caches objects like this.
>
> I would say that conceptually an object-store object has an
> interface to answer "give me info on the object I can refer to with
> this object name".

ok.

> At the implementation level, it should have a
> hashtable (lazily populated) for all the objects in a single
> $GIT_OBJECT_DIRECTORY, grafts/replace info, and a set of pointers to
> other object-store instances that are its alternate object stores.

So one repository has one or more object stores?

I would expect that most of the time the question from above
"give me info on the object I can refer to with this object name"
is asked with the additional information: "and I know it is in this
repository", so we rather want to have

  lookup_object(struct *repo, char *name);

instead of

  lookup_object(struct *object_store, char *name);

because the caller most likely would not care if the object
comes from the alternate or from the main object store?
(There may be special cases in e.g. repacking/gc where
we want to instruct the repacker to only repack the main
object store, ignoring any alternates or such, but any other
command would not care, AFAICT)

So we could have the convenience function

    lookup_object(struct *repo, char *name)
    {
        foreach_objectstore(repo, lookup_object_in_single_store, object)
            if (object)
                return object;
        return NULL;
    }

but such code is related to storing objects, that I would prefer to see
it in the object store (object.{h,c}) instead of the repository code.
Also my initial plan was to have all objects (including from any alternate)
in a single hashmap per repository as that seems to be most efficient
assuming all alternates fit into memory.

This whole object store object orientation is only started to not have
the memory pressure from lots of submodule objects polluting the
main object store. When we have its own hashmap for each alternate
our performance would degrade with the number of alternates.
(Assuming the hypothetical worst case of one alternate per object,
then the lookup time would be linear in time given the above function,
I wonder if there are users that heavily use lots of alternates such that
this performance characteristics would be measurable in the code to
be written)

> You'd need to have a mechanism to avoid creating cycles in these
> pointers, of course.

This is another complication with many hashmaps (=object stores).
In the future, is it likely that we would want to drop an alternate
store from within a running process? If so we'd want to have
hashmaps per alternate, otherwise I only see disadvantages
for more than one hashmap (-> more than one object store)
per repository. And with that said, I think we can make a wrapper
struct object_store, that would encapsulate all needed variables.

+       struct object_store {
+               struct object **obj_hash;
+               int nr_objs, obj_hash_size;
+       } objects;

But instead of defining it at the repository, we'd rather define it
in object.h.

Stefan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC/WIP PATCH] object store classification
  2017-07-11 19:49             ` Stefan Beller
@ 2017-07-11 20:27               ` Junio C Hamano
  0 siblings, 0 replies; 9+ messages in thread
From: Junio C Hamano @ 2017-07-11 20:27 UTC (permalink / raw)
  To: Stefan Beller; +Cc: Brandon Williams, Ben Peart, git@vger.kernel.org, Jeff King

Stefan Beller <sbeller@google.com> writes:

>> At the implementation level, it should have a
>> hashtable (lazily populated) for all the objects in a single
>> $GIT_OBJECT_DIRECTORY, grafts/replace info, and a set of pointers to
>> other object-store instances that are its alternate object stores.
>
> So one repository has one or more object stores?

One repository foo/.git/ has one foo/.git/objects/ directory, so it
has its own single object store.  That object store may refer to
another object store by having foo/.git/objects/info/alternates.

Similarly, foo/.git/objects/info/grafts and foo/.git/refs/replace/
would belong to the single object store repository foo/.git/ has.

> I would expect that most of the time the question from above
> "give me info on the object I can refer to with this object name"
> is asked with the additional information: "and I know it is in this
> repository", so we rather want to have
>
>   lookup_object(struct *repo, char *name);
>
> instead of
>
>   lookup_object(struct *object_store, char *name);

Absolutely.  That is why repository has its own single object_store,
which may refer to other object_stores.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-07-11 20:28 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-07-06 20:27 [RFC/WIP PATCH] object store classification Stefan Beller
2017-07-07  2:33 ` Junio C Hamano
2017-07-07 14:56   ` Ben Peart
2017-07-07 16:50     ` Junio C Hamano
2017-07-11  1:17       ` Stefan Beller
2017-07-11 18:01         ` Brandon Williams
2017-07-11 18:46           ` Junio C Hamano
2017-07-11 19:49             ` Stefan Beller
2017-07-11 20:27               ` Junio C Hamano

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).