git@vger.kernel.org mailing list mirror (one of many)
 help / Atom feed
* [PATCH v1] read-cache: speed up index load through parallelization
@ 2018-08-23 15:41 Ben Peart
  2018-08-23 17:31 ` Stefan Beller
                   ` (8 more replies)
  0 siblings, 9 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-23 15:41 UTC (permalink / raw)
  To: git; +Cc: gitster, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them. Once the
threads are complete and the cache entries are loaded, the rest of the
extensions can be loaded and processed normally on the primary thread.

Performance impact:

read cache .git/index times on a synthetic repo with:

100,000 entries
FALSE       TRUE        Savings     %Savings
0.014798767 0.009580433 0.005218333 35.26%

1,000,000 entries
FALSE       TRUE        Savings     %Savings
0.240896533 0.1751243   0.065772233 27.30%

read cache .git/index times on an actual repo with:

~3M entries
FALSE       TRUE        Savings     %Savings
0.59898098  0.4513169   0.14766408  24.65%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---

Notes:
    Base Ref: master
    Web-Diff: https://github.com/benpeart/git/commit/67a700419b
    Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v1 && git checkout 67a700419b

 Documentation/config.txt |   8 ++
 config.c                 |  13 +++
 config.h                 |   1 +
 read-cache.c             | 218 ++++++++++++++++++++++++++++++++++-----
 4 files changed, 216 insertions(+), 24 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1c42364988..3344685cc4 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -899,6 +899,14 @@ relatively high IO latencies.  When enabled, Git will do the
 index comparison to the filesystem data in parallel, allowing
 overlapping IO's.  Defaults to true.
 
+core.fastIndex::
+       Enable parallel index loading
++
+This can speed up operations like 'git diff' and 'git status' especially
+when the index is very large.  When enabled, Git will do the index
+loading from the on disk format to the in-memory format in parallel.
+Defaults to true.
+
 core.createObject::
 	You can set this to 'link', in which case a hardlink followed by
 	a delete of the source are used to make sure that object creation
diff --git a/config.c b/config.c
index 9a0b10d4bc..883092fdd3 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,19 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+int git_config_get_fast_index(void)
+{
+	int val;
+
+	if (!git_config_get_maybe_bool("core.fastindex", &val))
+		return val;
+
+	if (getenv("GIT_FASTINDEX_TEST"))
+		return 1;
+
+	return -1; /* default value */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..74ca4e7db5 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_fast_index(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..0fa7e1a04c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -24,6 +24,10 @@
 #include "utf8.h"
 #include "fsmonitor.h"
 
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
 /* Mask for the name length in ce_flags in the on-disk index */
 
 #define CE_NAMEMASK  (0x0fff)
@@ -1889,16 +1893,203 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+static unsigned long load_cache_entry_block(struct index_state *istate, struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap, unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool, 0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifdef NO_PTHREADS
+
+#define load_cache_entries load_all_cache_entries
+
+#else
+
+#include "thread-utils.h"
+
+/*
+* Mostly randomly chosen maximum thread counts: we
+* cap the parallelism to online_cpus() threads, and we want
+* to have at least 7500 cache entries per thread for it to
+* be worth starting a thread.
+*/
+#define THREAD_COST		(7500)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+* A thread proc to run the load_cache_entries() computation
+* across multiple background threads.
+*/
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int threads, cpus, thread_nr;
+	unsigned long consumed;
+	int i, thread;
+
+	cpus = online_cpus();
+	threads = istate->cache_nr / THREAD_COST;
+	if (threads > cpus)
+		threads = cpus;
+
+	/* enable testing with fewer than default minimum of entries */
+	if ((istate->cache_nr > 1) && (threads < 2) && getenv("GIT_FASTINDEX_TEST"))
+		threads = 2;
+
+	if (threads < 2 || !git_config_get_fast_index())
+		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	thread_nr = (istate->cache_nr + threads - 1) / threads;
+	data = xcalloc(threads, sizeof(struct load_cache_entries_thread_data));
+
+	/* loop through index entries starting a thread for every thread_nr entries */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/* we've reached the begining of a block of cache entries, kick off a thread to process them */
+		if (0 == i % thread_nr) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = min(thread_nr, istate->cache_nr - i);
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+						  estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+						  estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+			if (++thread == threads || p->nr != thread_nr)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1935,29 +2126,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
 	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
+	src_offset += load_cache_entries(istate, mmap, mmap_size, src_offset);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 

base-commit: 29d9e3e2c47dd4b5053b0a98c891878d398463e3
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
@ 2018-08-23 17:31 ` Stefan Beller
  2018-08-23 19:44   ` Ben Peart
  2018-08-24 18:40   ` Duy Nguyen
  2018-08-23 18:06 ` Junio C Hamano
                   ` (7 subsequent siblings)
  8 siblings, 2 replies; 153+ messages in thread
From: Stefan Beller @ 2018-08-23 17:31 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, Junio C Hamano

On Thu, Aug 23, 2018 at 8:45 AM Ben Peart <Ben.Peart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by creating
> multiple threads to divide the work of loading and converting the cache
> entries across all available CPU cores.
>
> It accomplishes this by having the primary thread loop across the index file
> tracking the offset and (for V4 indexes) expanding the name. It creates a
> thread to process each block of entries as it comes to them. Once the
> threads are complete and the cache entries are loaded, the rest of the
> extensions can be loaded and processed normally on the primary thread.
>
> Performance impact:
>
> read cache .git/index times on a synthetic repo with:
>
> 100,000 entries
> FALSE       TRUE        Savings     %Savings
> 0.014798767 0.009580433 0.005218333 35.26%
>
> 1,000,000 entries
> FALSE       TRUE        Savings     %Savings
> 0.240896533 0.1751243   0.065772233 27.30%
>
> read cache .git/index times on an actual repo with:
>
> ~3M entries
> FALSE       TRUE        Savings     %Savings
> 0.59898098  0.4513169   0.14766408  24.65%
>
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---
>
> Notes:
>     Base Ref: master
>     Web-Diff: https://github.com/benpeart/git/commit/67a700419b
>     Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v1 && git checkout 67a700419b
>
>  Documentation/config.txt |   8 ++
>  config.c                 |  13 +++
>  config.h                 |   1 +
>  read-cache.c             | 218 ++++++++++++++++++++++++++++++++++-----
>  4 files changed, 216 insertions(+), 24 deletions(-)
>
> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 1c42364988..3344685cc4 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -899,6 +899,14 @@ relatively high IO latencies.  When enabled, Git will do the
>  index comparison to the filesystem data in parallel, allowing
>  overlapping IO's.  Defaults to true.
>
> +core.fastIndex::
> +       Enable parallel index loading
> ++
> +This can speed up operations like 'git diff' and 'git status' especially
> +when the index is very large.  When enabled, Git will do the index
> +loading from the on disk format to the in-memory format in parallel.
> +Defaults to true.

"fast" is a non-descriptive word as we try to be fast in any operation?
Maybe core.parallelIndexReading as that just describes what it
turns on/off, without second guessing its effects?
(Are there still computers with just a single CPU, where this would not
make it faster? ;-))


> +int git_config_get_fast_index(void)
> +{
> +       int val;
> +
> +       if (!git_config_get_maybe_bool("core.fastindex", &val))
> +               return val;
> +
> +       if (getenv("GIT_FASTINDEX_TEST"))
> +               return 1;

We look at this env value just before calling this function;
can we write it to only look at the env variable once?

> +++ b/config.h
> @@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
>  extern int git_config_get_split_index(void);
>  extern int git_config_get_max_percent_split_change(void);
>  extern int git_config_get_fsmonitor(void);
> +extern int git_config_get_fast_index(void);

Oh. nd/no-extern did not cover config.h


>
> +#ifndef min
> +#define min(a,b) (((a) < (b)) ? (a) : (b))
> +#endif

We do not have a minimum function in the tree,
except for xdiff/xmacros.h:29: XDL_MIN.
I wonder what the rationale is for not having a MIN()
definition, I think we discussed that on the list a couple
times but the rationale escaped me.

If we introduce a min/max macro, can we put it somewhere
more prominent? (I would find it useful elsewhere)

> +/*
> +* Mostly randomly chosen maximum thread counts: we
> +* cap the parallelism to online_cpus() threads, and we want
> +* to have at least 7500 cache entries per thread for it to
> +* be worth starting a thread.
> +*/
> +#define THREAD_COST            (7500)

This reads very similar to preload-index.c THREAD_COST

> +       /* loop through index entries starting a thread for every thread_nr entries */
> +       consumed = thread = 0;
> +       for (i = 0; ; i++) {
> +               struct ondisk_cache_entry *ondisk;
> +               const char *name;
> +               unsigned int flags;
> +
> +               /* we've reached the begining of a block of cache entries, kick off a thread to process them */

beginning

Thanks,
Stefan

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
  2018-08-23 17:31 ` Stefan Beller
@ 2018-08-23 18:06 ` Junio C Hamano
  2018-08-23 20:33   ` Ben Peart
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-08-23 18:06 UTC (permalink / raw)
  To: Ben Peart; +Cc: git

Ben Peart <Ben.Peart@microsoft.com> writes:

> This patch helps address the CPU cost of loading the index by creating
> multiple threads to divide the work of loading and converting the cache
> entries across all available CPU cores.

Nice.

> +int git_config_get_fast_index(void)
> +{
> +	int val;
> +
> +	if (!git_config_get_maybe_bool("core.fastindex", &val))
> +		return val;
> +
> +	if (getenv("GIT_FASTINDEX_TEST"))
> +		return 1;

It probably makes sense to use git_env_bool() to be consistent,
which allows GIT_FASTINDEX_TEST=0 to turn it off after this becomes
the default.

> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..0fa7e1a04c 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -24,6 +24,10 @@
>  #include "utf8.h"
>  #include "fsmonitor.h"
>  
> +#ifndef min
> +#define min(a,b) (((a) < (b)) ? (a) : (b))
> +#endif

Let's lose this, which is used only once, even though it could be
used elsewhere but not used (e.g. threads vs cpus near the beginning
of load_cache_entries()).

> +static unsigned long load_cache_entry_block(struct index_state *istate, struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap, unsigned long start_offset, struct strbuf *previous_name)

Wrap and possibly add comment before the function to describe what
it does and what its parameters mean?

> +{
> +	int i;
> +	unsigned long src_offset = start_offset;
> +
> +	for (i = offset; i < offset + nr; i++) {
> +		struct ondisk_cache_entry *disk_ce;
> +		struct cache_entry *ce;
> +		unsigned long consumed;
> +
> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
> +		set_index_entry(istate, i, ce);
> +
> +		src_offset += consumed;
> +	}
> +	return src_offset - start_offset;
> +}

OK.

> +static unsigned long load_all_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
> +{

(following aloud) This "all" variant is "one thread does all", iow,
unthreaded version.  Makes sense.

> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	unsigned long consumed;
> +
> +	if (istate->version == 4) {
> +		previous_name = &previous_name_buf;
> +		mem_pool_init(&istate->ce_mem_pool,
> +			      estimate_cache_size_from_compressed(istate->cache_nr));
> +	} else {
> +		previous_name = NULL;
> +		mem_pool_init(&istate->ce_mem_pool,
> +			      estimate_cache_size(mmap_size, istate->cache_nr));
> +	}

I count there are three instances of "if version 4 use the strbuf
for name-buf, otherwise..." in this patch, which made me wonder if
we can make them shared more and/or if it makes sense to attempt to
do so.

> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool, 0, istate->cache_nr, mmap, src_offset, previous_name);
> +	strbuf_release(&previous_name_buf);
> +	return consumed;
> +}
> +
> +#ifdef NO_PTHREADS
> +
> +#define load_cache_entries load_all_cache_entries
> +
> +#else
> +
> +#include "thread-utils.h"
> +
> +/*
> +* Mostly randomly chosen maximum thread counts: we
> +* cap the parallelism to online_cpus() threads, and we want
> +* to have at least 7500 cache entries per thread for it to
> +* be worth starting a thread.
> +*/
> +#define THREAD_COST		(7500)
> +
> +struct load_cache_entries_thread_data
> +{
> +	pthread_t pthread;
> +	struct index_state *istate;
> +	struct mem_pool *ce_mem_pool;
> +	int offset, nr;
> +	void *mmap;
> +	unsigned long start_offset;
> +	struct strbuf previous_name_buf;
> +	struct strbuf *previous_name;
> +	unsigned long consumed;	/* return # of bytes in index file processed */
> +};
> +
> +/*
> +* A thread proc to run the load_cache_entries() computation
> +* across multiple background threads.
> +*/
> +static void *load_cache_entries_thread(void *_data)
> +{
> +	struct load_cache_entries_thread_data *p = _data;
> +
> +	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
> +	return NULL;
> +}

(following aloud) And the threaded version chews the block of ce's
given to each thread.  Makes sense.

> +static unsigned long load_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	struct load_cache_entries_thread_data *data;
> +	int threads, cpus, thread_nr;
> +	unsigned long consumed;
> +	int i, thread;
> +
> +	cpus = online_cpus();
> +	threads = istate->cache_nr / THREAD_COST;
> +	if (threads > cpus)
> +		threads = cpus;

No other caller of online_cpus() is prepared to deal with faulty
return from the function (e.g. 0 or negative), so it is perfectly
fine for this caller to trust it would return at least 1.  OK.

Not using min() and it still is very readable ;-).

> +	/* enable testing with fewer than default minimum of entries */
> +	if ((istate->cache_nr > 1) && (threads < 2) && getenv("GIT_FASTINDEX_TEST"))
> +		threads = 2;

Another good place to use git_env_bool().

> +	if (threads < 2 || !git_config_get_fast_index())
> +		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);

config_get_fast_index() can return -1 to signal "no strong
preference either way".  A caller that negates the value without
paying special attention to negative return makes the reader wonder
if the code is buggy or actively interpreting "do not care" as "I do
not mind if you use it" (it is the latter in this case).

I actually think git_config_get_fast_index() is a helper that does a
bit too little.  Perhaps the above two if() statements can be
combined into a single call to

	threads = use_fast_index(istate);
	if (threads < 2)
		return load_all_cache_entries(...);

and let it call online_cpus(), determination of thread-count taking
THREADS_COST into account, and also reading the configuration
variable?  The configuration variable might even want to say how
many threads it wants to cap us at maximum in the future.

> +	mem_pool_init(&istate->ce_mem_pool, 0);
> +	if (istate->version == 4)
> +		previous_name = &previous_name_buf;
> +	else
> +		previous_name = NULL;
> +
> +	thread_nr = (istate->cache_nr + threads - 1) / threads;

(following aloud) threads is the number of threads that we are going
to spawn.  thread_nr is not any number about threads---it is number
of cache entries each thread will work on.  The latter is
confusingly named.

ce_per_thread perhaps?

As the division is rounded up, among "threads" threads, we know we
will cover all "cache_nr" cache entries.  The last thread may handle
fewer than "thread_nr" entries, or even just a single entry in the
worst case.

When cache_nr == 1 and FASTINDEX_TEST tells us to use threads == 2,
then thread_nr = (1 + 2 - 1) / 2 = 1.

The first one in the loop is given (offset, nr) = (0, 1) in the loop
The second one is given (offset, nr) = (1, 0) in the loop.  Two
questions come to mind:

 - Is load_cache_entries_thread() prepared to be given offset that
   is beyond the end of istate->cache[] and become a no-op?

 - Does the next loop even terminate without running beyond the end
   of istate->cache[]?

> +	data = xcalloc(threads, sizeof(struct load_cache_entries_thread_data));
> +
> +	/* loop through index entries starting a thread for every thread_nr entries */
> +	consumed = thread = 0;
> +	for (i = 0; ; i++) {

Uncapped for() loop makes readers a bit nervous.
An extra "i < istate->cache_nr" would not hurt, perhaps?

> +		struct ondisk_cache_entry *ondisk;
> +		const char *name;
> +		unsigned int flags;
> +
> +		/* we've reached the begining of a block of cache entries, kick off a thread to process them */
> +		if (0 == i % thread_nr) {
> +			struct load_cache_entries_thread_data *p = &data[thread];
> +
> +			p->istate = istate;
> +			p->offset = i;
> +			p->nr = min(thread_nr, istate->cache_nr - i);

(following aloud) p->nr is the number of entries this thread will
work on.

> +			/* create a mem_pool for each thread */
> +			if (istate->version == 4)
> +				mem_pool_init(&p->ce_mem_pool,
> +						  estimate_cache_size_from_compressed(p->nr));
> +			else
> +				mem_pool_init(&p->ce_mem_pool,
> +						  estimate_cache_size(mmap_size, p->nr));
> +
> +			p->mmap = mmap;
> +			p->start_offset = src_offset;
> +			if (previous_name) {
> +				strbuf_addbuf(&p->previous_name_buf, previous_name);
> +				p->previous_name = &p->previous_name_buf;
> +			}
> +
> +			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
> +				die("unable to create load_cache_entries_thread");
> +			if (++thread == threads || p->nr != thread_nr)
> +				break;
> +		}
> +
> +		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +
> +		/* On-disk flags are just 16 bits */
> +		flags = get_be16(&ondisk->flags);
> +
> +		if (flags & CE_EXTENDED) {
> +			struct ondisk_cache_entry_extended *ondisk2;
> +			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
> +			name = ondisk2->name;
> +		} else
> +			name = ondisk->name;
> +
> +		if (!previous_name) {
> +			size_t len;
> +
> +			/* v3 and earlier */
> +			len = flags & CE_NAMEMASK;
> +			if (len == CE_NAMEMASK)
> +				len = strlen(name);
> +			src_offset += (flags & CE_EXTENDED) ?
> +				ondisk_cache_entry_extended_size(len) :
> +				ondisk_cache_entry_size(len);
> +		} else
> +			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);

Nice to see this done without a new index extension that records
offsets, so that we can load existing index files in parallel.

> +	}
> +
> +	for (i = 0; i < threads; i++) {
> +		struct load_cache_entries_thread_data *p = data + i;
> +		if (pthread_join(p->pthread, NULL))
> +			die("unable to join load_cache_entries_thread");
> +		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
> +		strbuf_release(&p->previous_name_buf);
> +		consumed += p->consumed;
> +	}
> +
> +	free(data);
> +	strbuf_release(&previous_name_buf);
> +
> +	return consumed;
> +}
> +
> +#endif

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 17:31 ` Stefan Beller
@ 2018-08-23 19:44   ` Ben Peart
  2018-08-24 18:40   ` Duy Nguyen
  1 sibling, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-23 19:44 UTC (permalink / raw)
  To: Stefan Beller, Ben Peart; +Cc: git, Junio C Hamano



On 8/23/2018 1:31 PM, Stefan Beller wrote:
> On Thu, Aug 23, 2018 at 8:45 AM Ben Peart <Ben.Peart@microsoft.com> wrote:
>>
>> This patch helps address the CPU cost of loading the index by creating
>> multiple threads to divide the work of loading and converting the cache
>> entries across all available CPU cores.
>>
>> It accomplishes this by having the primary thread loop across the index file
>> tracking the offset and (for V4 indexes) expanding the name. It creates a
>> thread to process each block of entries as it comes to them. Once the
>> threads are complete and the cache entries are loaded, the rest of the
>> extensions can be loaded and processed normally on the primary thread.
>>
>> Performance impact:
>>
>> read cache .git/index times on a synthetic repo with:
>>
>> 100,000 entries
>> FALSE       TRUE        Savings     %Savings
>> 0.014798767 0.009580433 0.005218333 35.26%
>>
>> 1,000,000 entries
>> FALSE       TRUE        Savings     %Savings
>> 0.240896533 0.1751243   0.065772233 27.30%
>>
>> read cache .git/index times on an actual repo with:
>>
>> ~3M entries
>> FALSE       TRUE        Savings     %Savings
>> 0.59898098  0.4513169   0.14766408  24.65%
>>
>> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
>> ---
>>
>> Notes:
>>      Base Ref: master
>>      Web-Diff: https://github.com/benpeart/git/commit/67a700419b
>>      Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v1 && git checkout 67a700419b
>>
>>   Documentation/config.txt |   8 ++
>>   config.c                 |  13 +++
>>   config.h                 |   1 +
>>   read-cache.c             | 218 ++++++++++++++++++++++++++++++++++-----
>>   4 files changed, 216 insertions(+), 24 deletions(-)
>>
>> diff --git a/Documentation/config.txt b/Documentation/config.txt
>> index 1c42364988..3344685cc4 100644
>> --- a/Documentation/config.txt
>> +++ b/Documentation/config.txt
>> @@ -899,6 +899,14 @@ relatively high IO latencies.  When enabled, Git will do the
>>   index comparison to the filesystem data in parallel, allowing
>>   overlapping IO's.  Defaults to true.
>>
>> +core.fastIndex::
>> +       Enable parallel index loading
>> ++
>> +This can speed up operations like 'git diff' and 'git status' especially
>> +when the index is very large.  When enabled, Git will do the index
>> +loading from the on disk format to the in-memory format in parallel.
>> +Defaults to true.
> 
> "fast" is a non-descriptive word as we try to be fast in any operation?
> Maybe core.parallelIndexReading as that just describes what it
> turns on/off, without second guessing its effects?
> (Are there still computers with just a single CPU, where this would not
> make it faster? ;-))
> 

How about core.parallelReadIndex?  Slightly shorter and matches the 
function names better.

> 
>> +int git_config_get_fast_index(void)
>> +{
>> +       int val;
>> +
>> +       if (!git_config_get_maybe_bool("core.fastindex", &val))
>> +               return val;
>> +
>> +       if (getenv("GIT_FASTINDEX_TEST"))
>> +               return 1;
> 
> We look at this env value just before calling this function;
> can we write it to only look at the env variable once?
> 

Sure, I didn't like the fact that it was called twice but didn't get 
around to cleaning it up.

>> +++ b/config.h
>> @@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
>>   extern int git_config_get_split_index(void);
>>   extern int git_config_get_max_percent_split_change(void);
>>   extern int git_config_get_fsmonitor(void);
>> +extern int git_config_get_fast_index(void);
> 
> Oh. nd/no-extern did not cover config.h
> 
> 
>>
>> +#ifndef min
>> +#define min(a,b) (((a) < (b)) ? (a) : (b))
>> +#endif
> 
> We do not have a minimum function in the tree,
> except for xdiff/xmacros.h:29: XDL_MIN.
> I wonder what the rationale is for not having a MIN()
> definition, I think we discussed that on the list a couple
> times but the rationale escaped me.
> 
> If we introduce a min/max macro, can we put it somewhere
> more prominent? (I would find it useful elsewhere)
>

I'll avoid that particular rabbit hole and just remove the min macro 
definition.  ;-)

>> +/*
>> +* Mostly randomly chosen maximum thread counts: we
>> +* cap the parallelism to online_cpus() threads, and we want
>> +* to have at least 7500 cache entries per thread for it to
>> +* be worth starting a thread.
>> +*/
>> +#define THREAD_COST            (7500)
> 
> This reads very similar to preload-index.c THREAD_COST
> 
>> +       /* loop through index entries starting a thread for every thread_nr entries */
>> +       consumed = thread = 0;
>> +       for (i = 0; ; i++) {
>> +               struct ondisk_cache_entry *ondisk;
>> +               const char *name;
>> +               unsigned int flags;
>> +
>> +               /* we've reached the begining of a block of cache entries, kick off a thread to process them */
> 
> beginning
> 

Thanks

> Thanks,
> Stefan
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 18:06 ` Junio C Hamano
@ 2018-08-23 20:33   ` Ben Peart
  2018-08-24 15:37     ` Duy Nguyen
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-08-23 20:33 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git



On 8/23/2018 2:06 PM, Junio C Hamano wrote:
> Ben Peart <Ben.Peart@microsoft.com> writes:
> 
>> This patch helps address the CPU cost of loading the index by creating
>> multiple threads to divide the work of loading and converting the cache
>> entries across all available CPU cores.
> 
> Nice.
> 
>> +int git_config_get_fast_index(void)
>> +{
>> +	int val;
>> +
>> +	if (!git_config_get_maybe_bool("core.fastindex", &val))
>> +		return val;
>> +
>> +	if (getenv("GIT_FASTINDEX_TEST"))
>> +		return 1;
> 
> It probably makes sense to use git_env_bool() to be consistent,
> which allows GIT_FASTINDEX_TEST=0 to turn it off after this becomes
> the default.
> 
>> diff --git a/read-cache.c b/read-cache.c
>> index 7b1354d759..0fa7e1a04c 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -24,6 +24,10 @@
>>   #include "utf8.h"
>>   #include "fsmonitor.h"
>>   
>> +#ifndef min
>> +#define min(a,b) (((a) < (b)) ? (a) : (b))
>> +#endif
> 
> Let's lose this, which is used only once, even though it could be
> used elsewhere but not used (e.g. threads vs cpus near the beginning
> of load_cache_entries()).
> 

I didn't have it, then added it to make it trivial to see what was 
actually happening.  I can switch back.

>> +static unsigned long load_cache_entry_block(struct index_state *istate, struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap, unsigned long start_offset, struct strbuf *previous_name)
> 
> Wrap and possibly add comment before the function to describe what
> it does and what its parameters mean?
> 
>> +{
>> +	int i;
>> +	unsigned long src_offset = start_offset;
>> +
>> +	for (i = offset; i < offset + nr; i++) {
>> +		struct ondisk_cache_entry *disk_ce;
>> +		struct cache_entry *ce;
>> +		unsigned long consumed;
>> +
>> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
>> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
>> +		set_index_entry(istate, i, ce);
>> +
>> +		src_offset += consumed;
>> +	}
>> +	return src_offset - start_offset;
>> +}
> 
> OK.
> 
>> +static unsigned long load_all_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
>> +{
> 
> (following aloud) This "all" variant is "one thread does all", iow,
> unthreaded version.  Makes sense.
> 
>> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>> +	unsigned long consumed;
>> +
>> +	if (istate->version == 4) {
>> +		previous_name = &previous_name_buf;
>> +		mem_pool_init(&istate->ce_mem_pool,
>> +			      estimate_cache_size_from_compressed(istate->cache_nr));
>> +	} else {
>> +		previous_name = NULL;
>> +		mem_pool_init(&istate->ce_mem_pool,
>> +			      estimate_cache_size(mmap_size, istate->cache_nr));
>> +	}
> 
> I count there are three instances of "if version 4 use the strbuf
> for name-buf, otherwise..." in this patch, which made me wonder if
> we can make them shared more and/or if it makes sense to attempt to
> do so.
> 

Actually, they are all different and all required.  One sets it up for 
the "do it all on one thread" path.  One sets it up for each thread. The 
last one is used by the primary thread when scanning for blocks to hand 
off to the child threads.

>> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool, 0, istate->cache_nr, mmap, src_offset, previous_name);
>> +	strbuf_release(&previous_name_buf);
>> +	return consumed;
>> +}
>> +
>> +#ifdef NO_PTHREADS
>> +
>> +#define load_cache_entries load_all_cache_entries
>> +
>> +#else
>> +
>> +#include "thread-utils.h"
>> +
>> +/*
>> +* Mostly randomly chosen maximum thread counts: we
>> +* cap the parallelism to online_cpus() threads, and we want
>> +* to have at least 7500 cache entries per thread for it to
>> +* be worth starting a thread.
>> +*/
>> +#define THREAD_COST		(7500)
>> +
>> +struct load_cache_entries_thread_data
>> +{
>> +	pthread_t pthread;
>> +	struct index_state *istate;
>> +	struct mem_pool *ce_mem_pool;
>> +	int offset, nr;
>> +	void *mmap;
>> +	unsigned long start_offset;
>> +	struct strbuf previous_name_buf;
>> +	struct strbuf *previous_name;
>> +	unsigned long consumed;	/* return # of bytes in index file processed */
>> +};
>> +
>> +/*
>> +* A thread proc to run the load_cache_entries() computation
>> +* across multiple background threads.
>> +*/
>> +static void *load_cache_entries_thread(void *_data)
>> +{
>> +	struct load_cache_entries_thread_data *p = _data;
>> +
>> +	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
>> +	return NULL;
>> +}
> 
> (following aloud) And the threaded version chews the block of ce's
> given to each thread.  Makes sense.
> 
>> +static unsigned long load_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
>> +{
>> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>> +	struct load_cache_entries_thread_data *data;
>> +	int threads, cpus, thread_nr;
>> +	unsigned long consumed;
>> +	int i, thread;
>> +
>> +	cpus = online_cpus();
>> +	threads = istate->cache_nr / THREAD_COST;
>> +	if (threads > cpus)
>> +		threads = cpus;
> 
> No other caller of online_cpus() is prepared to deal with faulty
> return from the function (e.g. 0 or negative), so it is perfectly
> fine for this caller to trust it would return at least 1.  OK.
> 
> Not using min() and it still is very readable ;-).
> 
>> +	/* enable testing with fewer than default minimum of entries */
>> +	if ((istate->cache_nr > 1) && (threads < 2) && getenv("GIT_FASTINDEX_TEST"))
>> +		threads = 2;
> 
> Another good place to use git_env_bool().
> 
>> +	if (threads < 2 || !git_config_get_fast_index())
>> +		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
> 
> config_get_fast_index() can return -1 to signal "no strong
> preference either way".  A caller that negates the value without
> paying special attention to negative return makes the reader wonder
> if the code is buggy or actively interpreting "do not care" as "I do
> not mind if you use it" (it is the latter in this case).
> 
> I actually think git_config_get_fast_index() is a helper that does a
> bit too little.  Perhaps the above two if() statements can be
> combined into a single call to
> 
> 	threads = use_fast_index(istate);
> 	if (threads < 2)
> 		return load_all_cache_entries(...);
> 
> and let it call online_cpus(), determination of thread-count taking
> THREADS_COST into account, and also reading the configuration
> variable?  The configuration variable might even want to say how
> many threads it wants to cap us at maximum in the future.
> 

I reworked this a bit.

git_config_get_parallel_read_index() still just deals with the config 
value (I had to read it this way as in some code paths, the global 
config settings in environment.c haven't been read yet).

All the logic about whether to use threads and how many to use is 
centralized here along with the environment variable to override the 
default behavior.

>> +	mem_pool_init(&istate->ce_mem_pool, 0);
>> +	if (istate->version == 4)
>> +		previous_name = &previous_name_buf;
>> +	else
>> +		previous_name = NULL;
>> +
>> +	thread_nr = (istate->cache_nr + threads - 1) / threads;
> 
> (following aloud) threads is the number of threads that we are going
> to spawn.  thread_nr is not any number about threads---it is number
> of cache entries each thread will work on.  The latter is
> confusingly named.
> 
> ce_per_thread perhaps?
> 

Sure

> As the division is rounded up, among "threads" threads, we know we
> will cover all "cache_nr" cache entries.  The last thread may handle
> fewer than "thread_nr" entries, or even just a single entry in the
> worst case.
> 

It's divided by the number of threads so will only be up to 1 less than 
the other threads.  Given the minimum # of entries per thread is 7500, 
you'd never end up with just a single entry (unless using the 
GIT_PARALLELREADINDEX_TEST override).

> When cache_nr == 1 and FASTINDEX_TEST tells us to use threads == 2,
> then thread_nr = (1 + 2 - 1) / 2 = 1.
> 
> The first one in the loop is given (offset, nr) = (0, 1) in the loop
> The second one is given (offset, nr) = (1, 0) in the loop.  Two
> questions come to mind:
> 
>   - Is load_cache_entries_thread() prepared to be given offset that
>     is beyond the end of istate->cache[] and become a no-op?
> 
>   - Does the next loop even terminate without running beyond the end
>     of istate->cache[]?
> 
>> +	data = xcalloc(threads, sizeof(struct load_cache_entries_thread_data));
>> +
>> +	/* loop through index entries starting a thread for every thread_nr entries */
>> +	consumed = thread = 0;
>> +	for (i = 0; ; i++) {
> 
> Uncapped for() loop makes readers a bit nervous.
> An extra "i < istate->cache_nr" would not hurt, perhaps?
> 

We don't need or want to run through _all_ the entries, only to the 
first entry of the last block.  I'd prefer to leave that extra test out 
as it implies that we are going to loop through them all. I'll add a 
comment to make it more obvious what is happening.

>> +		struct ondisk_cache_entry *ondisk;
>> +		const char *name;
>> +		unsigned int flags;
>> +
>> +		/* we've reached the begining of a block of cache entries, kick off a thread to process them */
>> +		if (0 == i % thread_nr) {
>> +			struct load_cache_entries_thread_data *p = &data[thread];
>> +
>> +			p->istate = istate;
>> +			p->offset = i;
>> +			p->nr = min(thread_nr, istate->cache_nr - i);
> 
> (following aloud) p->nr is the number of entries this thread will
> work on.
> 
>> +			/* create a mem_pool for each thread */
>> +			if (istate->version == 4)
>> +				mem_pool_init(&p->ce_mem_pool,
>> +						  estimate_cache_size_from_compressed(p->nr));
>> +			else
>> +				mem_pool_init(&p->ce_mem_pool,
>> +						  estimate_cache_size(mmap_size, p->nr));
>> +
>> +			p->mmap = mmap;
>> +			p->start_offset = src_offset;
>> +			if (previous_name) {
>> +				strbuf_addbuf(&p->previous_name_buf, previous_name);
>> +				p->previous_name = &p->previous_name_buf;
>> +			}
>> +
>> +			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
>> +				die("unable to create load_cache_entries_thread");
>> +			if (++thread == threads || p->nr != thread_nr)
>> +				break;
>> +		}
>> +
>> +		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
>> +
>> +		/* On-disk flags are just 16 bits */
>> +		flags = get_be16(&ondisk->flags);
>> +
>> +		if (flags & CE_EXTENDED) {
>> +			struct ondisk_cache_entry_extended *ondisk2;
>> +			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
>> +			name = ondisk2->name;
>> +		} else
>> +			name = ondisk->name;
>> +
>> +		if (!previous_name) {
>> +			size_t len;
>> +
>> +			/* v3 and earlier */
>> +			len = flags & CE_NAMEMASK;
>> +			if (len == CE_NAMEMASK)
>> +				len = strlen(name);
>> +			src_offset += (flags & CE_EXTENDED) ?
>> +				ondisk_cache_entry_extended_size(len) :
>> +				ondisk_cache_entry_size(len);
>> +		} else
>> +			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
> 
> Nice to see this done without a new index extension that records
> offsets, so that we can load existing index files in parallel.
> 

Yes, I prefer this simpler model as well.  I wasn't sure it would 
produce a significant improvement given the primary thread still has to 
run through the variable length cache entries but was pleasantly surprised.

The recent mem_pool changes really helped as well as it removed all 
thread contention in the heap that was happening before.

>> +	}
>> +
>> +	for (i = 0; i < threads; i++) {
>> +		struct load_cache_entries_thread_data *p = data + i;
>> +		if (pthread_join(p->pthread, NULL))
>> +			die("unable to join load_cache_entries_thread");
>> +		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
>> +		strbuf_release(&p->previous_name_buf);
>> +		consumed += p->consumed;
>> +	}
>> +
>> +	free(data);
>> +	strbuf_release(&previous_name_buf);
>> +
>> +	return consumed;
>> +}
>> +
>> +#endif

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 20:33   ` Ben Peart
@ 2018-08-24 15:37     ` Duy Nguyen
  2018-08-24 15:57       ` Duy Nguyen
  2018-08-24 18:20       ` [PATCH v1] read-cache: speed up index load through parallelization Duy Nguyen
  0 siblings, 2 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-08-24 15:37 UTC (permalink / raw)
  To: Ben Peart; +Cc: Junio C Hamano, Ben Peart, Git Mailing List

Since we're cutting corners to speed things up, could you try
something like this?

I notice that reading v4 is significantly slower than v2 and
apparently strlen() (at least from glibc) is much cleverer and at
least gives me a few percentage time saving.

diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..d10cccaed0 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1755,8 +1755,7 @@ static unsigned long expand_name_field(struct
strbuf *name, const char *cp_)
        if (name->len < len)
                die("malformed name field in the index");
        strbuf_remove(name, name->len - len, len);
-       for (ep = cp; *ep; ep++)
-               ; /* find the end */
+       ep = cp + strlen(cp);
        strbuf_add(name, cp, ep - cp);
        return (const char *)ep + 1 - cp_;
 }

On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
> > Nice to see this done without a new index extension that records
> > offsets, so that we can load existing index files in parallel.
> >
>
> Yes, I prefer this simpler model as well.  I wasn't sure it would
> produce a significant improvement given the primary thread still has to
> run through the variable length cache entries but was pleasantly surprised.

Out of curiosity, how much time saving could we gain by recording
offsets as an extension (I assume we need, like 4 offsets if the
system has 4 cores)? Much much more than this simpler model (which may
justify the complexity) or just "meh" compared to this?
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 15:37     ` Duy Nguyen
@ 2018-08-24 15:57       ` Duy Nguyen
  2018-08-24 17:28         ` Ben Peart
  2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
  2018-08-24 18:20       ` [PATCH v1] read-cache: speed up index load through parallelization Duy Nguyen
  1 sibling, 2 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-08-24 15:57 UTC (permalink / raw)
  To: Ben Peart; +Cc: Junio C Hamano, Ben Peart, Git Mailing List

On Fri, Aug 24, 2018 at 05:37:20PM +0200, Duy Nguyen wrote:
> Since we're cutting corners to speed things up, could you try
> something like this?
> 
> I notice that reading v4 is significantly slower than v2 and
> apparently strlen() (at least from glibc) is much cleverer and at
> least gives me a few percentage time saving.
> 
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..d10cccaed0 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1755,8 +1755,7 @@ static unsigned long expand_name_field(struct
> strbuf *name, const char *cp_)
>         if (name->len < len)
>                 die("malformed name field in the index");
>         strbuf_remove(name, name->len - len, len);
> -       for (ep = cp; *ep; ep++)
> -               ; /* find the end */
> +       ep = cp + strlen(cp);
>         strbuf_add(name, cp, ep - cp);
>         return (const char *)ep + 1 - cp_;
>  }

No, try this instead. It's half way back to v2 numbers for me (tested
with "test-tool read-cache 100" on webkit.git). For the record, v4 is
about 30% slower than v2 in my tests.

We could probably do better too. Instead of preparing the string in a
separate buffer (previous_name_buf), we could just assemble it directly
to the newly allocated "ce".

-- 8< --
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..237f60a76c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1754,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen(cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
-- 8< --

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 15:57       ` Duy Nguyen
@ 2018-08-24 17:28         ` Ben Peart
  2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
  1 sibling, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-24 17:28 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Ben Peart, Git Mailing List



On 8/24/2018 11:57 AM, Duy Nguyen wrote:
> On Fri, Aug 24, 2018 at 05:37:20PM +0200, Duy Nguyen wrote:
>> Since we're cutting corners to speed things up, could you try
>> something like this?
>>
>> I notice that reading v4 is significantly slower than v2 and
>> apparently strlen() (at least from glibc) is much cleverer and at
>> least gives me a few percentage time saving.
>>
>> diff --git a/read-cache.c b/read-cache.c
>> index 7b1354d759..d10cccaed0 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -1755,8 +1755,7 @@ static unsigned long expand_name_field(struct
>> strbuf *name, const char *cp_)
>>          if (name->len < len)
>>                  die("malformed name field in the index");
>>          strbuf_remove(name, name->len - len, len);
>> -       for (ep = cp; *ep; ep++)
>> -               ; /* find the end */
>> +       ep = cp + strlen(cp);
>>          strbuf_add(name, cp, ep - cp);
>>          return (const char *)ep + 1 - cp_;
>>   }
> 
> No try this instead. It's half way back to v2 numbers for me (tested
> with "test-tool read-cache 100" on webkit.git). For the record, v4 is
> about 30% slower than v2 in my tests.
> 

Thanks Duy, this helped on my system as well.

Interestingly, simply reading the cache tree extension in read_one() now 
takes about double the CPU on the primary thread as does 
load_cache_entries().

Hmm, that gives me an idea.  I could kick off another thread to load 
that extension in parallel and cut off another ~160 ms.  I'll add that 
to my list of future patches to investigate...

> We could probably do better too. Instead of preparing the string in a
> separate buffer (previous_name_buf), we could just assemble it directly
> to the newly allocated "ce".
> 
> -- 8< --
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..237f60a76c 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1754,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
>   
>   	if (name->len < len)
>   		die("malformed name field in the index");
> -	strbuf_remove(name, name->len - len, len);
> -	for (ep = cp; *ep; ep++)
> -		; /* find the end */
> +	strbuf_setlen(name, name->len - len);
> +	ep = cp + strlen(cp);
>   	strbuf_add(name, cp, ep - cp);
>   	return (const char *)ep + 1 - cp_;
>   }
> -- 8< --
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 15:37     ` Duy Nguyen
  2018-08-24 15:57       ` Duy Nguyen
@ 2018-08-24 18:20       ` Duy Nguyen
  2018-08-24 18:40         ` Ben Peart
  1 sibling, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-08-24 18:20 UTC (permalink / raw)
  To: Ben Peart; +Cc: Junio C Hamano, Ben Peart, Git Mailing List

On Fri, Aug 24, 2018 at 5:37 PM Duy Nguyen <pclouds@gmail.com> wrote:
> On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
> > > Nice to see this done without a new index extension that records
> > > offsets, so that we can load existing index files in parallel.
> > >
> >
> > Yes, I prefer this simpler model as well.  I wasn't sure it would
> > produce a significant improvement given the primary thread still has to
> > run through the variable length cache entries but was pleasantly surprised.
>
> Out of curiosity, how much time saving could we gain by recording
> offsets as an extension (I assume we need, like 4 offsets if the
> system has 4 cores)? Much much more than this simpler model (which may
> justify the complexity) or just "meh" compared to this?

To answer my own question, I ran a patched git to precalculate
individual thread parameters, removed the scheduler code and hard
coded these parameters (I ran just 4 threads, one per core). I got
0m2.949s (webkit.git, 275k files, 100 read-cache runs). Compared to
0m4.996s from Ben's patch (same test settings of course) I think it's
definitely worth adding some extra complexity.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 17:31 ` Stefan Beller
  2018-08-23 19:44   ` Ben Peart
@ 2018-08-24 18:40   ` Duy Nguyen
  2018-08-28 14:53     ` Ben Peart
  1 sibling, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-08-24 18:40 UTC (permalink / raw)
  To: Stefan Beller; +Cc: Ben Peart, Git Mailing List, Junio C Hamano

On Thu, Aug 23, 2018 at 7:33 PM Stefan Beller <sbeller@google.com> wrote:
> > +core.fastIndex::
> > +       Enable parallel index loading
> > ++
> > +This can speed up operations like 'git diff' and 'git status' especially
> > +when the index is very large.  When enabled, Git will do the index
> > +loading from the on disk format to the in-memory format in parallel.
> > +Defaults to true.
>
> "fast" is a non-descriptive word as we try to be fast in any operation?
> Maybe core.parallelIndexReading as that just describes what it
> turns on/off, without second guessing its effects?

Another option is index.threads (the "index" section currently only
has one item, index.version). The value could be the same as
grep.threads or pack.threads.

(and if you're thinking about parallelizing write as well but it
should be tuned differently, then perhaps index.readThreads, but I
don't think we need to go that far)
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 18:20       ` [PATCH v1] read-cache: speed up index load through parallelization Duy Nguyen
@ 2018-08-24 18:40         ` Ben Peart
  2018-08-24 19:00           ` Duy Nguyen
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-08-24 18:40 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Ben Peart, Git Mailing List



On 8/24/2018 2:20 PM, Duy Nguyen wrote:
> On Fri, Aug 24, 2018 at 5:37 PM Duy Nguyen <pclouds@gmail.com> wrote:
>> On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
>>>> Nice to see this done without a new index extension that records
>>>> offsets, so that we can load existing index files in parallel.
>>>>
>>>
>>> Yes, I prefer this simpler model as well.  I wasn't sure it would
>>> produce a significant improvement given the primary thread still has to
>>> run through the variable length cache entries but was pleasantly surprised.
>>
>> Out of curiosity, how much time saving could we gain by recording
>> offsets as an extension (I assume we need, like 4 offsets if the
>> system has 4 cores)? Much much more than this simpler model (which may
>> justify the complexity) or just "meh" compared to this?
> 
> To answer my own question, I ran a patched git to precalculate
> individual thread parameters, removed the scheduler code and hard
> coded these parameters (I ran just 4 threads, one per core). I got
> 0m2.949s (webkit.git, 275k files, 100 read-cache runs). Compared to
> 0m4.996s from Ben's patch (same test settings of course) I think it's
> definitely worth adding some extra complexity.
> 

I took a run at doing that last year [1] but that was before the 
mem_pool work that allowed us to avoid the thread contention on the heap 
so the numbers aren't an apples to apples comparison (they would be 
better today).

The trade-off is the additional complexity to be able to load the index 
extension without having to parse through all the variable length cache 
entries.  My patch worked but there was feedback requested to make it 
more generic and robust that I haven't gotten around to yet.

This patch series went for simplicity over absolutely the best possible 
performance.

[1] 
https://public-inbox.org/git/20171109141737.47976-1-benpeart@microsoft.com/

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 18:40         ` Ben Peart
@ 2018-08-24 19:00           ` Duy Nguyen
  2018-08-24 19:57             ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-08-24 19:00 UTC (permalink / raw)
  To: Ben Peart; +Cc: Junio C Hamano, Ben Peart, Git Mailing List

On Fri, Aug 24, 2018 at 8:40 PM Ben Peart <peartben@gmail.com> wrote:
>
>
>
> On 8/24/2018 2:20 PM, Duy Nguyen wrote:
> > On Fri, Aug 24, 2018 at 5:37 PM Duy Nguyen <pclouds@gmail.com> wrote:
> >> On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
> >>>> Nice to see this done without a new index extension that records
> >>>> offsets, so that we can load existing index files in parallel.
> >>>>
> >>>
> >>> Yes, I prefer this simpler model as well.  I wasn't sure it would
> >>> produce a significant improvement given the primary thread still has to
> >>> run through the variable length cache entries but was pleasantly surprised.
> >>
> >> Out of curiosity, how much time saving could we gain by recording
> >> offsets as an extension (I assume we need, like 4 offsets if the
> >> system has 4 cores)? Much much more than this simpler model (which may
> >> justify the complexity) or just "meh" compared to this?
> >
> > To answer my own question, I ran a patched git to precalculate
> > individual thread parameters, removed the scheduler code and hard
> > coded these parameters (I ran just 4 threads, one per core). I got
> > 0m2.949s (webkit.git, 275k files, 100 read-cache runs). Compared to
> > 0m4.996s from Ben's patch (same test settings of course) I think it's
> > definitely worth adding some extra complexity.
> >
>
> I took a run at doing that last year [1] but that was before the
> mem_pool work that allowed us to avoid the thread contention on the heap
> so the numbers aren't an apples to apples comparison (they would be
> better today).

Ah.. sorry I was not aware. A big chunk of 2017 is blank to me when it
comes to git.

> The trade-off is the additional complexity to be able to load the index
> extension without having to parse through all the variable length cache
> entries.  My patch worked but there was feedback requested to make it
> more generic and robust that I haven't gotten around to yet.

One more comment. Instead of forcing this special index at the bottom,
add a generic one that gives positions of all extensions and put that
one at the bottom. Then you can still quickly locate your offset table
extension, and you could load UNTR and TREE extensions in parallel too
(those scale up to worktree size)

> This patch series went for simplicity over absolutely the best possible
> performance.

Well, you know my stance on this now :) Not that it really matters.

> [1]
> https://public-inbox.org/git/20171109141737.47976-1-benpeart@microsoft.com/

PS. I still think it's worth bringing v4's performance back to v2. It's
low hanging fruit because I'm pretty sure Junio did not add v4 code
with cpu performance in mind. It was about file size at that time and
cpu consumption was still dwarfed by hashing.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 19:00           ` Duy Nguyen
@ 2018-08-24 19:57             ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-24 19:57 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Ben Peart, Git Mailing List



On 8/24/2018 3:00 PM, Duy Nguyen wrote:
> On Fri, Aug 24, 2018 at 8:40 PM Ben Peart <peartben@gmail.com> wrote:
>>
>>
>>
>> On 8/24/2018 2:20 PM, Duy Nguyen wrote:
>>> On Fri, Aug 24, 2018 at 5:37 PM Duy Nguyen <pclouds@gmail.com> wrote:
>>>> On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
>>>>>> Nice to see this done without a new index extension that records
>>>>>> offsets, so that we can load existing index files in parallel.
>>>>>>
>>>>>
>>>>> Yes, I prefer this simpler model as well.  I wasn't sure it would
>>>>> produce a significant improvement given the primary thread still has to
>>>>> run through the variable length cache entries but was pleasantly surprised.
>>>>
>>>> Out of curiosity, how much time saving could we gain by recording
>>>> offsets as an extension (I assume we need, like 4 offsets if the
>>>> system has 4 cores)? Much much more than this simpler model (which may
>>>> justify the complexity) or just "meh" compared to this?
>>>
>>> To answer my own question, I ran a patched git to precalculate
>>> individual thread parameters, removed the scheduler code and hard
>>> coded these parameters (I ran just 4 threads, one per core). I got
>>> 0m2.949s (webkit.git, 275k files, 100 read-cache runs). Compared to
>>> 0m4.996s from Ben's patch (same test settings of course) I think it's
>>> definitely worth adding some extra complexity.
>>>
>>
>> I took a run at doing that last year [1] but that was before the
>> mem_pool work that allowed us to avoid the thread contention on the heap
>> so the numbers aren't an apples to apples comparison (they would be
>> better today).
> 
> Ah.. sorry I was not aware. A big chunk of 2017 is blank to me when it
> comes to git.
> 
>> The trade-off is the additional complexity to be able to load the index
>> extension without having to parse through all the variable length cache
>> entries.  My patch worked but there was feedback requested to make it
>> more generic and robust that I haven't gotten around to yet.
> 
> One more comment. Instead of forcing this special index at the bottom,
> add a generic one that gives positions of all extensions and put that
> one at the bottom. Then you can still quickly locate your offset table
> extension, and you could load UNTR and TREE extensions in parallel too
> (those scale up to worktree size)
> 

That is pretty much what Junio's feedback was and what I was referring 
to as making it "more generic."  The "more robust" was the request to 
add a SHA to the extension to ensure it wasn't corrupt and was a valid 
extension.

>> This patch series went for simplicity over absolutely the best possible
>> performance.
> 
> Well, you know my stance on this now :) Not that it really matters.
> 
>> [1]
>> https://public-inbox.org/git/20171109141737.47976-1-benpeart@microsoft.com/
> 
> PS. I still think it's worth bring v4's performance back to v2. It's
> low hanging fruit because I'm pretty sure Junio did not add v4 code
> with cpu performance in mind. It was about file size at that time and
> cpu consumption was still dwarfed by hashing.
> 

I see that as a nice follow up patch.  If the extension exists, use it 
and jump directly to the blocks and spin up threads.  If it doesn't 
exist, fall back to the code in this patch that has to find/compute the 
blocks on the fly.


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH] read-cache.c: optimize reading index format v4
  2018-08-24 15:57       ` Duy Nguyen
  2018-08-24 17:28         ` Ben Peart
@ 2018-08-25  6:44         ` Nguyễn Thái Ngọc Duy
  2018-08-27 19:36           ` Junio C Hamano
  2018-09-02 13:19           ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
  1 sibling, 2 replies; 153+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-08-25  6:44 UTC (permalink / raw)
  To: pclouds; +Cc: Ben.Peart, git, gitster, peartben

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in 
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

PS. I notice that v4 does not pad to align entries at 4 byte boundary
like v2/v3. This could cause a slight slow down on x86 and segfault on
some other platforms. We need to fix this in v5 when we introduce
SHA-256 support in the index.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 124 +++++++++++++++++++++++----------------------------
 1 file changed, 56 insertions(+), 68 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..5c04c8f200 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,63 +1713,16 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
-/*
- * Adjacent cache entries tend to share the leading paths, so it makes
- * sense to only store the differences in later entries.  In the v4
- * on-disk format of the index, each on-disk cache entry stores the
- * number of bytes to be stripped from the end of the previous name,
- * and the bytes to append to the result, to come up with its name.
- */
-static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
-{
-	const unsigned char *ep, *cp = (const unsigned char *)cp_;
-	size_t len = decode_varint(&cp);
-
-	if (name->len < len)
-		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
-	strbuf_add(name, cp, ep - cp);
-	return (const char *)ep + 1 - cp_;
-}
-
 static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t strip_len;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1782,28 +1735,61 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 		extended_flags = get_be16(&ondisk2->flags2) << 16;
 		/* We do not yet understand any bit out of CE_EXTENDED_FLAGS */
 		if (extended_flags & ~CE_EXTENDED_FLAGS)
-			die("Unknown index entry format %08x", extended_flags);
+			die(_("unknown index entry format %08x"), extended_flags);
 		flags |= extended_flags;
 		name = ondisk2->name;
 	}
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	if (previous_ce) {
+		const unsigned char *cp = (const unsigned char *)name;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		strip_len = decode_varint(&cp);
+		if (previous_ce->ce_namelen < strip_len)
+			die(_("malformed name field in the index, path '%s'"),
+			    previous_ce->name);
+		name = (const char *)cp;
+	}
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (previous_ce)
+			len += previous_ce->ce_namelen - strip_len;
+	}
+
+	ce = mem_pool__ce_alloc(mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
+
+	if (previous_ce) {
+		size_t copy_len = previous_ce->ce_namelen - strip_len;
+		memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1898,7 +1884,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	const struct cache_entry *previous_ce = NULL;
+	struct cache_entry *dummy_entry = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,11 +1923,10 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
+		previous_ce = dummy_entry = make_empty_transient_cache_entry(0);
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
@@ -1952,12 +1938,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		if (previous_ce)
+			previous_ce = ce;
 	}
-	strbuf_release(&previous_name_buf);
+	free(dummy_entry);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.19.0.rc0.337.ge906d732e7


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
@ 2018-08-27 19:36           ` Junio C Hamano
  2018-08-28 19:25             ` Duy Nguyen
  2018-09-04 16:08             ` Duy Nguyen
  2018-09-02 13:19           ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
  1 sibling, 2 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-08-27 19:36 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: Ben.Peart, git, peartben

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Running "test-tool read-cache 100" on webkit.git (275k files), reading
> v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
> time. The patch reduces read time on v4 to 4.319 seconds.

Nice.

> PS. I notice that v4 does not pad to align entries at 4 byte boundary
> like v2/v3. This could cause a slight slow down on x86 and segfault on
> some other platforms.

Care to elaborate?  

Long time ago, we used to mmap and read directly from the index file
contents, requiring either an unaligned read or padded entries.  But
that was eons ago and we first read and convert from on-disk using
get_be32() etc. to in-core structure, so I am not sure what you mean
by "segfault" here.

> @@ -1782,28 +1735,61 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
>  		extended_flags = get_be16(&ondisk2->flags2) << 16;
>  		/* We do not yet understand any bit out of CE_EXTENDED_FLAGS */
>  		if (extended_flags & ~CE_EXTENDED_FLAGS)
> -			die("Unknown index entry format %08x", extended_flags);
> +			die(_("unknown index entry format %08x"), extended_flags);

Do this as a separate preparation patch that is not controversial
and can sail through without waiting for the rest of this patch.

In other words, don't slip in unreleted changes.

> -	if (!previous_name) {
> -		/* v3 and earlier */
> -		if (len == CE_NAMEMASK)
> -			len = strlen(name);
> -		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
> +	/*
> +	 * Adjacent cache entries tend to share the leading paths, so it makes
> +	 * sense to only store the differences in later entries.  In the v4
> +	 * on-disk format of the index, each on-disk cache entry stores the
> +	 * number of bytes to be stripped from the end of the previous name,
> +	 * and the bytes to append to the result, to come up with its name.
> +	 */
> +	if (previous_ce) {
> +		const unsigned char *cp = (const unsigned char *)name;
>  
> -		*ent_size = ondisk_ce_size(ce);
> -	} else {
> -		unsigned long consumed;
> -		consumed = expand_name_field(previous_name, name);
> -		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
> -					     previous_name->buf,
> -					     previous_name->len);
> +		strip_len = decode_varint(&cp);
> +		if (previous_ce->ce_namelen < strip_len)
> +			die(_("malformed name field in the index, path '%s'"),
> +			    previous_ce->name);

The message is misleading; the previous is not the problematic one,
but the one that comes after it is.  Perhaps s/, path/, near path/
or something.

> +		name = (const char *)cp;
> +	}
>  
> -		*ent_size = (name - ((char *)ondisk)) + consumed;
> +	if (len == CE_NAMEMASK) {
> +		len = strlen(name);
> +		if (previous_ce)
> +			len += previous_ce->ce_namelen - strip_len;

Nicely done.  If the result fits in that 12-bit truncated name, then
it is full so we do not need to adjust for strip.  Otherwise, we
know the length of this name is the sum of the part that is shared
with the previous one and the part that is unique to this one.

> +	}
> +
> +	ce = mem_pool__ce_alloc(mem_pool, len);
> +	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
> +	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
> +	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
> +	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
> +	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
> +	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
> +	ce->ce_mode  = get_be32(&ondisk->mode);
> +	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
> +	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
> +	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
> +	ce->ce_flags = flags & ~CE_NAMEMASK;
> +	ce->ce_namelen = len;
> +	ce->index = 0;
> +	hashcpy(ce->oid.hash, ondisk->sha1);

Again, nice.  Now two callsites (both in this function) that call
cache_entry_from_ondisk() with slightly different parameters are
unified, there is no strong reason to have it as a single caller
helper function.

> @@ -1898,7 +1884,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  	struct cache_header *hdr;
>  	void *mmap;
>  	size_t mmap_size;
> -	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	const struct cache_entry *previous_ce = NULL;
> +	struct cache_entry *dummy_entry = NULL;
>  
>  	if (istate->initialized)
>  		return istate->cache_nr;
> @@ -1936,11 +1923,10 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  	istate->initialized = 1;
>  
>  	if (istate->version == 4) {
> -		previous_name = &previous_name_buf;
> +		previous_ce = dummy_entry = make_empty_transient_cache_entry(0);

I do like the idea of passing the previous ce around to tell the
next one what the previous name was, but I would have preferred to
see this done a bit more cleanly without requiring us to support "a
dummy entry with name whose length is 0"; a real cache entry never
has zero-length name, and our code may want to enforce it as a
sanity check.

I think we can just call create_from_disk() with NULL set to
previous_ce in the first round; of course, the logic to assign the
one we just created to previous_ce must check istate->version,
instead of "is previous_ce NULL?" (which is an indirect way to check
the same thing used in this patch).

Other than that, looks quite nice.


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 18:40   ` Duy Nguyen
@ 2018-08-28 14:53     ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-28 14:53 UTC (permalink / raw)
  To: Duy Nguyen, Stefan Beller; +Cc: Ben Peart, Git Mailing List, Junio C Hamano



On 8/24/2018 2:40 PM, Duy Nguyen wrote:
> On Thu, Aug 23, 2018 at 7:33 PM Stefan Beller <sbeller@google.com> wrote:
>>> +core.fastIndex::
>>> +       Enable parallel index loading
>>> ++
>>> +This can speed up operations like 'git diff' and 'git status' especially
>>> +when the index is very large.  When enabled, Git will do the index
>>> +loading from the on disk format to the in-memory format in parallel.
>>> +Defaults to true.
>> "fast" is a non-descriptive word as we try to be fast in any operation?
>> Maybe core.parallelIndexReading as that just describes what it
>> turns on/off, without second guessing its effects?
> Another option is index.threads (the "index" section currently only
> has one item, index.version). The value could be the same as
> grep.threads or pack.threads.
>
> (and if you're thinking about parallelizing write as well but it
> should be tuned differently, then perhaps index.readThreads, but I
> don't think we need to go that far)

I like that.  I'll switch to index.threads and make 'true' or '0' mean 
"automatically determine the number of threads to use" similar to 
pack.threads.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-27 19:36           ` Junio C Hamano
@ 2018-08-28 19:25             ` Duy Nguyen
  2018-08-28 23:54               ` Ben Peart
  2018-08-29 17:14               ` Junio C Hamano
  2018-09-04 16:08             ` Duy Nguyen
  1 sibling, 2 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-08-28 19:25 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben Peart, Git Mailing List, Ben Peart

On Mon, Aug 27, 2018 at 9:36 PM Junio C Hamano <gitster@pobox.com> wrote:
> > PS. I notice that v4 does not pad to align entries at 4 byte boundary
> > like v2/v3. This could cause a slight slow down on x86 and segfault on
> > some other platforms.
>
> Care to elaborate?
>
> Long time ago, we used to mmap and read directly from the index file
> contents, requiring either an unaligned read or padded entries.  But
> that was eons ago and we first read and convert from on-disk using
> get_be32() etc. to in-core structure, so I am not sure what you mean
> by "segfault" here.
>

My bad. I saw this line

#define get_be16(p) ntohs(*(unsigned short *)(p))

and jumped to conclusion without realizing that block is for safe
unaligned access.

> > @@ -1898,7 +1884,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
> >       struct cache_header *hdr;
> >       void *mmap;
> >       size_t mmap_size;
> > -     struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> > +     const struct cache_entry *previous_ce = NULL;
> > +     struct cache_entry *dummy_entry = NULL;
> >
> >       if (istate->initialized)
> >               return istate->cache_nr;
> > @@ -1936,11 +1923,10 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
> >       istate->initialized = 1;
> >
> >       if (istate->version == 4) {
> > -             previous_name = &previous_name_buf;
> > +             previous_ce = dummy_entry = make_empty_transient_cache_entry(0);
>
> I do like the idea of passing the previous ce around to tell the
> next one what the previous name was, but I would have preferred to
> see this done a bit more cleanly without requiring us to support "a
> dummy entry with name whose length is 0"; a real cache entry never
> has zero-length name, and our code may want to enforce it as a
> sanity check.
>
> I think we can just call create_from_disk() with NULL set to
> previous_ce in the first round; of course, the logic to assign the
> one we just created to previous_ce must check istate->version,
> instead of "is previous_ce NULL?" (which is an indirect way to check
> the same thing used in this patch).

Yeah I kinda hated dummy_entry too but the feeling wasn't strong
enough to move towards the index->version check. I guess I'm going to
do it now.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-28 19:25             ` Duy Nguyen
@ 2018-08-28 23:54               ` Ben Peart
  2018-08-29 17:14               ` Junio C Hamano
  1 sibling, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-28 23:54 UTC (permalink / raw)
  To: Duy Nguyen, Junio C Hamano; +Cc: Ben Peart, Git Mailing List



On 8/28/2018 3:25 PM, Duy Nguyen wrote:
> On Mon, Aug 27, 2018 at 9:36 PM Junio C Hamano <gitster@pobox.com> wrote:
>>> PS. I notice that v4 does not pad to align entries at 4 byte boundary
>>> like v2/v3. This could cause a slight slow down on x86 and segfault on
>>> some other platforms.
>>
>> Care to elaborate?
>>
>> Long time ago, we used to mmap and read directly from the index file
>> contents, requiring either an unaligned read or padded entries.  But
>> that was eons ago and we first read and convert from on-disk using
>> get_be32() etc. to in-core structure, so I am not sure what you mean
>> by "segfault" here.
>>
> 
> My bad. I saw this line
> 
> #define get_be16(p) ntohs(*(unsigned short *)(p))
> 
> and jumped to conclusion without realizing that block is for safe
> unaligned access.
> 
>>> @@ -1898,7 +1884,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>>        struct cache_header *hdr;
>>>        void *mmap;
>>>        size_t mmap_size;
>>> -     struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>>> +     const struct cache_entry *previous_ce = NULL;
>>> +     struct cache_entry *dummy_entry = NULL;
>>>
>>>        if (istate->initialized)
>>>                return istate->cache_nr;
>>> @@ -1936,11 +1923,10 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>>        istate->initialized = 1;
>>>
>>>        if (istate->version == 4) {
>>> -             previous_name = &previous_name_buf;
>>> +             previous_ce = dummy_entry = make_empty_transient_cache_entry(0);
>>
>> I do like the idea of passing the previous ce around to tell the
>> next one what the previous name was, but I would have preferred to
>> see this done a bit more cleanly without requiring us to support "a
>> dummy entry with name whose length is 0"; a real cache entry never
>> has zero-length name, and our code may want to enforce it as a
>> sanity check.
>>
>> I think we can just call create_from_disk() with NULL set to
>> previous_ce in the first round; of course, the logic to assign the
>> one we just created to previous_ce must check istate->version,
>> instead of "is previous_ce NULL?" (which is an indirect way to check
>> the same thing used in this patch).
> 
> Yeah I kinda hated dummy_entry too but the feeling wasn't strong
> enough to move towards the index->version check. I guess I'm going to
> do it now.
> 

I ran some perf tests using p0002-read-cache.sh to compare V4 
performance before and after this patch so I could get a feel for how 
much it helps.

100,000 files

Test                                  HEAD~1   HEAD
------------------------------------------------------------
read_cache/discard_cache 1000 times    14.12    10.75 -23.9%

1,000,000 files

Test                                  HEAD~1   HEAD
------------------------------------------------------------
read_cache/discard_cache 1000 times   202.81   170.33 -16.0%


This provides a nice speedup and IMO simplifies the code as well. 
Nicely done.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v2 0/3] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
  2018-08-23 17:31 ` Stefan Beller
  2018-08-23 18:06 ` Junio C Hamano
@ 2018-08-29 15:25 ` " Ben Peart
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
                     ` (2 more replies)
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
                   ` (5 subsequent siblings)
  8 siblings, 3 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-29 15:25 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

The big changes in this iteration are:

- Switched to index.threads to provide control over the use of threading

- Added another worker thread to load the index extensions in parallel

- Applied optimization expand_name_field() suggested by Duy

The net result of these optimizations is a savings of 25.8% (1,000,000 files)
to 38.1% (100,000 files) as measured by p0002-read-cache.sh.

This patch conflicts with Duy's patch to remove the double memory copy and
pass in the previous ce instead.  The two will need to be merged/reconciled
once they settle down a bit.


Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/39f2b0f5fe
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v2 && git checkout 39f2b0f5fe


### Interdiff (v1..v2):

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 3344685cc4..79f8296d9c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -899,14 +899,6 @@ relatively high IO latencies.  When enabled, Git will do the
 index comparison to the filesystem data in parallel, allowing
 overlapping IO's.  Defaults to true.
 
-core.fastIndex::
-       Enable parallel index loading
-+
-This can speed up operations like 'git diff' and 'git status' especially
-when the index is very large.  When enabled, Git will do the index
-loading from the on disk format to the in-memory format in parallel.
-Defaults to true.
-
 core.createObject::
 	You can set this to 'link', in which case a hardlink followed by
 	a delete of the source are used to make sure that object creation
@@ -2399,6 +2391,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPU's and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 883092fdd3..3bda124550 100644
--- a/config.c
+++ b/config.c
@@ -2289,17 +2289,18 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
-int git_config_get_fast_index(void)
+int git_config_get_index_threads(void)
 {
-	int val;
+	int is_bool, val;
 
-	if (!git_config_get_maybe_bool("core.fastindex", &val))
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
 			return val;
+	}
 
-	if (getenv("GIT_FASTINDEX_TEST"))
-		return 1;
-
-	return -1; /* default value */
+	return 0; /* auto-detect */
 }
 
 NORETURN
diff --git a/config.h b/config.h
index 74ca4e7db5..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,7 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
-extern int git_config_get_fast_index(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 0fa7e1a04c..f5e7c86c42 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -24,10 +24,6 @@
 #include "utf8.h"
 #include "fsmonitor.h"
 
-#ifndef min
-#define min(a,b) (((a) < (b)) ? (a) : (b))
-#endif
-
 /* Mask for the name length in ce_flags in the on-disk index */
 
 #define CE_NAMEMASK  (0x0fff)
@@ -1758,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
@@ -1893,7 +1888,13 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
-static unsigned long load_cache_entry_block(struct index_state *istate, struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap, unsigned long start_offset, struct strbuf *previous_name)
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
 {
 	int i;
 	unsigned long src_offset = start_offset;
@@ -1912,7 +1913,8 @@ static unsigned long load_cache_entry_block(struct index_state *istate, struct m
 	return src_offset - start_offset;
 }
 
-static unsigned long load_all_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	unsigned long consumed;
@@ -1927,7 +1929,8 @@ static unsigned long load_all_cache_entries(struct index_state *istate, void *mm
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
 
-	consumed = load_cache_entry_block(istate, istate->ce_mem_pool, 0, istate->cache_nr, mmap, src_offset, previous_name);
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
 	strbuf_release(&previous_name_buf);
 	return consumed;
 }
@@ -1955,67 +1958,110 @@ struct load_cache_entries_thread_data
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
 	void *mmap;
+	size_t mmap_size;
 	unsigned long start_offset;
 	struct strbuf previous_name_buf;
 	struct strbuf *previous_name;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
-/*
-* A thread proc to run the load_cache_entries() computation
-* across multiple background threads.
-*/
 static void *load_cache_entries_thread(void *_data)
 {
 	struct load_cache_entries_thread_data *p = _data;
 
-	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static void *load_index_extensions_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+	unsigned long src_offset = p->start_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+								(const char *)p->mmap + src_offset,
+								(char *)p->mmap + src_offset + 8,
+								extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	p->consumed += src_offset - p->start_offset;
+
 	return NULL;
 }
 
-static unsigned long load_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
+static unsigned long load_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_cache_entries_thread_data *data;
-	int threads, cpus, thread_nr;
+	int nr_threads, cpus, ce_per_thread;
 	unsigned long consumed;
 	int i, thread;
 
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads) {
 		cpus = online_cpus();
-	threads = istate->cache_nr / THREAD_COST;
-	if (threads > cpus)
-		threads = cpus;
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
 
 	/* enable testing with fewer than default minimum of entries */
-	if ((istate->cache_nr > 1) && (threads < 2) && getenv("GIT_FASTINDEX_TEST"))
-		threads = 2;
+	if ((istate->cache_nr > 1) && (nr_threads < 2) && git_env_bool("GIT_INDEX_THREADS_TEST", 0))
+		nr_threads = 2;
 
-	if (threads < 2 || !git_config_get_fast_index())
+	if (nr_threads < 2)
 		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		die("the name hash isn't thread safe");
+
 	mem_pool_init(&istate->ce_mem_pool, 0);
 	if (istate->version == 4)
 		previous_name = &previous_name_buf;
 	else
 		previous_name = NULL;
 
-	thread_nr = (istate->cache_nr + threads - 1) / threads;
-	data = xcalloc(threads, sizeof(struct load_cache_entries_thread_data));
+	/* allocate an extra thread for loading the index extensions */
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads + 1, sizeof(struct load_cache_entries_thread_data));
 
-	/* loop through index entries starting a thread for every thread_nr entries */
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries.
+	 */
 	consumed = thread = 0;
-	for (i = 0; ; i++) {
+	for (i = 0; i < istate->cache_nr; i++) {
 		struct ondisk_cache_entry *ondisk;
 		const char *name;
 		unsigned int flags;
 
-		/* we've reached the begining of a block of cache entries, kick off a thread to process them */
-		if (0 == i % thread_nr) {
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (0 == i % ce_per_thread) {
 			struct load_cache_entries_thread_data *p = &data[thread];
 
 			p->istate = istate;
 			p->offset = i;
-			p->nr = min(thread_nr, istate->cache_nr - i);
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
 
 			/* create a mem_pool for each thread */
 			if (istate->version == 4)
@@ -2034,8 +2080,8 @@ static unsigned long load_cache_entries(struct index_state *istate, void *mmap,
 
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
 				die("unable to create load_cache_entries_thread");
-			if (++thread == threads || p->nr != thread_nr)
-				break;
+
+			++thread;
 		}
 
 		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
@@ -2064,7 +2110,18 @@ static unsigned long load_cache_entries(struct index_state *istate, void *mmap,
 			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
 	}
 
-	for (i = 0; i < threads; i++) {
+	/* create a thread to load the index extensions */
+	struct load_cache_entries_thread_data *p = &data[thread];
+	p->istate = istate;
+	mem_pool_init(&p->ce_mem_pool, 0);
+	p->mmap = mmap;
+	p->mmap_size = mmap_size;
+	p->start_offset = src_offset;
+
+	if (pthread_create(&p->pthread, NULL, load_index_extensions_thread, p))
+		die("unable to create load_index_extensions_thread");
+
+	for (i = 0; i < nr_threads + 1; i++) {
 		struct load_cache_entries_thread_data *p = data + i;
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");


### Patches

Ben Peart (3):
  read-cache: speed up index load through parallelization
  read-cache: load cache extensions on worker thread
  read-cache: micro-optimize expand_name_field() to speed up V4 index
    parsing.

 Documentation/config.txt |   6 +
 config.c                 |  14 ++
 config.h                 |   1 +
 read-cache.c             | 281 +++++++++++++++++++++++++++++++++++----
 4 files changed, 275 insertions(+), 27 deletions(-)


base-commit: 29d9e3e2c47dd4b5053b0a98c891878d398463e3
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v2 1/3] read-cache: speed up index load through parallelization
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
@ 2018-08-29 15:25   ` " Ben Peart
  2018-08-29 17:14     ` Junio C Hamano
  2018-09-03 19:16     ` Duy Nguyen
  2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
  2018-08-29 15:25   ` [PATCH v2 3/3] read-cache: micro-optimize expand_name_field() to speed up V4 index parsing Ben Peart
  2 siblings, 2 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-29 15:25 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them. Once the
threads are complete and the cache entries are loaded, the rest of the
extensions can be loaded and processed normally on the primary thread.

I used p0002-read-cache.sh to generate some performance data:

100,000 entries

Test                                HEAD~3           HEAD~2
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.02(0.01+0.12) 9.81(0.01+0.07) -30.0%

1,000,000 entries

Test                                HEAD~3            HEAD~2
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.06(0.06+0.09) 155.72(0.03+0.06) -22.9%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |   6 +
 config.c                 |  14 +++
 config.h                 |   1 +
 read-cache.c             | 240 +++++++++++++++++++++++++++++++++++----
 4 files changed, 237 insertions(+), 24 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1c42364988..79f8296d9c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2391,6 +2391,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPU's and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 9a0b10d4bc..3bda124550 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,20 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+int git_config_get_index_threads(void)
+{
+	int is_bool, val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto-detect */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..c30346388a 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1889,16 +1889,229 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifdef NO_PTHREADS
+
+#define load_cache_entries load_all_cache_entries
+
+#else
+
+#include "thread-utils.h"
+
+/*
+* Mostly randomly chosen maximum thread counts: we
+* cap the parallelism to online_cpus() threads, and we want
+* to have at least 7500 cache entries per thread for it to
+* be worth starting a thread.
+*/
+#define THREAD_COST		(7500)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+* A thread proc to run the load_cache_entries() computation
+* across multiple background threads.
+*/
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int nr_threads, cpus, ce_per_thread;
+	unsigned long consumed;
+	int i, thread;
+
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads) {
+		cpus = online_cpus();
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
+
+	/* enable testing with fewer than default minimum of entries */
+	if ((istate->cache_nr > 1) && (nr_threads < 2) && git_env_bool("GIT_INDEX_THREADS_TEST", 0))
+		nr_threads = 2;
+
+	if (nr_threads < 2)
+		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		die("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries. Exit the loop when we've created the final thread (no need
+	 * to parse the remaining entries.
+	 */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (0 == i % ce_per_thread) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+						  estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+						  estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+
+			/* exit the loop when we've created the last thread */
+			if (++thread == nr_threads)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1935,29 +2148,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
 	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
+	src_offset += load_cache_entries(istate, mmap, mmap_size, src_offset);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
@ 2018-08-29 15:25   ` Ben Peart
  2018-08-29 17:12     ` Junio C Hamano
  2018-09-03 19:21     ` Duy Nguyen
  2018-08-29 15:25   ` [PATCH v2 3/3] read-cache: micro-optimize expand_name_field() to speed up V4 index parsing Ben Peart
  2 siblings, 2 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-29 15:25 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data on the
cumulative impact:

100,000 entries

Test                                HEAD~3           HEAD~2
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

1,000,000 entries

Test                                HEAD~3            HEAD~2
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 60 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 12 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index c30346388a..f768004617 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1959,16 +1959,13 @@ struct load_cache_entries_thread_data
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
 	void *mmap;
+	size_t mmap_size;
 	unsigned long start_offset;
 	struct strbuf previous_name_buf;
 	struct strbuf *previous_name;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
-/*
-* A thread proc to run the load_cache_entries() computation
-* across multiple background threads.
-*/
 static void *load_cache_entries_thread(void *_data)
 {
 	struct load_cache_entries_thread_data *p = _data;
@@ -1978,6 +1975,36 @@ static void *load_cache_entries_thread(void *_data)
 	return NULL;
 }
 
+static void *load_index_extensions_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+	unsigned long src_offset = p->start_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+								(const char *)p->mmap + src_offset,
+								(char *)p->mmap + src_offset + 8,
+								extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	p->consumed += src_offset - p->start_offset;
+
+	return NULL;
+}
+
 static unsigned long load_cache_entries(struct index_state *istate,
 			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
@@ -2012,16 +2039,16 @@ static unsigned long load_cache_entries(struct index_state *istate,
 	else
 		previous_name = NULL;
 
+	/* allocate an extra thread for loading the index extensions */
 	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
-	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+	data = xcalloc(nr_threads + 1, sizeof(struct load_cache_entries_thread_data));
 
 	/*
 	 * Loop through index entries starting a thread for every ce_per_thread
-	 * entries. Exit the loop when we've created the final thread (no need
-	 * to parse the remaining entries.
+	 * entries.
 	 */
 	consumed = thread = 0;
-	for (i = 0; ; i++) {
+	for (i = 0; i < istate->cache_nr; i++) {
 		struct ondisk_cache_entry *ondisk;
 		const char *name;
 		unsigned int flags;
@@ -2055,9 +2082,7 @@ static unsigned long load_cache_entries(struct index_state *istate,
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
 				die("unable to create load_cache_entries_thread");
 
-			/* exit the loop when we've created the last thread */
-			if (++thread == nr_threads)
-				break;
+			++thread;
 		}
 
 		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
@@ -2086,7 +2111,18 @@ static unsigned long load_cache_entries(struct index_state *istate,
 			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
 	}
 
-	for (i = 0; i < nr_threads; i++) {
+	/* create a thread to load the index extensions */
+	struct load_cache_entries_thread_data *p = &data[thread];
+	p->istate = istate;
+	mem_pool_init(&p->ce_mem_pool, 0);
+	p->mmap = mmap;
+	p->mmap_size = mmap_size;
+	p->start_offset = src_offset;
+
+	if (pthread_create(&p->pthread, NULL, load_index_extensions_thread, p))
+		die("unable to create load_index_extensions_thread");
+
+	for (i = 0; i < nr_threads + 1; i++) {
 		struct load_cache_entries_thread_data *p = data + i;
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v2 3/3] read-cache: micro-optimize expand_name_field() to speed up V4 index parsing.
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
  2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
@ 2018-08-29 15:25   ` Ben Peart
  2 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-29 15:25 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

I used p0002-read-cache.sh to generate some performance data on the
cumulative impact:

100,000 files

Test                                HEAD~3           HEAD
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.03+0.09) 8.71(0.01+0.09) -38.1%

1,000,000 files

Test                                HEAD~3            HEAD
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 201.77(0.03+0.07) 149.68(0.04+0.07) -25.8%

Suggested by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index f768004617..f5e7c86c42 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1754,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
@ 2018-08-29 17:12     ` Junio C Hamano
  2018-08-29 21:42       ` Ben Peart
  2018-09-03 19:21     ` Duy Nguyen
  1 sibling, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-08-29 17:12 UTC (permalink / raw)
  To: Ben Peart; +Cc: git\, pclouds\

Ben Peart <Ben.Peart@microsoft.com> writes:

> This is possible because the current extensions don't access the cache
> entries in the index_state structure so are OK that they don't all exist
> yet.
>
> The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
> extensions don't even get a pointer to the index so don't have access to the
> cache entries.
>
> CACHE_EXT_LINK only uses the index_state to initialize the split index.
> CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
> update and dirty flags.

Good to see such an analysis here.  Once we define an extension
section, which requires us to have the cache entries before
populating it, this scheme would fall down, of course, but the
extension mechanism is all about protecting ourselves from the
future changes, so we'd at least need a good feel for how we read an
unknown extension from the future with the current code.  Perhaps
just like the main cache entries were pre-scanned to apportion them
to worker threads, we can pre-scan the sections and compare them
with a white-list built into our binary before deciding that it is
safe to read them in parallel (and otherwise, we ask the last thread
for reading extensions to wait until the workers that read the main
index all return)?

> -/*
> -* A thread proc to run the load_cache_entries() computation
> -* across multiple background threads.
> -*/

This one was mis-indented (lacking SP before '*') but they are gone
so ... ;-)

> @@ -1978,6 +1975,36 @@ static void *load_cache_entries_thread(void *_data)
>  	return NULL;
>  }
>  
> +static void *load_index_extensions_thread(void *_data)
> +{
> +	struct load_cache_entries_thread_data *p = _data;
> +	unsigned long src_offset = p->start_offset;
> +
> +	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> +		/* After an array of active_nr index entries,
> +		 * there can be arbitrary number of extended
> +		 * sections, each of which is prefixed with
> +		 * extension name (4-byte) and section length
> +		 * in 4-byte network byte order.
> +		 */
> +		uint32_t extsize;
> +		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
> +		extsize = ntohl(extsize);
> +		if (read_index_extension(p->istate,
> +								(const char *)p->mmap + src_offset,
> +								(char *)p->mmap + src_offset + 8,
> +								extsize) < 0) {

Overly deep indentation.  Used a wrong tab-width?

> +	/* allocate an extra thread for loading the index extensions */
>  	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
> -	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
> +	data = xcalloc(nr_threads + 1, sizeof(struct load_cache_entries_thread_data));
>  
>  	/*
>  	 * Loop through index entries starting a thread for every ce_per_thread
> -	 * entries. Exit the loop when we've created the final thread (no need
> -	 * to parse the remaining entries.
> +	 * entries.
>  	 */

I see.  Now the pre-parsing process needs to go through all the
cache entries to find the beginning of the extensions section.

>  	consumed = thread = 0;
> -	for (i = 0; ; i++) {
> +	for (i = 0; i < istate->cache_nr; i++) {
>  		struct ondisk_cache_entry *ondisk;
>  		const char *name;
>  		unsigned int flags;
> @@ -2055,9 +2082,7 @@ static unsigned long load_cache_entries(struct index_state *istate,
>  			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
>  				die("unable to create load_cache_entries_thread");
>  
> -			/* exit the loop when we've created the last thread */
> -			if (++thread == nr_threads)
> -				break;
> +			++thread;

This is not C++, and in (void) context, the codebase always prefers
post-increment.

> @@ -2086,7 +2111,18 @@ static unsigned long load_cache_entries(struct index_state *istate,
>  			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
>  	}
>  
> -	for (i = 0; i < nr_threads; i++) {
> +	/* create a thread to load the index extensions */
> +	struct load_cache_entries_thread_data *p = &data[thread];

This probably triggers decl-after-statement.

> +	p->istate = istate;
> +	mem_pool_init(&p->ce_mem_pool, 0);
> +	p->mmap = mmap;
> +	p->mmap_size = mmap_size;
> +	p->start_offset = src_offset;
> +
> +	if (pthread_create(&p->pthread, NULL, load_index_extensions_thread, p))
> +		die("unable to create load_index_extensions_thread");
> +
> +	for (i = 0; i < nr_threads + 1; i++) {
>  		struct load_cache_entries_thread_data *p = data + i;
>  		if (pthread_join(p->pthread, NULL))
>  			die("unable to join load_cache_entries_thread");

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-28 19:25             ` Duy Nguyen
  2018-08-28 23:54               ` Ben Peart
@ 2018-08-29 17:14               ` Junio C Hamano
  1 sibling, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-08-29 17:14 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> Yeah I kinda hated dummy_entry too but the feeling wasn't strong
> enough to move towards the index->version check. I guess I'm going to
> do it now.

Sounds like a plan.  Thanks again for a pleasant read.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 1/3] read-cache: speed up index load through parallelization
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
@ 2018-08-29 17:14     ` Junio C Hamano
  2018-08-29 21:35       ` Ben Peart
  2018-09-03 19:16     ` Duy Nguyen
  1 sibling, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-08-29 17:14 UTC (permalink / raw)
  To: Ben Peart; +Cc: git\, pclouds\

Ben Peart <Ben.Peart@microsoft.com> writes:

> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 1c42364988..79f8296d9c 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -2391,6 +2391,12 @@ imap::
>  	The configuration variables in the 'imap' section are described
>  	in linkgit:git-imap-send[1].
>  
> +index.threads::
> +	Specifies the number of threads to spawn when loading the index.
> +	This is meant to reduce index load time on multiprocessor machines.
> +	Specifying 0 or 'true' will cause Git to auto-detect the number of
> +	CPU's and set the number of threads accordingly. Defaults to 'true'.

"0 or 'true' means 'auto'" made me go "Huh?"

The "Huh?"  I initially felt comes from the fact that usually 0 and
false are interchangeable, but for this particular application,
"disabling" the threading means setting the count to one (not zero),
leaving us zero as a usable "special value" to signal 'auto'.

So the end result does make sense, especially with this bit ...

> diff --git a/config.c b/config.c
> index 9a0b10d4bc..3bda124550 100644
> --- a/config.c
> +++ b/config.c
> @@ -2289,6 +2289,20 @@ int git_config_get_fsmonitor(void)
> ...
> +	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
> +		if (is_bool)
> +			return val ? 0 : 1;
> +		else
> +			return val;

... which says "'0' and 'true' are the same and yield 0, '1' and
'false' yield 1, and '2' and above will give the int".  

Adding something like

	You can disable multi-threaded code by setting this variable
	to 'false' (or 1).

may reduce the risk of a similar "Huh?" reaction by other readers.

> +struct load_cache_entries_thread_data
> +{
> +	pthread_t pthread;
> +	struct index_state *istate;
> +	struct mem_pool *ce_mem_pool;
> +	int offset, nr;
> +	void *mmap;
> +	unsigned long start_offset;
> +	struct strbuf previous_name_buf;
> +	struct strbuf *previous_name;
> +	unsigned long consumed;	/* return # of bytes in index file processed */
> +};

We saw that Duy's "let's not use strbuf to remember the previous
name but instead use the previous ce" approach gave us a nice
performance boost; I wonder if we can build on that idea here?

One possible approach might be to create one ce per "block" in the
pre-scanning thread and use that ce as the "previous one" in the
per-thread data before spawning a worker.

> +static unsigned long load_cache_entries(struct index_state *istate,
> +			void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	struct load_cache_entries_thread_data *data;
> +	int nr_threads, cpus, ce_per_thread;
> +	unsigned long consumed;
> +	int i, thread;
> +
> +	nr_threads = git_config_get_index_threads();
> +	if (!nr_threads) {
> +		cpus = online_cpus();
> +		nr_threads = istate->cache_nr / THREAD_COST;

Here, nr_threads could become 0 with a small index, but any value
below 2 makes us call load_all_cache_entries() by the main thread
(and the value of nr_threads is not used anymore), it is fine.  Of
course, forced test will set it to 2 so there is no problem, either.

OK.

> +	/* a little sanity checking */
> +	if (istate->name_hash_initialized)
> +		die("the name hash isn't thread safe");

If it is a programming error to call into this codepath without
initializing the name_hash, which I think is the case, this is
better done with BUG("").

The remainder of the patch looked good.  Thanks.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 1/3] read-cache: speed up index load through parallelization
  2018-08-29 17:14     ` Junio C Hamano
@ 2018-08-29 21:35       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-08-29 21:35 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds



On 8/29/2018 1:14 PM, Junio C Hamano wrote:
> Ben Peart <Ben.Peart@microsoft.com> writes:
> 
>> diff --git a/Documentation/config.txt b/Documentation/config.txt
>> index 1c42364988..79f8296d9c 100644
>> --- a/Documentation/config.txt
>> +++ b/Documentation/config.txt
>> @@ -2391,6 +2391,12 @@ imap::
>>   	The configuration variables in the 'imap' section are described
>>   	in linkgit:git-imap-send[1].
>>   

> Adding something like
> 
> 	You can disable multi-threaded code by setting this variable
> 	to 'false' (or 1).
> 
> may reduce the risk of a similar "Huh?" reaction by other readers.
> 

Will do

>> +struct load_cache_entries_thread_data
>> +{
>> +	pthread_t pthread;
>> +	struct index_state *istate;
>> +	struct mem_pool *ce_mem_pool;
>> +	int offset, nr;
>> +	void *mmap;
>> +	unsigned long start_offset;
>> +	struct strbuf previous_name_buf;
>> +	struct strbuf *previous_name;
>> +	unsigned long consumed;	/* return # of bytes in index file processed */
>> +};
> 
> We saw that Duy's "let's not use strbuf to remember the previous
> name but instead use the previous ce" approach gave us a nice
> performance boost; I wonder if we can build on that idea here?
> 
> One possible approach might be to create one ce per "block" in the
> pre-scanning thread and use that ce as the "previous one" in the
> per-thread data before spawning a worker.
> 

Yes, I believe this can be done.  I was planning to wait until both 
patches settled down a bit before adapting it to threads.  It's a little 
trickier because the previous ce doesn't yet exist but I believe one can 
be fabricated enough to make the optimization work.

>> +static unsigned long load_cache_entries(struct index_state *istate,
>> +			void *mmap, size_t mmap_size, unsigned long src_offset)
>> +{
>> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>> +	struct load_cache_entries_thread_data *data;
>> +	int nr_threads, cpus, ce_per_thread;
>> +	unsigned long consumed;
>> +	int i, thread;
>> +
>> +	nr_threads = git_config_get_index_threads();
>> +	if (!nr_threads) {
>> +		cpus = online_cpus();
>> +		nr_threads = istate->cache_nr / THREAD_COST;
> 
> Here, nr_threads could become 0 with a small index, but any value
> below 2 makes us call load_all_cache_entries() by the main thread
> (and the value of nr_thread is not used anymore), it is fine.  Of
> course, forced test will set it to 2 so there is no problem, either.
> 
> OK.
> 
>> +	/* a little sanity checking */
>> +	if (istate->name_hash_initialized)
>> +		die("the name hash isn't thread safe");
> 
> If it is a programming error to call into this codepath without
> initializing the name_hash, which I think is the case, this is
> better done with BUG("").
> 

Will do

> The remainder of the patch looked good.  Thanks.
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 17:12     ` Junio C Hamano
@ 2018-08-29 21:42       ` Ben Peart
  2018-08-29 22:19         ` Junio C Hamano
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-08-29 21:42 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds



On 8/29/2018 1:12 PM, Junio C Hamano wrote:
> Ben Peart <Ben.Peart@microsoft.com> writes:
> 
>> This is possible because the current extensions don't access the cache
>> entries in the index_state structure so are OK that they don't all exist
>> yet.
>>
>> The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
>> extensions don't even get a pointer to the index so don't have access to the
>> cache entries.
>>
>> CACHE_EXT_LINK only uses the index_state to initialize the split index.
>> CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
>> update and dirty flags.
> 
> Good to see such an analysis here.  Once we define an extension
> section, which requires us to have the cache entries before
> populating it, this scheme would fall down, of course, but the
> extension mechanism is all about protecting ourselves from the
> future changes, so we'd at least need a good feel for how we read an
> unknown extension from the future with the current code.  Perhaps
> just like the main cache entries were pre-scanned to apportion them
> to worker threads, we can pre-scan the sections and compare them
> with a white-list built into our binary before deciding that it is
> safe to read them in parallel (and otherwise, we ask the last thread
> for reading extensions to wait until the workers that read the main
> index all return)?
> 

Yes, when we add a new extension that requires the cache entries to 
exist and be parsed, we will need to add a mechanism to ensure that 
happens for that extension.  I agree a white list is probably the right 
way to deal with it.  Until we have that need, it would just add 
unnecessary complexity so I think we should wait till it is actually needed.

There isn't any change in behavior with unknown extensions and this 
patch.  If an unknown extension exists it will just get ignored and 
reported as an "unknown extension" or "die" if it is marked as "required."

I'll fix the rest of your suggestions - thanks for the close review.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 21:42       ` Ben Peart
@ 2018-08-29 22:19         ` Junio C Hamano
  0 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-08-29 22:19 UTC (permalink / raw)
  To: Ben Peart; +Cc: Ben Peart, git, pclouds

Ben Peart <peartben@gmail.com> writes:

> There isn't any change in behavior with unknown extensions and this
> patch.  If an unknown extension exists it will just get ignored and
> reported as an "unknown extension" or "die" if it is marked as
> "required."

OK.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v2 0/1] optimize reading index format v4
  2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
  2018-08-27 19:36           ` Junio C Hamano
@ 2018-09-02 13:19           ` " Nguyễn Thái Ngọc Duy
  2018-09-02 13:19             ` [PATCH v2 1/1] read-cache.c: " Nguyễn Thái Ngọc Duy
  1 sibling, 1 reply; 153+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-09-02 13:19 UTC (permalink / raw)
  To: pclouds; +Cc: Ben.Peart, git, gitster, peartben

v2 removes unrelated changes and the dummy_entry. strip_len is also
replaced with copy_len to reduce repeated subtraction calculation.
Diff: 

diff --git a/read-cache.c b/read-cache.c
index 5c04c8f200..8628d0f3a8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,7 +1713,7 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct index_state *istate,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
 					    const struct cache_entry *previous_ce)
@@ -1722,7 +1722,15 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	size_t len;
 	const char *name;
 	unsigned int flags;
-	size_t strip_len;
+	size_t copy_len;
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	int expand_name_field = istate->version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1735,37 +1743,37 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 		extended_flags = get_be16(&ondisk2->flags2) << 16;
 		/* We do not yet understand any bit out of CE_EXTENDED_FLAGS */
 		if (extended_flags & ~CE_EXTENDED_FLAGS)
-			die(_("unknown index entry format %08x"), extended_flags);
+			die("Unknown index entry format %08x", extended_flags);
 		flags |= extended_flags;
 		name = ondisk2->name;
 	}
 	else
 		name = ondisk->name;
 
-	/*
-	 * Adjacent cache entries tend to share the leading paths, so it makes
-	 * sense to only store the differences in later entries.  In the v4
-	 * on-disk format of the index, each on-disk cache entry stores the
-	 * number of bytes to be stripped from the end of the previous name,
-	 * and the bytes to append to the result, to come up with its name.
-	 */
-	if (previous_ce) {
+	if (expand_name_field) {
 		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
 		strip_len = decode_varint(&cp);
-		if (previous_ce->ce_namelen < strip_len)
-			die(_("malformed name field in the index, path '%s'"),
-			    previous_ce->name);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
 		name = (const char *)cp;
 	}
 
 	if (len == CE_NAMEMASK) {
 		len = strlen(name);
-		if (previous_ce)
-			len += previous_ce->ce_namelen - strip_len;
+		if (expand_name_field)
+			len += copy_len;
 	}
 
-	ce = mem_pool__ce_alloc(mem_pool, len);
+	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
 
 	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
 	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
@@ -1782,9 +1790,9 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	ce->index = 0;
 	hashcpy(ce->oid.hash, ondisk->sha1);
 
-	if (previous_ce) {
-		size_t copy_len = previous_ce->ce_namelen - strip_len;
-		memcpy(ce->name, previous_ce->name, copy_len);
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
 		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
 		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
 	} else {
@@ -1885,7 +1893,6 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	void *mmap;
 	size_t mmap_size;
 	const struct cache_entry *previous_ce = NULL;
-	struct cache_entry *dummy_entry = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1923,7 +1930,6 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_ce = dummy_entry = make_empty_transient_cache_entry(0);
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
@@ -1938,14 +1944,12 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_ce);
+		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
-		if (previous_ce)
-			previous_ce = ce;
+		previous_ce = ce;
 	}
-	free(dummy_entry);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 


Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 read-cache.c | 128 ++++++++++++++++++++++++---------------------------
 1 file changed, 60 insertions(+), 68 deletions(-)

-- 
2.19.0.rc0.337.ge906d732e7


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v2 1/1] read-cache.c: optimize reading index format v4
  2018-09-02 13:19           ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
@ 2018-09-02 13:19             ` " Nguyễn Thái Ngọc Duy
  2018-09-04 18:58               ` Junio C Hamano
  2018-09-04 19:31               ` Junio C Hamano
  0 siblings, 2 replies; 153+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-09-02 13:19 UTC (permalink / raw)
  To: pclouds; +Cc: Ben.Peart, git, gitster, peartben

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 128 ++++++++++++++++++++++++---------------------------
 1 file changed, 60 insertions(+), 68 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..8628d0f3a8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,63 +1713,24 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
-/*
- * Adjacent cache entries tend to share the leading paths, so it makes
- * sense to only store the differences in later entries.  In the v4
- * on-disk format of the index, each on-disk cache entry stores the
- * number of bytes to be stripped from the end of the previous name,
- * and the bytes to append to the result, to come up with its name.
- */
-static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
-{
-	const unsigned char *ep, *cp = (const unsigned char *)cp_;
-	size_t len = decode_varint(&cp);
-
-	if (name->len < len)
-		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
-	strbuf_add(name, cp, ep - cp);
-	return (const char *)ep + 1 - cp_;
-}
-
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct index_state *istate,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len;
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	int expand_name_field = istate->version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1789,21 +1750,54 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (expand_name_field)
+			len += copy_len;
+	}
+
+	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1898,7 +1892,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	const struct cache_entry *previous_ce = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,11 +1930,9 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
@@ -1952,12 +1944,12 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.19.0.rc0.337.ge906d732e7


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 1/3] read-cache: speed up index load through parallelization
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
  2018-08-29 17:14     ` Junio C Hamano
@ 2018-09-03 19:16     ` Duy Nguyen
  1 sibling, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-09-03 19:16 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano

On Wed, Aug 29, 2018 at 5:25 PM Ben Peart <Ben.Peart@microsoft.com> wrote:
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..c30346388a 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1889,16 +1889,229 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>         return ondisk_size + entries * per_entry;
>  }
>
> +/*
> + * A helper function that will load the specified range of cache entries
> + * from the memory mapped file and add them to the given index.
> + */
> +static unsigned long load_cache_entry_block(struct index_state *istate,
> +                       struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
> +                       unsigned long start_offset, struct strbuf *previous_name)
> +{
> +       int i;
> +       unsigned long src_offset = start_offset;
> +
> +       for (i = offset; i < offset + nr; i++) {

It may be micro optimization, but since we're looping a lot and can't
trust the compiler to optimize this, maybe just calculate this upper
limit and store in a local variable to make it clear the upper limit
is known, no point of recalculating it at every iteration.

> +               struct ondisk_cache_entry *disk_ce;
> +               struct cache_entry *ce;
> +               unsigned long consumed;
> +
> +               disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +               ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
> +               set_index_entry(istate, i, ce);
> +
> +               src_offset += consumed;
> +       }
> +       return src_offset - start_offset;
> +}
> +
> +static unsigned long load_all_cache_entries(struct index_state *istate,
> +                       void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +       struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +       unsigned long consumed;
> +
> +       if (istate->version == 4) {
> +               previous_name = &previous_name_buf;
> +               mem_pool_init(&istate->ce_mem_pool,
> +                             estimate_cache_size_from_compressed(istate->cache_nr));
> +       } else {
> +               previous_name = NULL;
> +               mem_pool_init(&istate->ce_mem_pool,
> +                             estimate_cache_size(mmap_size, istate->cache_nr));
> +       }
> +
> +       consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
> +                                       0, istate->cache_nr, mmap, src_offset, previous_name);
> +       strbuf_release(&previous_name_buf);
> +       return consumed;
> +}
> +
> +#ifdef NO_PTHREADS
> +
> +#define load_cache_entries load_all_cache_entries
> +
> +#else
> +
> +#include "thread-utils.h"

Don't include files in a middle of a file.

> +
> +/*
> +* Mostly randomly chosen maximum thread counts: we
> +* cap the parallelism to online_cpus() threads, and we want
> +* to have at least 7500 cache entries per thread for it to
> +* be worth starting a thread.
> +*/
> +#define THREAD_COST            (7500)

Isn't 7500 a bit too low? I'm still basing on webkit.git,  and 7500
entries take about 1.2ms on average. 100k files would take about 16ms
and may be more reasonable (still too low in my opinion).

> +
> +struct load_cache_entries_thread_data
> +{
> +       pthread_t pthread;
> +       struct index_state *istate;
> +       struct mem_pool *ce_mem_pool;
> +       int offset, nr;
> +       void *mmap;
> +       unsigned long start_offset;
> +       struct strbuf previous_name_buf;
> +       struct strbuf *previous_name;
> +       unsigned long consumed; /* return # of bytes in index file processed */
> +};
> +
> +/*
> +* A thread proc to run the load_cache_entries() computation
> +* across multiple background threads.
> +*/
> +static void *load_cache_entries_thread(void *_data)
> +{
> +       struct load_cache_entries_thread_data *p = _data;
> +
> +       p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
> +               p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
> +       return NULL;
> +}
> +
> +static unsigned long load_cache_entries(struct index_state *istate,
> +                       void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +       struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +       struct load_cache_entries_thread_data *data;
> +       int nr_threads, cpus, ce_per_thread;
> +       unsigned long consumed;
> +       int i, thread;
> +
> +       nr_threads = git_config_get_index_threads();
> +       if (!nr_threads) {
> +               cpus = online_cpus();
> +               nr_threads = istate->cache_nr / THREAD_COST;
> +               if (nr_threads > cpus)
> +                       nr_threads = cpus;
> +       }
> +
> +       /* enable testing with fewer than default minimum of entries */
> +       if ((istate->cache_nr > 1) && (nr_threads < 2) && git_env_bool("GIT_INDEX_THREADS_TEST", 0))
> +               nr_threads = 2;

Please don't add more '()' than necessary. It's just harder to read.
Maybe break that "if" into two lines since it's getting long.

> +
> +       if (nr_threads < 2)
> +               return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
> +
> +       /* a little sanity checking */
> +       if (istate->name_hash_initialized)
> +               die("the name hash isn't thread safe");
> +
> +       mem_pool_init(&istate->ce_mem_pool, 0);
> +       if (istate->version == 4)
> +               previous_name = &previous_name_buf;
> +       else
> +               previous_name = NULL;
> +
> +       ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
> +       data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
> +
> +       /*
> +        * Loop through index entries starting a thread for every ce_per_thread
> +        * entries. Exit the loop when we've created the final thread (no need
> +        * to parse the remaining entries.
> +        */
> +       consumed = thread = 0;
> +       for (i = 0; ; i++) {
> +               struct ondisk_cache_entry *ondisk;
> +               const char *name;
> +               unsigned int flags;
> +
> +               /*
> +                * we've reached the beginning of a block of cache entries,
> +                * kick off a thread to process them
> +                */
> +               if (0 == i % ce_per_thread) {

I don't get why people keep putting constants in reversed order like
this. Perhaps in the old days, it helps catch "a = 0" mistakes, but
compilers nowadays are smart enough to complain about that and this is
just hard to read.

> +                       struct load_cache_entries_thread_data *p = &data[thread];
> +
> +                       p->istate = istate;
> +                       p->offset = i;
> +                       p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
> +
> +                       /* create a mem_pool for each thread */
> +                       if (istate->version == 4)
> +                               mem_pool_init(&p->ce_mem_pool,
> +                                                 estimate_cache_size_from_compressed(p->nr));
> +                       else
> +                               mem_pool_init(&p->ce_mem_pool,
> +                                                 estimate_cache_size(mmap_size, p->nr));
> +
> +                       p->mmap = mmap;
> +                       p->start_offset = src_offset;
> +                       if (previous_name) {
> +                               strbuf_addbuf(&p->previous_name_buf, previous_name);
> +                               p->previous_name = &p->previous_name_buf;
> +                       }
> +
> +                       if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
> +                               die("unable to create load_cache_entries_thread");
> +
> +                       /* exit the loop when we've created the last thread */
> +                       if (++thread == nr_threads)
> +                               break;

I still think it's better to have an extension to avoid looping
through like this. How much time does this "for (i = 0; ; i++)" loop
cost? The first thread can't start until you've scanned to the second
block, when you have zillion of entries and about 4 cores, that could
be significant delay. Unless you break smaller blocks and have one
thread handles multiple blocks, but then you pay the cost for
synchronization. Other threads may overlap a bit, but starting all
threads at the same time would benefit more. You also can't start
loading the extensions until you've scanned through all this.

> +               }
> +
> +               ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +
> +               /* On-disk flags are just 16 bits */
> +               flags = get_be16(&ondisk->flags);
> +
> +               if (flags & CE_EXTENDED) {
> +                       struct ondisk_cache_entry_extended *ondisk2;
> +                       ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
> +                       name = ondisk2->name;
> +               } else
> +                       name = ondisk->name;
> +
> +               if (!previous_name) {
> +                       size_t len;
> +
> +                       /* v3 and earlier */
> +                       len = flags & CE_NAMEMASK;
> +                       if (len == CE_NAMEMASK)
> +                               len = strlen(name);
> +                       src_offset += (flags & CE_EXTENDED) ?
> +                               ondisk_cache_entry_extended_size(len) :
> +                               ondisk_cache_entry_size(len);
> +               } else
> +                       src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
> +       }
> +
> +       for (i = 0; i < nr_threads; i++) {
> +               struct load_cache_entries_thread_data *p = data + i;
> +               if (pthread_join(p->pthread, NULL))
> +                       die("unable to join load_cache_entries_thread");

_()

> +               mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
> +               strbuf_release(&p->previous_name_buf);
> +               consumed += p->consumed;
> +       }
> +
> +       free(data);
> +       strbuf_release(&previous_name_buf);
> +
> +       return consumed;
> +}
> +
> +#endif
> +
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
  2018-08-29 17:12     ` Junio C Hamano
@ 2018-09-03 19:21     ` Duy Nguyen
  2018-09-03 19:27       ` Duy Nguyen
  1 sibling, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-03 19:21 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano

On Wed, Aug 29, 2018 at 5:25 PM Ben Peart <Ben.Peart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by loading
> the cache extensions on a worker thread in parallel with loading the cache
> entries.
>
> This is possible because the current extensions don't access the cache
> entries in the index_state structure so are OK that they don't all exist
> yet.
>
> The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
> extensions don't even get a pointer to the index so don't have access to the
> cache entries.
>
> CACHE_EXT_LINK only uses the index_state to initialize the split index.
> CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
> update and dirty flags.
>
> I used p0002-read-cache.sh to generate some performance data on the
> cumulative impact:
>
> 100,000 entries
>
> Test                                HEAD~3           HEAD~2
> ---------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

This is misleading (if I read it correctly). 1/3 already drops
execution time down to 9.81, so this patch alone only has about 6%
saving. Have you measured how much time is spent on loading extensions
in single threaded mode? I'm just curious if we could hide that
completely (provided that we have enough cores) while we load the
index.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-09-03 19:21     ` Duy Nguyen
@ 2018-09-03 19:27       ` Duy Nguyen
  0 siblings, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-09-03 19:27 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano

On Mon, Sep 3, 2018 at 9:21 PM Duy Nguyen <pclouds@gmail.com> wrote:
> > I used p0002-read-cache.sh to generate some performance data on the
> > cumulative impact:
> >
> > 100,000 entries
> >
> > Test                                HEAD~3           HEAD~2
> > ---------------------------------------------------------------------------
> > read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%
>
> This is misleading (if I read it correctly). 1/3 already drops
> execution time down to 9.81, so this patch alone only has about 6%
> saving.

I may have miscalculated that. 1/3 says -30% saving, here it's -31%,
so I guess it's 1% extra saving (or ~3% on 1m entries)? That's
definitely not worth doing.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-27 19:36           ` Junio C Hamano
  2018-08-28 19:25             ` Duy Nguyen
@ 2018-09-04 16:08             ` Duy Nguyen
  1 sibling, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-09-04 16:08 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben.Peart, git, peartben

On Mon, Aug 27, 2018 at 12:36:27PM -0700, Junio C Hamano wrote:
> > PS. I notice that v4 does not pad to align entries at 4 byte boundary
> > like v2/v3. This could cause a slight slow down on x86 and segfault on
> > some other platforms.
> 
> Care to elaborate?  
> 
> Long time ago, we used to mmap and read directly from the index file
> contents, requiring either an unaligned read or padded entries.  But
> that was eons ago and we first read and convert from on-disk using
> get_be32() etc. to in-core structure, so I am not sure what you mean
> by "segfault" here.

To conclude this unalignment thing (since I plan more changes in the
index to keep its size down, which may increase unaligned access), I
ran with the following patch on amd64 (still webkit.git, 275k files,
100 runs), the index version that does not make unaligned access does
not give noticeable differences. Still roughly around 4.2s.

Running with NO_UNALIGNED_LOADS defined is clearly slower, in 4.3s
range. So in theory if we avoid unaligned access in the index and
avoid slow get_beXX versions, we could bring performance back to 4.2s
range for those platforms.

But on the other hand, padding the index increases the index size by
~1MB (v4 version before padding is 21MB) and this may add more cost at
update time because of the trailer hash.

So, yeah it's probably ok to keep living with unaligned access and not
pad more. At least until those on "no unaligned access" platforms yell
up.

diff --git a/read-cache.c b/read-cache.c
index 8628d0f3a8..33ee35fb81 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1794,7 +1794,7 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 		if (copy_len)
 			memcpy(ce->name, previous_ce->name, copy_len);
 		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
-		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+		*ent_size = ((name - ((char *)ondisk)) + len - copy_len + 8) & ~7;
 	} else {
 		memcpy(ce->name, name, len + 1);
 		*ent_size = ondisk_ce_size(ce);
@@ -2345,8 +2345,10 @@ static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
 			result = ce_write(c, fd, to_remove_vi, prefix_size);
 		if (!result)
 			result = ce_write(c, fd, ce->name + common, ce_namelen(ce) - common);
-		if (!result)
-			result = ce_write(c, fd, padding, 1);
+		if (!result) {
+			int len = prefix_size + ce_namelen(ce) - common;
+			result = ce_write(c, fd, padding, align_padding_size(size, len));
+		}
 
 		strbuf_splice(previous_name, common, to_remove,
 			      ce->name + common, ce_namelen(ce) - common);

--
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 1/1] read-cache.c: optimize reading index format v4
  2018-09-02 13:19             ` [PATCH v2 1/1] read-cache.c: " Nguyễn Thái Ngọc Duy
@ 2018-09-04 18:58               ` Junio C Hamano
  2018-09-04 19:31               ` Junio C Hamano
  1 sibling, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-04 18:58 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: Ben.Peart, git, peartben

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> +static struct cache_entry *create_from_disk(struct index_state *istate,
>  					    struct ondisk_cache_entry *ondisk,
>  					    unsigned long *ent_size,
> -					    struct strbuf *previous_name)
> +					    const struct cache_entry *previous_ce)
>  {
>  	struct cache_entry *ce;
>  	size_t len;
>  	const char *name;
>  	unsigned int flags;
> +	size_t copy_len;

We should not have to, but let's initialize it to 0 here, because
...

> +	if (expand_name_field) {
> +...
> +		copy_len = previous_len - strip_len;
> +		name = (const char *)cp;
> +	}
> +
> +	if (len == CE_NAMEMASK) {
> +		len = strlen(name);
> +		if (expand_name_field)
> +			len += copy_len;
> ...
> +	}
> +	if (expand_name_field) {
> +		if (copy_len)
> +			memcpy(ce->name, previous_ce->name, copy_len);
> +		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
> +		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;

I am seeing a compiler getting confused, thinking that copy_len
could be used before getting assigned.

Humans can see that reference to copy_len are made only inside "if
(expand_name_field)", so we shouldn't have to.


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v2 1/1] read-cache.c: optimize reading index format v4
  2018-09-02 13:19             ` [PATCH v2 1/1] read-cache.c: " Nguyễn Thái Ngọc Duy
  2018-09-04 18:58               ` Junio C Hamano
@ 2018-09-04 19:31               ` Junio C Hamano
  1 sibling, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-04 19:31 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: Ben.Peart, git, peartben

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Index format v4 requires some more computation to assemble a path
> based on a previous one. The current code is not very efficient
> because
>
>  - it doubles memory copy, we assemble the final path in a temporary
>    first before putting it back to a cache_entry
>
>  - strbuf_remove() in expand_name_field() is not exactly a good fit
>    for stripping a part at the end, _setlen() would do the same job
>    and is much cheaper.
>
>  - the open-coded loop to find the end of the string in
>    expand_name_field() can't beat an optimized strlen()
>
> This patch avoids the temporary buffer and writes directly to the new
> cache_entry, which addresses the first two points. The last point
> could also be avoided if the total string length fits in the first 12
> bits of ce_flags, if not we fall back to strlen().
>
> Running "test-tool read-cache 100" on webkit.git (275k files), reading
> v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
> time. The patch reduces read time on v4 to 4.319 seconds.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  read-cache.c | 128 ++++++++++++++++++++++++---------------------------
>  1 file changed, 60 insertions(+), 68 deletions(-)

Thanks; this round is much easier to read with a clearly named
"expand_name_field" boolean variable, etc.


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v3 0/4] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (2 preceding siblings ...)
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
@ 2018-09-06 21:03 ` Ben Peart
  2018-09-06 21:03   ` [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing Ben Peart
                     ` (4 more replies)
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
                   ` (4 subsequent siblings)
  8 siblings, 5 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

On further investigation with the previous patch, I noticed that my test
repos didn't contain the cache tree extension in their index. After doing a
commit to ensure they existed, I realized that in some instances, the time
to load the cache tree exceeded the time to load all the cache entries in
parallel.  Because the thread to read the cache tree was started last (due
to having to parse through all the cache entries first) we weren't always
getting optimal performance.

To better optimize for this case, I decided to write the EOIE extension
as suggested by Junio [1] in response to my earlier multithreading patch
series [2].  This enables me to spin up the thread to load the extensions
earlier as it no longer has to parse through all the cache entries first.

The big changes in this iteration are:

- add the EOIE extension
- update the index extension worker thread to start first

The absolute perf numbers don't look as good as the previous iteration
because not loading the cache tree at all is a lot faster than loading it in
parallel. These were measured with a V4 index that included a cache tree
extension.

I used p0002-read-cache.sh to generate some performance data on how the three
performance patches help:

p0002-read-cache.sh w/100,000 files                        
Baseline         expand_name_field()    Thread extensions       Thread entries
---------------------------------------------------------------------------------------
22.34(0.01+0.12) 21.14(0.03+0.01) -5.4% 20.71(0.03+0.03) -7.3%	13.93(0.04+0.04) -37.6%

p0002-read-cache.sh w/1,000,000 files                        
Baseline          expand_name_field()     Thread extensions        Thread entries
-------------------------------------------------------------------------------------------
306.44(0.04+0.07) 295.42(0.01+0.07) -3.6% 217.60(0.03+0.04) -29.0% 199.00(0.00+0.10) -35.1%

This patch conflicts with Duy's patch to remove the double memory copy and
pass in the previous ce instead.  The two will need to be merged/reconciled
once they settle down a bit.

[1] https://public-inbox.org/git/xmqq1sl017dw.fsf@gitster.mtv.corp.google.com/
[2] https://public-inbox.org/git/20171109141737.47976-1-benpeart@microsoft.com/


Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/325ec69299
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v3 && git checkout 325ec69299


### Patches

Ben Peart (4):
  read-cache: optimize expand_name_field() to speed up V4 index parsing.
  eoie: add End of Index Entry (EOIE) extension
  read-cache: load cache extensions on a worker thread
  read-cache: speed up index load through parallelization

 Documentation/config.txt                 |   6 +
 Documentation/technical/index-format.txt |  23 ++
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 476 ++++++++++++++++++++---
 t/README                                 |  11 +
 t/t1700-split-index.sh                   |   1 +
 7 files changed, 487 insertions(+), 49 deletions(-)


base-commit: 29d9e3e2c47dd4b5053b0a98c891878d398463e3
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing.
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-06 21:03   ` Ben Peart
  2018-09-06 21:03   ` [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

Optimize expand_name_field() to speed up V4 index parsing.

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

I used p0002-read-cache.sh to generate some performance data:

p0002-read-cache.sh w/100,000 files
Baseline         expand_name_field()
---------------------------------------
22.34(0.01+0.12) 21.14(0.03+0.01) -5.4%

p0002-read-cache.sh w/1,000,000 files
Baseline          expand_name_field()
-----------------------------------------
306.44(0.04+0.07) 295.42(0.01+0.07) -3.6%

Suggested by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..382cc16bdc 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1754,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
  2018-09-06 21:03   ` [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing Ben Peart
@ 2018-09-06 21:03   ` Ben Peart
  2018-09-07 17:55     ` Junio C Hamano
  2018-09-06 21:03   ` [PATCH v3 3/4] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 149 +++++++++++++++++++++--
 t/README                                 |   5 +
 t/t1700-split-index.sh                   |   1 +
 4 files changed, 170 insertions(+), 8 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the beginning of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 382cc16bdc..d0d2793780 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1888,6 +1892,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap, size_t mmap_size);
+#endif
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2197,11 +2206,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2444,7 +2457,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2453,6 +2466,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	unsigned long offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2519,11 +2533,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2534,7 +2550,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2544,7 +2560,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2555,7 +2571,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2566,7 +2582,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.
+	 */
+	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2977,3 +3009,104 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *index, *eoie = (const char *)mmap + mmap_size - GIT_SHA1_RAWSZ - EOIE_SIZE_WITH_HEADER;
+	uint32_t extsize;
+	unsigned long offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* validate the extension signature */
+	index = eoie;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if ((const char *)mmap + offset < (const char *)mmap + sizeof(struct cache_header))
+		return 0;
+	if ((const char *)mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *               "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, (const char *)mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (hashcmp(hash, (unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+#endif
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
diff --git a/t/README b/t/README
index 9028b47d92..d8754dd23a 100644
--- a/t/README
+++ b/t/README
@@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncommon pack-objects code
 path where deltas larger than this limit require extra memory
 allocation for bookkeeping.
 
+GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
+This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
+as they currently hard code SHA values for the index which are no longer
+valid due to the addition of the EOIE extension.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 39133bcbc8..f613dd72e3 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,6 +7,7 @@ test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
+export GIT_TEST_DISABLE_EOIE=true
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v3 3/4] read-cache: load cache extensions on a worker thread
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
  2018-09-06 21:03   ` [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing Ben Peart
  2018-09-06 21:03   ` [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-06 21:03   ` Ben Peart
  2018-09-07 21:10     ` Junio C Hamano
  2018-09-06 21:03   ` [PATCH v3 4/4] read-cache: speed up index load through parallelization Ben Peart
  2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
  4 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

p0002-read-cache.sh w/100,000 files
Baseline         Thread extensions
---------------------------------------
21.14(0.03+0.01) 20.71(0.03+0.03) -2.0%

p0002-read-cache.sh w/1,000,000 files
Baseline          Thread extensions
------------------------------------------
295.42(0.01+0.07) 217.60(0.03+0.04) -26.3%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |  6 +++
 config.c                 | 18 ++++++++
 config.h                 |  1 +
 read-cache.c             | 94 ++++++++++++++++++++++++++++++++--------
 4 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1c42364988..79f8296d9c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2391,6 +2391,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPUs and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 9a0b10d4bc..9bd79fb165 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+/*
+ * You can disable multi-threaded code by setting index.threads
+ * to 'false' (or 1)
+ */
+int git_config_get_index_threads(void)
+{
+	int is_bool, val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto-detect */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index d0d2793780..fcc776aaf0 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,10 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#ifndef NO_PTHREADS
+#include <pthread.h>
+#include <thread-utils.h>
+#endif
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1897,6 +1901,46 @@ static unsigned long read_eoie_extension(void *mmap, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	void *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+ };
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+			(const char *)p->mmap + src_offset,
+			(char *)p->mmap + src_offset + 8,
+			extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1907,6 +1951,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	void *mmap;
 	size_t mmap_size;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_index_extensions p = { 0 };
+	unsigned long extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1943,6 +1992,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads >= 2) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			/* create a thread to load the index extensions */
+			p.src_offset = extension_offset;
+			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
+				die(_("unable to create load_index_extensions_thread"));
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
@@ -1969,23 +2038,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
-		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset && pthread_join(p.pthread, NULL))
+		die(_("unable to join load_index_extensions_thread"));
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap(mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v3 4/4] read-cache: speed up index load through parallelization
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
                     ` (2 preceding siblings ...)
  2018-09-06 21:03   ` [PATCH v3 3/4] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-06 21:03   ` Ben Peart
  2018-09-07  4:16     ` Torsten Bögershausen
  2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
  4 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them.

I used p0002-read-cache.sh to generate some performance data:

p0002-read-cache.sh w/100,000 files
Baseline           Thread entries
------------------------------------------
20.71(0.03+0.03)   13.93(0.04+0.04) -32.7%

p0002-read-cache.sh w/1,000,000 files
Baseline            Thread entries
-------------------------------------------
217.60(0.03+0.04)   199.00(0.00+0.10) -8.6%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 242 +++++++++++++++++++++++++++++++++++++++++++++------
 t/README     |   6 ++
 2 files changed, 220 insertions(+), 28 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index fcc776aaf0..8537a55750 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 100000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+#define THREAD_COST		(10000)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int ce_per_thread;
+	unsigned long consumed;
+	int i, thread;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries. Exit the loop when we've created the final thread (no need
+	 * to parse the remaining entries.
+	 */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (i % ce_per_thread == 0) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+						estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+						estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+
+			/* exit the loop when we've created the last thread */
+			if (++thread == nr_threads)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int cpus, nr_threads;
 #endif
 
 	if (istate->initialized)
@@ -1996,10 +2188,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+	if (!nr_threads) {
+		cpus = online_cpus();
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
+
+	/* enable testing with fewer than default minimum of entries */
+	if (istate->cache_nr > 1 && nr_threads < 3 && git_env_bool("GIT_TEST_INDEX_THREADS", 0))
+		nr_threads = 3;
 
 	if (nr_threads >= 2) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2008,33 +2210,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			p.src_offset = extension_offset;
 			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
 				die(_("unable to create load_index_extensions_thread"));
+			nr_threads--;
 		}
 	}
+	if (nr_threads >= 2)
+		src_offset += load_cache_entries_threaded(nr_threads, istate, mmap, mmap_size, src_offset);
+	else
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 #endif
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
diff --git a/t/README b/t/README
index d8754dd23a..59015f7150 100644
--- a/t/README
+++ b/t/README
@@ -324,6 +324,12 @@ This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
 as they currently hard code SHA values for the index which are no longer
 valid due to the addition of the EOIE extension.
 
+GIT_TEST_INDEX_THREADS=<boolean> forces multi-threaded loading of
+the index cache entries and extensions for the whole test suite.
+
 Naming Tests
 ------------
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 4/4] read-cache: speed up index load through parallelization
  2018-09-06 21:03   ` [PATCH v3 4/4] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-07  4:16     ` Torsten Bögershausen
  2018-09-07 13:43       ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Torsten Bögershausen @ 2018-09-07  4:16 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart


> diff --git a/read-cache.c b/read-cache.c
> index fcc776aaf0..8537a55750 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
>  	return NULL;
>  }
>  
> +/*
> + * A helper function that will load the specified range of cache entries
> + * from the memory mapped file and add them to the given index.
> + */
> +static unsigned long load_cache_entry_block(struct index_state *istate,
> +			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
> +			unsigned long start_offset, struct strbuf *previous_name)
> +{
> +	int i;
> +	unsigned long src_offset = start_offset;

I read an unsigned long here:
should that be a size_t instead ?

(And probably even everywhere else in this patch)

> +
> +	for (i = offset; i < offset + nr; i++) {
> +		struct ondisk_cache_entry *disk_ce;
> +		struct cache_entry *ce;
> +		unsigned long consumed;
> +
> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
> +		set_index_entry(istate, i, ce);
> +
> +		src_offset += consumed;
> +	}
> +	return src_offset - start_offset;
> +}
> +
> +static unsigned long load_all_cache_entries(struct index_state *istate,
> +			void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	unsigned long consumed;
> +
> +	if (istate->version == 4) {
> +		previous_name = &previous_name_buf;
> +		mem_pool_init(&istate->ce_mem_pool,
> +				estimate_cache_size_from_compressed(istate->cache_nr));
> +	} else {
> +		previous_name = NULL;
> +		mem_pool_init(&istate->ce_mem_pool,
> +				estimate_cache_size(mmap_size, istate->cache_nr));
> +	}
> +
> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
> +					0, istate->cache_nr, mmap, src_offset, previous_name);
> +	strbuf_release(&previous_name_buf);
> +	return consumed;
> +}
> +
> +#ifndef NO_PTHREADS
> +

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 4/4] read-cache: speed up index load through parallelization
  2018-09-07  4:16     ` Torsten Bögershausen
@ 2018-09-07 13:43       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-07 13:43 UTC (permalink / raw)
  To: Torsten Bögershausen, Ben Peart; +Cc: git, gitster, pclouds, Ben Peart



On 9/7/2018 12:16 AM, Torsten Bögershausen wrote:
> 
>> diff --git a/read-cache.c b/read-cache.c
>> index fcc776aaf0..8537a55750 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
>>   	return NULL;
>>   }
>>   
>> +/*
>> + * A helper function that will load the specified range of cache entries
>> + * from the memory mapped file and add them to the given index.
>> + */
>> +static unsigned long load_cache_entry_block(struct index_state *istate,
>> +			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
>> +			unsigned long start_offset, struct strbuf *previous_name)
>> +{
>> +	int i;
>> +	unsigned long src_offset = start_offset;
> 
> I read an unsigned long here:
> should that be a size_t instead ?
> 
> (And probably even everywhere else in this patch)
> 

It's a fair question.  The pre-patch code had a mix of unsigned long and 
size_t.  Both src_offset and consumed were unsigned long but mmap_size 
was a size_t.  I stuck with that pattern for consistency.

While it would be possible to convert everything to size_t as a step to 
enable index files >4 GB, I have a hard time believing that will be 
necessary for a very long time and would likely require more substantial 
changes to enable that to work.

>> +
>> +	for (i = offset; i < offset + nr; i++) {
>> +		struct ondisk_cache_entry *disk_ce;
>> +		struct cache_entry *ce;
>> +		unsigned long consumed;
>> +
>> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
>> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
>> +		set_index_entry(istate, i, ce);
>> +
>> +		src_offset += consumed;
>> +	}
>> +	return src_offset - start_offset;
>> +}
>> +
>> +static unsigned long load_all_cache_entries(struct index_state *istate,
>> +			void *mmap, size_t mmap_size, unsigned long src_offset)
>> +{
>> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>> +	unsigned long consumed;
>> +
>> +	if (istate->version == 4) {
>> +		previous_name = &previous_name_buf;
>> +		mem_pool_init(&istate->ce_mem_pool,
>> +				estimate_cache_size_from_compressed(istate->cache_nr));
>> +	} else {
>> +		previous_name = NULL;
>> +		mem_pool_init(&istate->ce_mem_pool,
>> +				estimate_cache_size(mmap_size, istate->cache_nr));
>> +	}
>> +
>> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
>> +					0, istate->cache_nr, mmap, src_offset, previous_name);
>> +	strbuf_release(&previous_name_buf);
>> +	return consumed;
>> +}
>> +
>> +#ifndef NO_PTHREADS
>> +

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 0/4] read-cache: speed up index load through parallelization
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
                     ` (3 preceding siblings ...)
  2018-09-06 21:03   ` [PATCH v3 4/4] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-07 17:21   ` " Junio C Hamano
  2018-09-07 18:31     ` Ben Peart
  2018-09-08 13:18     ` Duy Nguyen
  4 siblings, 2 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-07 17:21 UTC (permalink / raw)
  To: Ben Peart; +Cc: git\, pclouds\, Ben Peart

Ben Peart <benpeart@microsoft.com> writes:

> On further investigation with the previous patch, I noticed that my test
> repos didn't contain the cache tree extension in their index. After doing a
> commit to ensure they existed, I realized that in some instances, the time
> to load the cache tree exceeded the time to load all the cache entries in
> parallel.  Because the thread to read the cache tree was started last (due
> to having to parse through all the cache entries first) we weren't always
> getting optimal performance.
>
> To better optimize for this case, I decided to write the EOIE extension
> as suggested by Junio [1] in response to my earlier multithreading patch
> series [2].  This enables me to spin up the thread to load the extensions
> earlier as it no longer has to parse through all the cache entries first.

Hmph. I kinda liked the simplicity of the previous one, but if we
need to start reading the extension sections sooner by eliminating
the overhead to scan the cache entries, perhaps we should bite the
bullet now.

> The big changes in this iteration are:
>
> - add the EOIE extension
> - update the index extension worker thread to start first

I guess I'd need to see the actual patch to find this out, but once
we rely on a new extension, then we could omit scanning the main
index even to partition the work among workers (i.e. like the topic
long ago, you can have list of pointers into the main index to help
partitioning, plus reset the prefix compression used in v4).  I
think you didn't get that far in this round, which is good.  If the
gain with EOIE alone (and starting the worker for the extension
section early) is large enough without such a pre-computed work
partition table, the simplicity of this round may give us a good
stopping point.

> This patch conflicts with Duy's patch to remove the double memory copy and
> pass in the previous ce instead.  The two will need to be merged/reconciled
> once they settle down a bit.

Thanks.  I have a feeling that 67922abb ("read-cache.c: optimize
reading index format v4", 2018-09-02) is already 'next'-worthy
and ready to be built on, but I'd prefer to hear from Duy to double
check.


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-06 21:03   ` [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-07 17:55     ` Junio C Hamano
  2018-09-07 20:23       ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-09-07 17:55 UTC (permalink / raw)
  To: Ben Peart; +Cc: git\, pclouds\, Ben Peart

Ben Peart <benpeart@microsoft.com> writes:

> The extension consists of:
>
> - 32-bit offset to the end of the index entries
>
> - 160-bit SHA-1 over the extension types and their sizes (but not
> their contents).  E.g. if we have "TREE" extension that is N-bytes
> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> then the hash would be:
>
> SHA-1("TREE" + <binary representation of N> +
> 	"REUC" + <binary representation of M>)

I didn't look at the documentation patch in the larger context, but
please make sure that it is clear to the readers that these fixed
width integers "binary representations" use network byte order.

I briefly wondered if the above should include

    + "EOIE" + <binary representation of (32+160)/8 = 24>

as it is pretty much common file format design to include the header
part of the checksum record (with checksum values padded out with NUL
bytes) when you define a record to hold the checksum of the entire
file.  Since this does not protect the contents of each section from
bit-flipping, adding the data for EOIE itself in the sum does not
give us much (iow, what I am adding above is a constant that does
not contribute any entropy).

How big is a typical TREE extension in _your_ work repository
housing Windows sources?  I am guessing that replacing SHA-1 with
something faster (as this is not about security but is about disk
corruption) and instead hash also the contents of these sections
would NOT help all that much in the performance department, as
having to page them in to read through would already consume a
non-trivial amount of time, and that is why you are not hashing the
contents.

> +	/*
> +	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1

s/SHA1/trailing checksum/ or something so that we can withstand
NewHash world order?

> +	 * so that it can be found and processed before all the index entries are
> +	 * read.
> +	 */
> +	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
> +		struct strbuf sb = STRBUF_INIT;
> +
> +		write_eoie_extension(&sb, &eoie_c, offset);
> +		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
>  			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
>  		strbuf_release(&sb);
>  		if (err)

OK.

> +#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
> +
> +#ifndef NO_PTHREADS
> +static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
> +{
> +	/*
> +	 * The end of index entries (EOIE) extension is guaranteed to be last
> +	 * so that it can be found by scanning backwards from the EOF.
> +	 *
> +	 * "EOIE"
> +	 * <4-byte length>
> +	 * <4-byte offset>
> +	 * <20-byte hash>
> +	 */
> +	const char *index, *eoie = (const char *)mmap + mmap_size - GIT_SHA1_RAWSZ - EOIE_SIZE_WITH_HEADER;
> +	uint32_t extsize;
> +	unsigned long offset, src_offset;
> +	unsigned char hash[GIT_MAX_RAWSZ];
> +	git_hash_ctx c;
> +
> +	/* validate the extension signature */
> +	index = eoie;
> +	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
> +		return 0;
> +	index += sizeof(uint32_t);
> +
> +	/* validate the extension size */
> +	extsize = get_be32(index);
> +	if (extsize != EOIE_SIZE)
> +		return 0;
> +	index += sizeof(uint32_t);

Do we know we have at least 8 bytes to consume to perform the above
two checks, or is that something we need to verify at the beginning
of this function?  Better yet, as we know that a correct EOIE with
its own header is 28 bytes long, we probably should abort if
mmap_size is smaller than that.

> +	/*
> +	 * Validate the offset we're going to look for the first extension
> +	 * signature is after the index header and before the eoie extension.
> +	 */
> +	offset = get_be32(index);
> +	if ((const char *)mmap + offset < (const char *)mmap + sizeof(struct cache_header))
> +		return 0;

Claims that the end is before the beginning, which we reject as bogus.  Good.

> +	if ((const char *)mmap + offset >= eoie)
> +		return 0;

Claims that the end is beyond the EOIE marker we should have placed
after it, which we reject as bogus.  Good.

> +	index += sizeof(uint32_t);
> +
> +	/*
> +	 * The hash is computed over extension types and their sizes (but not
> +	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
> +	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
> +	 * then the hash would be:
> +	 *
> +	 * SHA-1("TREE" + <binary representation of N> +
> +	 *               "REUC" + <binary representation of M>)
> +	 */
> +	src_offset = offset;
> +	the_hash_algo->init_fn(&c);
> +	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
> +		/* After an array of active_nr index entries,
(Style nit).
> +		 * there can be arbitrary number of extended
> +		 * sections, each of which is prefixed with
> +		 * extension name (4-byte) and section length
> +		 * in 4-byte network byte order.
> +		 */
> +		uint32_t extsize;
> +		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
> +		extsize = ntohl(extsize);

Earlier we were using get_be32() but now we use memcpy with ntohl()?
How are we choosing which one to use?

I think you meant to cast mmap to (const char *) here.  It may make it
easier to write and read if we started this function like so:

	static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
	{
		const char *mmap = mmap_;

then we do not have to keep casting mmap and cast to a wrong type by
mistake.

> +
> +		/* verify the extension size isn't so large it will wrap around */
> +		if (src_offset + 8 + extsize < src_offset)
> +			return 0;

Good.

> +		the_hash_algo->update_fn(&c, (const char *)mmap + src_offset, 8);
> +
> +		src_offset += 8;
> +		src_offset += extsize;
> +	}
> +	the_hash_algo->final_fn(hash, &c);
> +	if (hashcmp(hash, (unsigned char *)index))
> +		return 0;
> +
> +	/* Validate that the extension offsets returned us back to the eoie extension. */
> +	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
> +		return 0;

Very good.

> +	return offset;
> +}
> +#endif

Overall it looks like it is carefully done.
Thanks.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 0/4] read-cache: speed up index load through parallelization
  2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
@ 2018-09-07 18:31     ` Ben Peart
  2018-09-08 13:18     ` Duy Nguyen
  1 sibling, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-07 18:31 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds, Ben Peart



On 9/7/2018 1:21 PM, Junio C Hamano wrote:
> Ben Peart <benpeart@microsoft.com> writes:
> 
>> On further investigation with the previous patch, I noticed that my test
>> repos didn't contain the cache tree extension in their index. After doing a
>> commit to ensure they existed, I realized that in some instances, the time
>> to load the cache tree exceeded the time to load all the cache entries in
>> parallel.  Because the thread to read the cache tree was started last (due
>> to having to parse through all the cache entries first) we weren't always
>> getting optimal performance.
>>
>> To better optimize for this case, I decided to write the EOIE extension
>> as suggested by Junio [1] in response to my earlier multithreading patch
>> series [2].  This enables me to spin up the thread to load the extensions
>> earlier as it no longer has to parse through all the cache entries first.
> 
> Hmph. I kinda liked the simplicity of the previous one, but if we
> need to start reading the extension sections sooner by eliminating
> the overhead to scan the cache entries, perhaps we should bite the
> bullet now.
> 

I preferred the simplicity as well but when I was profiling the code and 
found out that loading the extensions was most often the last thread to 
complete, I took this intermediate step to speed things up.

>> The big changes in this iteration are:
>>
>> - add the EOIE extension
>> - update the index extension worker thread to start first
> 
> I guess I'd need to see the actual patch to find this out, but once
> we rely on a new extension, then we could omit scanning the main
> index even to partition the work among workers (i.e. like the topic
> long ago, you can have list of pointers into the main index to help
> partitioning, plus reset the prefix compression used in v4).  I
> think you didn't get that far in this round, which is good.  If the
> gain with EOIE alone (and starting the worker for the extension
> section early) is large enough without such a pre-computed work
> partition table, the simplicity of this round may give us a good
> stopping point.
> 

Agreed.  I didn't go that far in this series as it doesn't appear to be 
necessary.  We could always add that later if it turned out to be worth 
the additional complexity.

>> This patch conflicts with Duy's patch to remove the double memory copy and
>> pass in the previous ce instead.  The two will need to be merged/reconciled
>> once they settle down a bit.
> 
> Thanks.  I have a feeling that 67922abb ("read-cache.c: optimize
> reading index format v4", 2018-09-02) is already 'next'-worthy
> and ready to be built on, but I'd prefer to hear from Duy to double
> check.
> 

I'll take a closer look at what this will entail. It gets more 
complicated as we don't actually have a previous expanded cache entry 
when starting each thread.  I'll see how complex it makes the code and 
how much additional performance it gives.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-07 17:55     ` Junio C Hamano
@ 2018-09-07 20:23       ` Ben Peart
  2018-09-08  6:29         ` Martin Ågren
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-07 20:23 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds, Ben Peart



On 9/7/2018 1:55 PM, Junio C Hamano wrote:
> Ben Peart <benpeart@microsoft.com> writes:
> 
>> The extension consists of:
>>
>> - 32-bit offset to the end of the index entries
>>
>> - 160-bit SHA-1 over the extension types and their sizes (but not
>> their contents).  E.g. if we have "TREE" extension that is N-bytes
>> long, "REUC" extension that is M-bytes long, followed by "EOIE",
>> then the hash would be:
>>
>> SHA-1("TREE" + <binary representation of N> +
>> 	"REUC" + <binary representation of M>)
> 
> I didn't look at the documentation patch in the larger context, but
> please make sure that it is clear to the readers that these fixed
> width integers "binary representations" use network byte order.
> 

At the top of the documentation it says "All binary numbers are in 
network byte order" and that is not repeated for any of the other 
sections that are documenting the file format.

> I briefly wondered if the above should include
> 
>      + "EOIE" + <binary representation of (32+160)/8 = 24>
> 
> as it is pretty much common file format design to include the header
> part of the checksum record (with checksum values padded out with NUL
> bytes) when you define a record to hold the checksum of the entire
> file.  Since this does not protect the contents of each section from
> bit-flipping, adding the data for EOIE itself in the sum does not
> give us much (iow, what I am adding above is a constant that does
> not contribute any entropy).
> 
> How big is a typical TREE extension in _your_ work repository
> housing Windows sources?  I am guessing that replacing SHA-1 with
> something faster (as this is not about security but is about disk
> corruption) and instead hash also the contents of these sections
> would NOT help all that much in the performance department, as
> having to page them in to read through would already consume
> non-trivial amount of time, and that is why you are not hashing the
> contents.
> 

The purpose of the SHA isn't to detect disk corruption (we already have 
a SHA for the entire index that can serve that purpose) but to help 
ensure that this was actually a valid EOIE extension and not a lucky 
random set of bytes.  I had used leading and trailing signature bytes 
along with the length and version bytes to validate it was an actual 
EOIE extension but you suggested [1] that I use a SHA of the 4 byte 
extension type + 4 byte extension length instead so I rewrote it that way.

[1] 
https://public-inbox.org/git/xmqq1sl017dw.fsf@gitster.mtv.corp.google.com/

>> +	/*
>> +	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
> 
> s/SHA1/trailing checksum/ or something so that we can withstand
> NewHash world order?
> 

I thought about this but in the document elsewhere it refers to it as 
"160-bit SHA-1 over the content of the index file before this checksum." 
and there are at least a dozen other references to "SHA-1" so I figured 
we can fix them all at the same time when we have a new/better name. :-)

>> +	 * so that it can be found and processed before all the index entries are
>> +	 * read.
>> +	 */
>> +	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
>> +		struct strbuf sb = STRBUF_INIT;
>> +
>> +		write_eoie_extension(&sb, &eoie_c, offset);
>> +		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
>>   			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
>>   		strbuf_release(&sb);
>>   		if (err)
> 
> OK.
> 
>> +#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
>> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
>> +
>> +#ifndef NO_PTHREADS
>> +static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
>> +{
>> +	/*
>> +	 * The end of index entries (EOIE) extension is guaranteed to be last
>> +	 * so that it can be found by scanning backwards from the EOF.
>> +	 *
>> +	 * "EOIE"
>> +	 * <4-byte length>
>> +	 * <4-byte offset>
>> +	 * <20-byte hash>
>> +	 */
>> +	const char *index, *eoie = (const char *)mmap + mmap_size - GIT_SHA1_RAWSZ - EOIE_SIZE_WITH_HEADER;
>> +	uint32_t extsize;
>> +	unsigned long offset, src_offset;
>> +	unsigned char hash[GIT_MAX_RAWSZ];
>> +	git_hash_ctx c;
>> +
>> +	/* validate the extension signature */
>> +	index = eoie;
>> +	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
>> +		return 0;
>> +	index += sizeof(uint32_t);
>> +
>> +	/* validate the extension size */
>> +	extsize = get_be32(index);
>> +	if (extsize != EOIE_SIZE)
>> +		return 0;
>> +	index += sizeof(uint32_t);
> 
> Do we know we have at least 8-byte to consume to perform the above
> two checks, or is that something we need to verify at the beginning
> of this function?  Better yet, as we know that a correct EOIE with
> its own header is 28-byte long, we probably should abort if
> mmap_size is smaller than that.
> 

I'll add that additional test.

>> +	/*
>> +	 * Validate the offset we're going to look for the first extension
>> +	 * signature is after the index header and before the eoie extension.
>> +	 */
>> +	offset = get_be32(index);
>> +	if ((const char *)mmap + offset < (const char *)mmap + sizeof(struct cache_header))
>> +		return 0;
> 
> Claims that the end is before the beginning, which we reject as bogus.  Good.
> 
>> +	if ((const char *)mmap + offset >= eoie)
>> +		return 0;
> 
> Claims that the end is beyond the EOIE marker we should have placed
> after it, which we reject as bogus.  Good.
> 
>> +	index += sizeof(uint32_t);
>> +
>> +	/*
>> +	 * The hash is computed over extension types and their sizes (but not
>> +	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
>> +	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
>> +	 * then the hash would be:
>> +	 *
>> +	 * SHA-1("TREE" + <binary representation of N> +
>> +	 *               "REUC" + <binary representation of M>)
>> +	 */
>> +	src_offset = offset;
>> +	the_hash_algo->init_fn(&c);
>> +	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
>> +		/* After an array of active_nr index entries,
> (Style nit).
>> +		 * there can be arbitrary number of extended
>> +		 * sections, each of which is prefixed with
>> +		 * extension name (4-byte) and section length
>> +		 * in 4-byte network byte order.
>> +		 */
>> +		uint32_t extsize;
>> +		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
>> +		extsize = ntohl(extsize);
> 
> Earlier we were using get_be32() but now we use memcpy with ntohl()?
> How are we choosing which one to use?
> 

I literally copy/pasted this logic from the code that actually loads the 
extensions then removed the call to load the extension and replaced it 
with the call to update the hash.  I kept it the same to facilitate 
consistency for any future fixes or changes.

> I think you meant to cast mmap to (const char *) here.  It may make it
> easier to write and read if we started this function like so:
> 
> 	static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
> 	{
> 		const char *mmap = mmap_;
> 
> then we do not have to keep casting mmap and cast to a wrong type by
> mistake.
> 

Good suggestion.

>> +
>> +		/* verify the extension size isn't so large it will wrap around */
>> +		if (src_offset + 8 + extsize < src_offset)
>> +			return 0;
> 
> Good.
> 
>> +		the_hash_algo->update_fn(&c, (const char *)mmap + src_offset, 8);
>> +
>> +		src_offset += 8;
>> +		src_offset += extsize;
>> +	}
>> +	the_hash_algo->final_fn(hash, &c);
>> +	if (hashcmp(hash, (unsigned char *)index))
>> +		return 0;
>> +
>> +	/* Validate that the extension offsets returned us back to the eoie extension. */
>> +	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
>> +		return 0;
> 
> Very good.
> 
>> +	return offset;
>> +}
>> +#endif
> 
> Overall it looks like it is carefully done.

Thanks for the careful review!

> Thanks.
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 3/4] read-cache: load cache extensions on a worker thread
  2018-09-06 21:03   ` [PATCH v3 3/4] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-07 21:10     ` Junio C Hamano
  2018-09-08 14:56       ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-09-07 21:10 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, pclouds, Ben Peart

Ben Peart <benpeart@microsoft.com> writes:

> +struct load_index_extensions
> +{
> +#ifndef NO_PTHREADS
> +	pthread_t pthread;
> +#endif
> +	struct index_state *istate;
> +	void *mmap;
> +	size_t mmap_size;
> +	unsigned long src_offset;

If the file format only allows uint32_t on any platform, perhaps
this is better specified as uint32_t?  Or if this is offset into
a mmap'ed region of memory, size_t may be more appropriate.

Same comment applies to "extension_offset" we see below (which in
turn means the returned type of read_eoie_extension() function may
want to match).

> + };

Space before '}'??

> +
> +static void *load_index_extensions(void *_data)
> +{
> +	struct load_index_extensions *p = _data;

Perhaps we are being superstitious, but I think our code try to
avoid leading underscore when able, i.e.

	load_index_extensions(void *data_)
	{
		struct load_index_extensions *p = data;

> +	unsigned long src_offset = p->src_offset;
> +
> +	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> +		/* After an array of active_nr index entries,
> +		 * there can be arbitrary number of extended
> +		 * sections, each of which is prefixed with
> +		 * extension name (4-byte) and section length
> +		 * in 4-byte network byte order.
> +		 */
> +		uint32_t extsize;
> +		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
> +		extsize = ntohl(extsize);

The same "ntohl(), not get_be32()?" question as the one for the
previous step applies here, too.  I think the answer is "the
original was written that way" and that is acceptable, but once this
series lands, we may want to review the whole file and see if it is
worth making them consistent with a separate clean-up patch.

I think mmap() and munmap() are the only places that wants p->mmap
and mmap parameters passed around in various callchains to be of
type "void *"---I wonder if it is simpler to use "const char *"
throughout and only cast it to "void *" when necessary (I suspect
that there is nowhere we need to cast to or from "void *" explicitly
if we did so---assignment and argument passing would give us an
appropriate cast for free)?

> +		if (read_index_extension(p->istate,
> +			(const char *)p->mmap + src_offset,
> +			(char *)p->mmap + src_offset + 8,
> +			extsize) < 0) {
> +			munmap(p->mmap, p->mmap_size);
> +			die("index file corrupt");
> +		}
> +	...
> @@ -1907,6 +1951,11 @@ ...
> ...
> +	p.mmap = mmap;
> +	p.mmap_size = mmap_size;
> +
> +#ifndef NO_PTHREADS
> +	nr_threads = git_config_get_index_threads();
> +	if (!nr_threads)
> +		nr_threads = online_cpus();
> +
> +	if (nr_threads >= 2) {
> +		extension_offset = read_eoie_extension(mmap, mmap_size);
> +		if (extension_offset) {
> +			/* create a thread to load the index extensions */
> +			p.src_offset = extension_offset;
> +			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
> +				die(_("unable to create load_index_extensions_thread"));
> +		}
> +	}
> +#endif

Makes sense.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-07 20:23       ` Ben Peart
@ 2018-09-08  6:29         ` Martin Ågren
  2018-09-08 14:03           ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Martin Ågren @ 2018-09-08  6:29 UTC (permalink / raw)
  To: Ben Peart
  Cc: Junio C Hamano, Ben Peart, Git Mailing List,
	Nguyễn Thái Ngọc Duy, Ben Peart

On Fri, 7 Sep 2018 at 22:24, Ben Peart <peartben@gmail.com> wrote:
> > Ben Peart <benpeart@microsoft.com> writes:

> >> - 160-bit SHA-1 over the extension types and their sizes (but not
> >> their contents).  E.g. if we have "TREE" extension that is N-bytes
> >> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> >> then the hash would be:

> The purpose of the SHA isn't to detect disk corruption (we already have
> a SHA for the entire index that can serve that purpose) but to help
> ensure that this was actually a valid EOIE extension and not a lucky
> random set of bytes. [...]

> >> +#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */

> >> +    the_hash_algo->init_fn(&c);
> >> +    while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
[...]
> >> +    the_hash_algo->final_fn(hash, &c);
> >> +    if (hashcmp(hash, (unsigned char *)index))
> >> +            return 0;
> >> +
> >> +    /* Validate that the extension offsets returned us back to the eoie extension. */
> >> +    if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
> >> +            return 0;

Besides the issue you and Junio discussed with "should we document this
as being SHA-1 or NewHash" (or "the hash algo"), it seems to me that
this implementation is living somewhere between using SHA-1 and "the
hash algo". The hashing uses `the_hash_algo`, but the hash size is
hardcoded at 20 bytes.

Maybe it all works out, e.g., so that when someone (brian) merges a
NewHash and runs the testsuite, this will fail consistently and in a
safe way. But I wonder if it would be too hard to avoid the hardcoded 24
already now.

Martin

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 0/4] read-cache: speed up index load through parallelization
  2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
  2018-09-07 18:31     ` Ben Peart
@ 2018-09-08 13:18     ` Duy Nguyen
  1 sibling, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-09-08 13:18 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben Peart, Git Mailing List, Ben Peart

On Fri, Sep 7, 2018 at 7:21 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Ben Peart <benpeart@microsoft.com> writes:
>
> > On further investigation with the previous patch, I noticed that my test
> > repos didn't contain the cache tree extension in their index. After doing a
> > commit to ensure they existed, I realized that in some instances, the time
> > to load the cache tree exceeded the time to load all the cache entries in
> > parallel.  Because the thread to read the cache tree was started last (due
> > to having to parse through all the cache entries first) we weren't always
> > getting optimal performance.
> >
> > To better optimize for this case, I decided to write the EOIE extension
> > as suggested by Junio [1] in response to my earlier multithreading patch
> > series [2].  This enables me to spin up the thread to load the extensions
> > earlier as it no longer has to parse through all the cache entries first.
>
> Hmph. I kinda liked the simplicity of the previous one, but if we
> need to start reading the extension sections sooner by eliminating
> the overhead to scan the cache entries, perhaps we should bite the
> bullet now.

My view is slightly different. If we have to optimize might as well
try to squeeze the best out of it. Simplicity is already out of the
window at this point (but maintainability remains).

> > The big changes in this iteration are:
> >
> > - add the EOIE extension
> > - update the index extension worker thread to start first
>
> I guess I'd need to see the actual patch to find this out, but once
> we rely on a new extension, then we could omit scanning the main
> index even to partition the work among workers (i.e. like the topic
> long ago, you can have list of pointers into the main index to help
> partitioning, plus reset the prefix compression used in v4).  I
> think you didn't get that far in this round, which is good.  If the
> gain with EOIE alone (and starting the worker for the extension
> section early) is large enough without such a pre-computed work
> partition table, the simplicity of this round may give us a good
> stopping point.

I suspect the reduced gain in 1M files case compared to 100k files in
4/4 is because of scanning the index to split work to worker threads.
Since the index is now larger, the scanning takes more time before we
can start worker threads and we gain less from parallelization. I have
not experimented to see if this is true or there is something else.

> > This patch conflicts with Duy's patch to remove the double memory copy and
> > pass in the previous ce instead.  The two will need to be merged/reconciled
> > once they settle down a bit.
>
> Thanks.  I have a feeling that 67922abb ("read-cache.c: optimize
> reading index format v4", 2018-09-02) is already 'next'-worthy
> and ready to be built on, but I'd prefer to hear from Duy to double
> check.

Yes I think it's good. I ran the entire test suite with v4 just to
double check (and thinking of testing v4 version in travis too).
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-08  6:29         ` Martin Ågren
@ 2018-09-08 14:03           ` Ben Peart
  2018-09-08 17:08             ` Martin Ågren
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-08 14:03 UTC (permalink / raw)
  To: Martin Ågren
  Cc: Junio C Hamano, Ben Peart, Git Mailing List,
	Nguyễn Thái Ngọc Duy, Ben Peart



On 9/8/2018 2:29 AM, Martin Ågren wrote:
> On Fri, 7 Sep 2018 at 22:24, Ben Peart <peartben@gmail.com> wrote:
>>> Ben Peart <benpeart@microsoft.com> writes:
> 
>>>> - 160-bit SHA-1 over the extension types and their sizes (but not
>>>> their contents).  E.g. if we have "TREE" extension that is N-bytes
>>>> long, "REUC" extension that is M-bytes long, followed by "EOIE",
>>>> then the hash would be:
> 
>> The purpose of the SHA isn't to detect disk corruption (we already have
>> a SHA for the entire index that can serve that purpose) but to help
>> ensure that this was actually a valid EOIE extension and not a lucky
>> random set of bytes. [...]
> 
>>>> +#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
> 
>>>> +    the_hash_algo->init_fn(&c);
>>>> +    while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
> [...]
>>>> +    the_hash_algo->final_fn(hash, &c);
>>>> +    if (hashcmp(hash, (unsigned char *)index))
>>>> +            return 0;
>>>> +
>>>> +    /* Validate that the extension offsets returned us back to the eoie extension. */
>>>> +    if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
>>>> +            return 0;
> 
> Besides the issue you and Junio discussed with "should we document this
> as being SHA-1 or NewHash" (or "the hash algo"), it seems to me that
> this implementation is living somewhere between using SHA-1 and "the
> hash algo". The hashing uses `the_hash_algo`, but the hash size is
> hardcoded at 20 bytes.
> 
> Maybe it all works out, e.g., so that when someone (brian) merges a
> NewHash and runs the testsuite, this will fail consistently and in a
> safe way. But I wonder if it would be too hard to avoid the hardcoded 24
> already now.
> 
> Martin
> 

I can certainly change this to be:

#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ)

which should (hopefully) make it easier to find this hard coded hash 
length in the sea of hard coded "20" and "160" (bits) littered through 
the codebase.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 3/4] read-cache: load cache extensions on a worker thread
  2018-09-07 21:10     ` Junio C Hamano
@ 2018-09-08 14:56       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-08 14:56 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds, Ben Peart



On 9/7/2018 5:10 PM, Junio C Hamano wrote:
> Ben Peart <benpeart@microsoft.com> writes:
> 
>> +struct load_index_extensions
>> +{
>> +#ifndef NO_PTHREADS
>> +	pthread_t pthread;
>> +#endif
>> +	struct index_state *istate;
>> +	void *mmap;
>> +	size_t mmap_size;
>> +	unsigned long src_offset;
> 
> If the file format only allows uint32_t on any platform, perhaps
> this is better specified as uint32_t?  Or if this is offset into
> a mmap'ed region of memory, size_t may be more appropriate.
> 
> Same comment applies to "extension_offset" we see below (which in
> turn means the returned type of read_eoie_extension() function may
> want to match).
> 
>> + };
> 
> Space before '}'??
> 
>> +
>> +static void *load_index_extensions(void *_data)
>> +{
>> +	struct load_index_extensions *p = _data;
> 
> Perhaps we are being superstitious, but I think our code try to
> avoid leading underscore when able, i.e.
> 
> 	load_index_extensions(void *data_)
> 	{
> 		struct load_index_extensions *p = data;

That's what I get for copying code from elsewhere in the source. :-)

static void *preload_thread(void *_data)
{
	int nr;
	struct thread_data *p = _data;

since there isn't any need for the underscore at all, I'll just make it:

static void *load_index_extensions(void *data)
{
	struct load_index_extensions *p = data;

> 
>> +	unsigned long src_offset = p->src_offset;
>> +
>> +	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
>> +		/* After an array of active_nr index entries,
>> +		 * there can be arbitrary number of extended
>> +		 * sections, each of which is prefixed with
>> +		 * extension name (4-byte) and section length
>> +		 * in 4-byte network byte order.
>> +		 */
>> +		uint32_t extsize;
>> +		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
>> +		extsize = ntohl(extsize);
> 
> The same "ntohl(), not get_be32()?" question as the one for the
> previous step applies here, too.  I think the answer is "the
> original was written that way" and that is acceptable, but once this
> series lands, we may want to review the whole file and see if it is
> worth making them consistent with a separate clean-up patch.
> 

Makes sense, I'll add a cleanup patch to fix the inconsistency and have 
them use get_be32().

> I think mmap() and munmap() are the only places that wants p->mmap
> and mmap parameters passed around in various callchains to be of
> type "void *"---I wonder if it is simpler to use "const char *"
> throughout and only cast it to "void *" when necessary (I suspect
> that there is nowhere we need to cast to or from "void *" explicitly
> if we did so---assignment and argument passing would give us an
> appropriate cast for free)?

Sure, I'll add minimizing the casting to the clean up patch.

> 
>> +		if (read_index_extension(p->istate,
>> +			(const char *)p->mmap + src_offset,
>> +			(char *)p->mmap + src_offset + 8,
>> +			extsize) < 0) {
>> +			munmap(p->mmap, p->mmap_size);
>> +			die("index file corrupt");
>> +		}
>> +	...
>> @@ -1907,6 +1951,11 @@ ...
>> ...
>> +	p.mmap = mmap;
>> +	p.mmap_size = mmap_size;
>> +
>> +#ifndef NO_PTHREADS
>> +	nr_threads = git_config_get_index_threads();
>> +	if (!nr_threads)
>> +		nr_threads = online_cpus();
>> +
>> +	if (nr_threads >= 2) {
>> +		extension_offset = read_eoie_extension(mmap, mmap_size);
>> +		if (extension_offset) {
>> +			/* create a thread to load the index extensions */
>> +			p.src_offset = extension_offset;
>> +			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
>> +				die(_("unable to create load_index_extensions_thread"));
>> +		}
>> +	}
>> +#endif
> 
> Makes sense.
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-08 14:03           ` Ben Peart
@ 2018-09-08 17:08             ` Martin Ågren
  0 siblings, 0 replies; 153+ messages in thread
From: Martin Ågren @ 2018-09-08 17:08 UTC (permalink / raw)
  To: Ben Peart
  Cc: Junio C Hamano, Ben Peart, Git Mailing List,
	Nguyễn Thái Ngọc Duy, Ben Peart

On Sat, 8 Sep 2018 at 16:04, Ben Peart <peartben@gmail.com> wrote:
> On 9/8/2018 2:29 AM, Martin Ågren wrote:
> > Maybe it all works out, e.g., so that when someone (brian) merges a
> > NewHash and runs the testsuite, this will fail consistently and in a
> > safe way. But I wonder if it would be too hard to avoid the hardcoded 24
> > already now.
>
> I can certainly change this to be:
>
> #define EOIE_SIZE (4 + GIT_SHA1_RAWSZ)
>
> which should (hopefully) make it easier to find this hard coded hash
> length in the sea of hard coded "20" and "160" (bits) littered through
> the codebase.

Yeah, that seems more grep-friendly.

Martin

^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v4 0/5] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (3 preceding siblings ...)
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-11 23:26 ` " Ben Peart
  2018-09-11 23:26   ` [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (5 more replies)
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
                   ` (3 subsequent siblings)
  8 siblings, 6 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

This version of the patch merges in Duy's work to speed up index v4 decoding.
I had to massage it a bit to get it to work with the multi-threading but it's
still largely his code. It helps a little (3%-4%) when the cache entry thread(s)
take the longest and not when the index extensions loading is the long thread.

I also added a minor cleanup patch to minimize the casting required when
working with the memory mapped index and other minor changes based on the
feedback received.

Base Ref: v2.19.0
Web-Diff: https://github.com/benpeart/git/commit/9d31d5fb20
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v4 && git checkout 9d31d5fb20


### Patches

Ben Peart (4):
  eoie: add End of Index Entry (EOIE) extension
  read-cache: load cache extensions on a worker thread
  read-cache: speed up index load through parallelization
  read-cache: clean up casting and byte decoding

Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 Documentation/config.txt                 |   6 +
 Documentation/technical/index-format.txt |  23 +
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 581 +++++++++++++++++++----
 5 files changed, 531 insertions(+), 98 deletions(-)


base-commit: 1d4361b0f344188ab5eec6dcea01f61a3a3a1670
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-11 23:26   ` [PATCH v4 2/5] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 154 +++++++++++++++++++++--
 2 files changed, 169 insertions(+), 8 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the beginning of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..2abac0a7a2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+#endif
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	unsigned long offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2535,7 +2551,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2545,7 +2561,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2556,7 +2572,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2567,7 +2583,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.
+	 */
+	if (!strip_extensions && offset) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2978,3 +3010,109 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *mmap = mmap_;
+	const char *index, *eoie;
+	uint32_t extsize;
+	unsigned long offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
+	/* validate the extension signature */
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if (mmap + offset < mmap + sizeof(struct cache_header))
+		return 0;
+	if (mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *               "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (hashcmp(hash, (const unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+#endif
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v4 2/5] read-cache: load cache extensions on a worker thread
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
  2018-09-11 23:26   ` [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-11 23:26   ` [PATCH v4 3/5] read-cache: speed up index load through parallelization Ben Peart
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files                Baseline         Parallel Extensions
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

Test w/1,000,000 files              Baseline         Parallel Extensions
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |  6 +++
 config.c                 | 18 ++++++++
 config.h                 |  1 +
 read-cache.c             | 94 ++++++++++++++++++++++++++++++++--------
 4 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index eb66a11975..d0d8075978 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2400,6 +2400,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPUs and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 3461993f0a..f7ebf149fc 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+/*
+ * You can disable multi-threaded code by setting index.threads
+ * to 'false' (or 1)
+ */
+int git_config_get_index_threads(void)
+{
+	int is_bool, val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto-detect */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 2abac0a7a2..9b97c29f5b 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,10 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#ifndef NO_PTHREADS
+#include <pthread.h>
+#include <thread-utils.h>
+#endif
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1898,6 +1902,46 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	void *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+};
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+			(const char *)p->mmap + src_offset,
+			(char *)p->mmap + src_offset + 8,
+			extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1908,6 +1952,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	void *mmap;
 	size_t mmap_size;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_index_extensions p = { 0 };
+	unsigned long extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads >= 2) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			/* create a thread to load the index extensions */
+			p.src_offset = extension_offset;
+			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
+				die(_("unable to create load_index_extensions_thread"));
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
@@ -1970,23 +2039,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
-		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset && pthread_join(p.pthread, NULL))
+		die(_("unable to join load_index_extensions_thread"));
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap(mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v4 3/5] read-cache: speed up index load through parallelization
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
  2018-09-11 23:26   ` [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-09-11 23:26   ` [PATCH v4 2/5] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-11 23:26   ` [PATCH v4 4/5] read-cache.c: optimize reading index format v4 Ben Peart
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files                Baseline         Parallel entries
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

Test w/1,000,000 files              Baseline         Parallel entries
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 240 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 212 insertions(+), 28 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 9b97c29f5b..c01d34a71d 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1942,20 +1942,210 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool, istate->cache_nr * (sizeof(struct cache_entry) + CACHE_ENTRY_PATH_LENGTH));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool, estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+#define THREAD_COST		(10000)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int ce_per_thread;
+	unsigned long consumed;
+	int i, thread;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries. Exit the loop when we've created the final thread (no need
+	 * to parse the remaining entries).
+	 */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (i % ce_per_thread == 0) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+					estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+					estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+
+			/* exit the loop when we've created the last thread */
+			if (++thread == nr_threads)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int cpus, nr_threads;
 #endif
 
 	if (istate->initialized)
@@ -1997,10 +2187,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+	if (!nr_threads) {
+		cpus = online_cpus();
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
+
+	/* enable testing with fewer than default minimum of entries */
+	if (istate->cache_nr > 1 && nr_threads < 3 && git_env_bool("GIT_INDEX_THREADS_TEST", 0))
+		nr_threads = 3;
 
 	if (nr_threads >= 2) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2009,33 +2209,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			p.src_offset = extension_offset;
 			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
 				die(_("unable to create load_index_extensions_thread"));
+			nr_threads--;
 		}
 	}
+	if (nr_threads >= 2)
+		src_offset += load_cache_entries_threaded(nr_threads, istate, mmap, mmap_size, src_offset);
+	else
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 #endif
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v4 4/5] read-cache.c: optimize reading index format v4
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
                     ` (2 preceding siblings ...)
  2018-09-11 23:26   ` [PATCH v4 3/5] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-11 23:26   ` [PATCH v4 5/5] read-cache: clean up casting and byte decoding Ben Peart
  2018-09-12 14:34   ` [PATCH v4 0/5] read-cache: speed up index load through parallelization Ben Peart
  5 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags; if not, we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 read-cache.c | 136 +++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 65 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index c01d34a71d..d21ccb5e67 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1721,33 +1721,6 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
 /*
  * Adjacent cache entries tend to share the leading paths, so it makes
  * sense to only store the differences in later entries.  In the v4
@@ -1762,22 +1735,24 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
 
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1797,21 +1772,54 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (expand_name_field)
+			len += copy_len;
+	}
+
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
+
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1948,7 +1956,7 @@ static void *load_index_extensions(void *_data)
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
 			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
-			unsigned long start_offset, struct strbuf *previous_name)
+			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
 	unsigned long src_offset = start_offset;
@@ -1959,10 +1967,11 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
 	return src_offset - start_offset;
 }
@@ -1970,20 +1979,16 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 static unsigned long load_all_cache_entries(struct index_state *istate,
 			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	unsigned long consumed;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool, istate->cache_nr * (sizeof(struct cache_entry) + CACHE_ENTRY_PATH_LENGTH));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool, estimate_cache_size(mmap_size, istate->cache_nr));
 	}
 
 	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
-					0, istate->cache_nr, mmap, src_offset, previous_name);
-	strbuf_release(&previous_name_buf);
+					0, istate->cache_nr, mmap, src_offset, NULL);
 	return consumed;
 }
 
@@ -2005,8 +2010,7 @@ struct load_cache_entries_thread_data
 	int offset, nr;
 	void *mmap;
 	unsigned long start_offset;
-	struct strbuf previous_name_buf;
-	struct strbuf *previous_name;
+	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
@@ -2019,7 +2023,7 @@ static void *load_cache_entries_thread(void *_data)
 	struct load_cache_entries_thread_data *p = _data;
 
 	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
-		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_ce);
 	return NULL;
 }
 
@@ -2066,20 +2070,23 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 			p->istate = istate;
 			p->offset = i;
 			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+			p->mmap = mmap;
+			p->start_offset = src_offset;
 
 			/* create a mem_pool for each thread */
-			if (istate->version == 4)
+			if (istate->version == 4) {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size_from_compressed(p->nr));
-			else
+
+				/* create a previous ce entry for this block of cache entries */
+				if (previous_name->len) {
+					p->previous_ce = mem_pool__ce_alloc(p->ce_mem_pool, previous_name->len);
+					p->previous_ce->ce_namelen = previous_name->len;
+					memcpy(p->previous_ce->name, previous_name->buf, previous_name->len);
+				}
+			} else {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size(mmap_size, p->nr));
-
-			p->mmap = mmap;
-			p->start_offset = src_offset;
-			if (previous_name) {
-				strbuf_addbuf(&p->previous_name_buf, previous_name);
-				p->previous_name = &p->previous_name_buf;
 			}
 
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
@@ -2102,7 +2109,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		} else
 			name = ondisk->name;
 
-		if (!previous_name) {
+		if (istate->version != 4) {
 			size_t len;
 
 			/* v3 and earlier */
@@ -2121,7 +2128,6 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");
 		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
-		strbuf_release(&p->previous_name_buf);
 		consumed += p->consumed;
 	}
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v4 5/5] read-cache: clean up casting and byte decoding
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
                     ` (3 preceding siblings ...)
  2018-09-11 23:26   ` [PATCH v4 4/5] read-cache.c: optimize reading index format v4 Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-12 14:34   ` [PATCH v4 0/5] read-cache: speed up index load through parallelization Ben Peart
  5 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch does a clean up pass to minimize the casting required to work
with the memory mapped index (mmap).

It also makes the decoding of network byte order more consistent by using
get_be32() where possible.

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index d21ccb5e67..6220abc491 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1655,7 +1655,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1679,7 +1679,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1906,7 +1906,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 }
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
@@ -1916,14 +1916,14 @@ struct load_index_extensions
 	pthread_t pthread;
 #endif
 	struct index_state *istate;
-	void *mmap;
+	const char *mmap;
 	size_t mmap_size;
 	unsigned long src_offset;
 };
 
-static void *load_index_extensions(void *_data)
+static void *load_index_extensions(void *data)
 {
-	struct load_index_extensions *p = _data;
+	struct load_index_extensions *p = data;
 	unsigned long src_offset = p->src_offset;
 
 	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
@@ -1934,13 +1934,12 @@ static void *load_index_extensions(void *_data)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(p->mmap + src_offset + 4);
 		if (read_index_extension(p->istate,
-			(const char *)p->mmap + src_offset,
-			(char *)p->mmap + src_offset + 8,
+			p->mmap + src_offset,
+			p->mmap + src_offset + 8,
 			extsize) < 0) {
-			munmap(p->mmap, p->mmap_size);
+			munmap((void *)p->mmap, p->mmap_size);
 			die("index file corrupt");
 		}
 		src_offset += 8;
@@ -1955,7 +1954,7 @@ static void *load_index_extensions(void *_data)
  * from the memory mapped file and add them to the given index.
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
-			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
 			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
@@ -1966,7 +1965,7 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
 		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
@@ -1977,7 +1976,7 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 }
 
 static unsigned long load_all_cache_entries(struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	unsigned long consumed;
 
@@ -2008,7 +2007,7 @@ struct load_cache_entries_thread_data
 	struct index_state *istate;
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
-	void *mmap;
+	const char *mmap;
 	unsigned long start_offset;
 	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
@@ -2028,7 +2027,7 @@ static void *load_cache_entries_thread(void *_data)
 }
 
 static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_cache_entries_thread_data *data;
@@ -2097,7 +2096,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 				break;
 		}
 
-		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ondisk = (struct ondisk_cache_entry *)(mmap + src_offset);
 
 		/* On-disk flags are just 16 bits */
 		flags = get_be16(&ondisk->flags);
@@ -2145,8 +2144,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
@@ -2178,7 +2177,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -2238,11 +2237,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		p.src_offset = src_offset;
 		load_index_extensions(&p);
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
@@ -3265,7 +3264,7 @@ int should_validate_cache_entries(void)
 #define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size)
 {
 	/*
 	 * The end of index entries (EOIE) extension is guaranteed to be last
@@ -3276,7 +3275,6 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
 	 * <4-byte offset>
 	 * <20-byte hash>
 	 */
-	const char *mmap = mmap_;
 	const char *index, *eoie;
 	uint32_t extsize;
 	unsigned long offset, src_offset;
@@ -3329,8 +3327,7 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 
 		/* verify the extension size isn't so large it will wrap around */
 		if (src_offset + 8 + extsize < src_offset)
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v4 0/5] read-cache: speed up index load through parallelization
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
                     ` (4 preceding siblings ...)
  2018-09-11 23:26   ` [PATCH v4 5/5] read-cache: clean up casting and byte decoding Ben Peart
@ 2018-09-12 14:34   ` Ben Peart
  5 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-12 14:34 UTC (permalink / raw)
  To: Ben Peart, git; +Cc: gitster, pclouds, Ben Peart



On 9/11/2018 7:26 PM, Ben Peart wrote:
> This version of the patch merges in Duy's work to speed up index v4 decoding.
> I had to massage it a bit to get it to work with the multi-threading but it's
> still largely his code. It helps a little (3%-4%) when the cache entry thread(s)
> take the longest and not when the index extensions loading is the long thread.
> 
> I also added a minor cleanup patch to minimize the casting required when
> working with the memory mapped index and other minor changes based on the
> feedback received.
> 
> Base Ref: v2.19.0
> Web-Diff: https://github.com/benpeart/git/commit/9d31d5fb20
> Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v4 && git checkout 9d31d5fb20
> 
> 

A bad merge (mistake on my part, not a bug) means this is missing some 
of the changes from V3.  Please ignore, I'll send an updated series to 
address it.

> ### Patches
> 
> Ben Peart (4):
>    eoie: add End of Index Entry (EOIE) extension
>    read-cache: load cache extensions on a worker thread
>    read-cache: speed up index load through parallelization
>    read-cache: clean up casting and byte decoding
> 
> Nguyễn Thái Ngọc Duy (1):
>    read-cache.c: optimize reading index format v4
> 
>   Documentation/config.txt                 |   6 +
>   Documentation/technical/index-format.txt |  23 +
>   config.c                                 |  18 +
>   config.h                                 |   1 +
>   read-cache.c                             | 581 +++++++++++++++++++----
>   5 files changed, 531 insertions(+), 98 deletions(-)
> 
> 
> base-commit: 1d4361b0f344188ab5eec6dcea01f61a3a3a1670
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v5 0/5] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (4 preceding siblings ...)
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
@ 2018-09-12 16:18 ` " Ben Peart
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (4 more replies)
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
                   ` (2 subsequent siblings)
  8 siblings, 5 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

This version of the patch merges in Duy's work to speed up index v4 decoding.
I had to massage it a bit to get it to work with the multi-threading but it is
still largely his code. I also responded to Junio's feedback on initializing
copy_len to avoid compiler warnings.

I also added a minor cleanup patch to minimize the casting required when
working with the memory mapped index and other minor changes based on the
feedback received.

Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/dcf62005f8
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v5 && git checkout dcf62005f8


### Interdiff (v3..v5):

diff --git a/read-cache.c b/read-cache.c
index 8537a55750..c05e887fc9 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1655,7 +1655,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1679,7 +1679,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1721,33 +1721,6 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
 /*
  * Adjacent cache entries tend to share the leading paths, so it makes
  * sense to only store the differences in later entries.  In the v4
@@ -1768,15 +1741,18 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 	return (const char *)ep + 1 - cp_;
 }
 
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len = 0;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1796,21 +1772,50 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
+
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
 	if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+		len = strlen(name) + copy_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1897,7 +1902,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 }
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap, size_t mmap_size);
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
@@ -1907,14 +1912,14 @@ struct load_index_extensions
 	pthread_t pthread;
 #endif
 	struct index_state *istate;
-	void *mmap;
+	const char *mmap;
 	size_t mmap_size;
 	unsigned long src_offset;
 };
 
-static void *load_index_extensions(void *_data)
+static void *load_index_extensions(void *data)
 {
-	struct load_index_extensions *p = _data;
+	struct load_index_extensions *p = data;
 	unsigned long src_offset = p->src_offset;
 
 	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
@@ -1925,13 +1930,12 @@ static void *load_index_extensions(void *_data)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(p->mmap + src_offset + 4);
 		if (read_index_extension(p->istate,
-			(const char *)p->mmap + src_offset,
-			(char *)p->mmap + src_offset + 8,
+			p->mmap + src_offset,
+			p->mmap + src_offset + 8,
 			extsize) < 0) {
-			munmap(p->mmap, p->mmap_size);
+			munmap((void *)p->mmap, p->mmap_size);
 			die("index file corrupt");
 		}
 		src_offset += 8;
@@ -1946,8 +1950,8 @@ static void *load_index_extensions(void *_data)
  * from the memory mapped file and add them to the given index.
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
-			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
-			unsigned long start_offset, struct strbuf *previous_name)
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
+			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
 	unsigned long src_offset = start_offset;
@@ -1957,34 +1961,31 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
 	return src_offset - start_offset;
 }
 
 static unsigned long load_all_cache_entries(struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	unsigned long consumed;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size(mmap_size, istate->cache_nr));
 	}
 
 	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
-					0, istate->cache_nr, mmap, src_offset, previous_name);
-	strbuf_release(&previous_name_buf);
+					0, istate->cache_nr, mmap, src_offset, NULL);
 	return consumed;
 }
 
@@ -1993,7 +1994,7 @@ static unsigned long load_all_cache_entries(struct index_state *istate,
 /*
  * Mostly randomly chosen maximum thread counts: we
  * cap the parallelism to online_cpus() threads, and we want
- * to have at least 100000 cache entries per thread for it to
+ * to have at least 10000 cache entries per thread for it to
  * be worth starting a thread.
  */
 #define THREAD_COST		(10000)
@@ -2004,10 +2005,9 @@ struct load_cache_entries_thread_data
 	struct index_state *istate;
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
-	void *mmap;
+	const char *mmap;
 	unsigned long start_offset;
-	struct strbuf previous_name_buf;
-	struct strbuf *previous_name;
+	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
@@ -2020,12 +2020,12 @@ static void *load_cache_entries_thread(void *_data)
 	struct load_cache_entries_thread_data *p = _data;
 
 	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
-		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_ce);
 	return NULL;
 }
 
 static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_cache_entries_thread_data *data;
@@ -2067,20 +2067,23 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 			p->istate = istate;
 			p->offset = i;
 			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+			p->mmap = mmap;
+			p->start_offset = src_offset;
 
 			/* create a mem_pool for each thread */
-			if (istate->version == 4)
+			if (istate->version == 4) {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size_from_compressed(p->nr));
-			else
+
+				/* create a previous ce entry for this block of cache entries */
+				if (previous_name->len) {
+					p->previous_ce = mem_pool__ce_alloc(p->ce_mem_pool, previous_name->len);
+					p->previous_ce->ce_namelen = previous_name->len;
+					memcpy(p->previous_ce->name, previous_name->buf, previous_name->len);
+				}
+			} else {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size(mmap_size, p->nr));
-
-			p->mmap = mmap;
-			p->start_offset = src_offset;
-			if (previous_name) {
-				strbuf_addbuf(&p->previous_name_buf, previous_name);
-				p->previous_name = &p->previous_name_buf;
 			}
 
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
@@ -2091,7 +2094,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 				break;
 		}
 
-		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ondisk = (struct ondisk_cache_entry *)(mmap + src_offset);
 
 		/* On-disk flags are just 16 bits */
 		flags = get_be16(&ondisk->flags);
@@ -2103,7 +2106,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		} else
 			name = ondisk->name;
 
-		if (!previous_name) {
+		if (istate->version != 4) {
 			size_t len;
 
 			/* v3 and earlier */
@@ -2122,7 +2125,6 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");
 		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
-		strbuf_release(&p->previous_name_buf);
 		consumed += p->consumed;
 	}
 
@@ -2140,8 +2142,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
@@ -2173,7 +2175,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -2233,11 +2235,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		p.src_offset = src_offset;
 		load_index_extensions(&p);
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
@@ -3256,11 +3258,11 @@ int should_validate_cache_entries(void)
 	return validate_index_cache_entries;
 }
 
-#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
 #define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size)
 {
 	/*
 	 * The end of index entries (EOIE) extension is guaranteed to be last
@@ -3271,14 +3273,18 @@ static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
 	 * <4-byte offset>
 	 * <20-byte hash>
 	 */
-	const char *index, *eoie = (const char *)mmap + mmap_size - GIT_SHA1_RAWSZ - EOIE_SIZE_WITH_HEADER;
+	const char *index, *eoie;
 	uint32_t extsize;
 	unsigned long offset, src_offset;
 	unsigned char hash[GIT_MAX_RAWSZ];
 	git_hash_ctx c;
 
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
 	/* validate the extension signature */
-	index = eoie;
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
 	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
 		return 0;
 	index += sizeof(uint32_t);
@@ -3294,9 +3300,9 @@ static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
 	 * signature is after the index header and before the eoie extension.
 	 */
 	offset = get_be32(index);
-	if ((const char *)mmap + offset < (const char *)mmap + sizeof(struct cache_header))
+	if (mmap + offset < mmap + sizeof(struct cache_header))
 		return 0;
-	if ((const char *)mmap + offset >= eoie)
+	if (mmap + offset >= eoie)
 		return 0;
 	index += sizeof(uint32_t);
 
@@ -3319,20 +3325,19 @@ static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 
 		/* verify the extension size isn't so large it will wrap around */
 		if (src_offset + 8 + extsize < src_offset)
 			return 0;
 
-		the_hash_algo->update_fn(&c, (const char *)mmap + src_offset, 8);
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
 
 		src_offset += 8;
 		src_offset += extsize;
 	}
 	the_hash_algo->final_fn(hash, &c);
-	if (hashcmp(hash, (unsigned char *)index))
+	if (hashcmp(hash, (const unsigned char *)index))
 		return 0;
 
 	/* Validate that the extension offsets returned us back to the eoie extension. */
diff --git a/t/README b/t/README
index 59015f7150..69c695ad8e 100644
--- a/t/README
+++ b/t/README
@@ -326,9 +326,6 @@ valid due to the addition of the EOIE extension.
 
 GIT_TEST_INDEX_THREADS=<boolean> forces multi-threaded loading of
 the index cache entries and extensions for the whole test suite.
-Currently tests 1, 4-9 in t1700-split-index.sh fail as they hard
-code SHA values for the index which are no longer valid due to the
-addition of the EOIE extension.
 
 Naming Tests
 ------------


### Patches

Ben Peart (4):
  eoie: add End of Index Entry (EOIE) extension
  read-cache: load cache extensions on a worker thread
  read-cache: load cache entries on worker threads
  read-cache: clean up casting and byte decoding

Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 Documentation/config.txt                 |   6 +
 Documentation/technical/index-format.txt |  23 +
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 579 +++++++++++++++++++----
 t/README                                 |   8 +
 t/t1700-split-index.sh                   |   1 +
 7 files changed, 538 insertions(+), 98 deletions(-)


base-commit: 29d9e3e2c47dd4b5053b0a98c891878d398463e3
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  2018-09-13 22:44     ` Junio C Hamano
  2018-09-15 10:02     ` Duy Nguyen
  2018-09-12 16:18   ` [PATCH v5 2/5] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (3 subsequent siblings)
  4 siblings, 2 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 154 +++++++++++++++++++++--
 t/README                                 |   5 +
 t/t1700-split-index.sh                   |   1 +
 4 files changed, 175 insertions(+), 8 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the beginning of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..858935f123 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+#endif
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	unsigned long offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2535,7 +2551,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2545,7 +2561,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2556,7 +2572,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2567,7 +2583,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.
+	 */
+	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2978,3 +3010,109 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *mmap = mmap_;
+	const char *index, *eoie;
+	uint32_t extsize;
+	unsigned long offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
+	/* validate the extension signature */
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if (mmap + offset < mmap + sizeof(struct cache_header))
+		return 0;
+	if (mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *               "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (hashcmp(hash, (const unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+#endif
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
diff --git a/t/README b/t/README
index 9028b47d92..d8754dd23a 100644
--- a/t/README
+++ b/t/README
@@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncommon pack-objects code
 path where deltas larger than this limit require extra memory
 allocation for bookkeeping.
 
+GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
+This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
+as they currently hard code SHA values for the index which are no longer
+valid due to the addition of the EOIE extension.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 39133bcbc8..f613dd72e3 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,6 +7,7 @@ test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
+export GIT_TEST_DISABLE_EOIE=true
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  2018-09-15 10:22     ` Duy Nguyen
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files                Baseline         Parallel Extensions
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

Test w/1,000,000 files              Baseline         Parallel Extensions
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |  6 +++
 config.c                 | 18 ++++++++
 config.h                 |  1 +
 read-cache.c             | 94 ++++++++++++++++++++++++++++++++--------
 4 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1c42364988..79f8296d9c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2391,6 +2391,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPUs and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 9a0b10d4bc..9bd79fb165 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+/*
+ * You can disable multi-threaded code by setting index.threads
+ * to 'false' (or 1)
+ */
+int git_config_get_index_threads(void)
+{
+	int is_bool, val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto-detect */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 858935f123..b203eebb44 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,10 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#ifndef NO_PTHREADS
+#include <pthread.h>
+#include <thread-utils.h>
+#endif
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1898,6 +1902,46 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	void *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+};
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+			(const char *)p->mmap + src_offset,
+			(char *)p->mmap + src_offset + 8,
+			extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1908,6 +1952,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	void *mmap;
 	size_t mmap_size;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_index_extensions p = { 0 };
+	unsigned long extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads >= 2) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			/* create a thread to load the index extensions */
+			p.src_offset = extension_offset;
+			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
+				die(_("unable to create load_index_extensions_thread"));
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
@@ -1970,23 +2039,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
-		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset && pthread_join(p.pthread, NULL))
+		die(_("unable to join load_index_extensions_thread"));
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap(mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-09-12 16:18   ` [PATCH v5 2/5] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  2018-09-15 10:31     ` Duy Nguyen
                       ` (2 more replies)
  2018-09-12 16:18   ` [PATCH v5 4/5] read-cache.c: optimize reading index format v4 Ben Peart
  2018-09-12 16:18   ` [PATCH v5 5/5] read-cache: clean up casting and byte decoding Ben Peart
  4 siblings, 3 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files                Baseline         Parallel entries
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

Test w/1,000,000 files              Baseline         Parallel entries
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 242 +++++++++++++++++++++++++++++++++++++++++++++------
 t/README     |   3 +
 2 files changed, 217 insertions(+), 28 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index b203eebb44..880f627b4c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1942,20 +1942,212 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+#define THREAD_COST		(10000)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int ce_per_thread;
+	unsigned long consumed;
+	int i, thread;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries. Exit the loop when we've created the final thread (no need
+	 * to parse the remaining entries).
+	 */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (i % ce_per_thread == 0) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+					estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+					estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+
+			/* exit the loop when we've created the last thread */
+			if (++thread == nr_threads)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int cpus, nr_threads;
 #endif
 
 	if (istate->initialized)
@@ -1997,10 +2189,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+	if (!nr_threads) {
+		cpus = online_cpus();
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
+
+	/* enable testing with fewer than default minimum of entries */
+	if (istate->cache_nr > 1 && nr_threads < 3 && git_env_bool("GIT_TEST_INDEX_THREADS", 0))
+		nr_threads = 3;
 
 	if (nr_threads >= 2) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2009,33 +2211,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			p.src_offset = extension_offset;
 			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
 				die(_("unable to create load_index_extensions_thread"));
+			nr_threads--;
 		}
 	}
+	if (nr_threads >= 2)
+		src_offset += load_cache_entries_threaded(nr_threads, istate, mmap, mmap_size, src_offset);
+	else
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 #endif
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
diff --git a/t/README b/t/README
index d8754dd23a..69c695ad8e 100644
--- a/t/README
+++ b/t/README
@@ -324,6 +324,9 @@ This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
 as they currently hard code SHA values for the index which are no longer
 valid due to the addition of the EOIE extension.
 
+GIT_TEST_INDEX_THREADS=<boolean> forces multi-threaded loading of
+the index cache entries and extensions for the whole test suite.
+
 Naming Tests
 ------------
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v5 4/5] read-cache.c: optimize reading index format v4
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
                     ` (2 preceding siblings ...)
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  2018-09-12 16:18   ` [PATCH v5 5/5] read-cache: clean up casting and byte decoding Ben Peart
  4 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles the memory copying: we assemble the final path in a
   temporary buffer first before putting it back into a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 read-cache.c | 132 ++++++++++++++++++++++++++-------------------------
 1 file changed, 67 insertions(+), 65 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 880f627b4c..40dc4723b2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1721,33 +1721,6 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
 /*
  * Adjacent cache entries tend to share the leading paths, so it makes
  * sense to only store the differences in later entries.  In the v4
@@ -1762,22 +1735,24 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
 
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len = 0;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1797,21 +1772,50 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
+	if (len == CE_NAMEMASK)
+		len = strlen(name) + copy_len;
+
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1948,7 +1952,7 @@ static void *load_index_extensions(void *_data)
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
 			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
-			unsigned long start_offset, struct strbuf *previous_name)
+			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
 	unsigned long src_offset = start_offset;
@@ -1959,10 +1963,11 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
 	return src_offset - start_offset;
 }
@@ -1970,22 +1975,18 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 static unsigned long load_all_cache_entries(struct index_state *istate,
 			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	unsigned long consumed;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size(mmap_size, istate->cache_nr));
 	}
 
 	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
-					0, istate->cache_nr, mmap, src_offset, previous_name);
-	strbuf_release(&previous_name_buf);
+					0, istate->cache_nr, mmap, src_offset, NULL);
 	return consumed;
 }
 
@@ -2007,8 +2008,7 @@ struct load_cache_entries_thread_data
 	int offset, nr;
 	void *mmap;
 	unsigned long start_offset;
-	struct strbuf previous_name_buf;
-	struct strbuf *previous_name;
+	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
@@ -2021,7 +2021,7 @@ static void *load_cache_entries_thread(void *_data)
 	struct load_cache_entries_thread_data *p = _data;
 
 	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
-		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_ce);
 	return NULL;
 }
 
@@ -2068,20 +2068,23 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 			p->istate = istate;
 			p->offset = i;
 			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+			p->mmap = mmap;
+			p->start_offset = src_offset;
 
 			/* create a mem_pool for each thread */
-			if (istate->version == 4)
+			if (istate->version == 4) {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size_from_compressed(p->nr));
-			else
+
+				/* create a previous ce entry for this block of cache entries */
+				if (previous_name->len) {
+					p->previous_ce = mem_pool__ce_alloc(p->ce_mem_pool, previous_name->len);
+					p->previous_ce->ce_namelen = previous_name->len;
+					memcpy(p->previous_ce->name, previous_name->buf, previous_name->len);
+				}
+			} else {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size(mmap_size, p->nr));
-
-			p->mmap = mmap;
-			p->start_offset = src_offset;
-			if (previous_name) {
-				strbuf_addbuf(&p->previous_name_buf, previous_name);
-				p->previous_name = &p->previous_name_buf;
 			}
 
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
@@ -2104,7 +2107,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		} else
 			name = ondisk->name;
 
-		if (!previous_name) {
+		if (istate->version != 4) {
 			size_t len;
 
 			/* v3 and earlier */
@@ -2123,7 +2126,6 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");
 		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
-		strbuf_release(&p->previous_name_buf);
 		consumed += p->consumed;
 	}
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v5 5/5] read-cache: clean up casting and byte decoding
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
                     ` (3 preceding siblings ...)
  2018-09-12 16:18   ` [PATCH v5 4/5] read-cache.c: optimize reading index format v4 Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  4 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch does a clean up pass to minimize the casting required to work
with the memory mapped index (mmap).

It also makes the decoding of network byte order more consistent by using
get_be32() where possible.

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 40dc4723b2..c05e887fc9 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1655,7 +1655,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1679,7 +1679,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1902,7 +1902,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 }
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
@@ -1912,14 +1912,14 @@ struct load_index_extensions
 	pthread_t pthread;
 #endif
 	struct index_state *istate;
-	void *mmap;
+	const char *mmap;
 	size_t mmap_size;
 	unsigned long src_offset;
 };
 
-static void *load_index_extensions(void *_data)
+static void *load_index_extensions(void *data)
 {
-	struct load_index_extensions *p = _data;
+	struct load_index_extensions *p = data;
 	unsigned long src_offset = p->src_offset;
 
 	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
@@ -1930,13 +1930,12 @@ static void *load_index_extensions(void *_data)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(p->mmap + src_offset + 4);
 		if (read_index_extension(p->istate,
-			(const char *)p->mmap + src_offset,
-			(char *)p->mmap + src_offset + 8,
+			p->mmap + src_offset,
+			p->mmap + src_offset + 8,
 			extsize) < 0) {
-			munmap(p->mmap, p->mmap_size);
+			munmap((void *)p->mmap, p->mmap_size);
 			die("index file corrupt");
 		}
 		src_offset += 8;
@@ -1951,7 +1950,7 @@ static void *load_index_extensions(void *_data)
  * from the memory mapped file and add them to the given index.
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
-			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
 			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
@@ -1962,7 +1961,7 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
 		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
@@ -1973,7 +1972,7 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 }
 
 static unsigned long load_all_cache_entries(struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	unsigned long consumed;
 
@@ -2006,7 +2005,7 @@ struct load_cache_entries_thread_data
 	struct index_state *istate;
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
-	void *mmap;
+	const char *mmap;
 	unsigned long start_offset;
 	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
@@ -2026,7 +2025,7 @@ static void *load_cache_entries_thread(void *_data)
 }
 
 static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_cache_entries_thread_data *data;
@@ -2095,7 +2094,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 				break;
 		}
 
-		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ondisk = (struct ondisk_cache_entry *)(mmap + src_offset);
 
 		/* On-disk flags are just 16 bits */
 		flags = get_be16(&ondisk->flags);
@@ -2143,8 +2142,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
@@ -2176,7 +2175,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -2236,11 +2235,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		p.src_offset = src_offset;
 		load_index_extensions(&p);
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
@@ -3263,7 +3262,7 @@ int should_validate_cache_entries(void)
 #define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size)
 {
 	/*
 	 * The end of index entries (EOIE) extension is guaranteed to be last
@@ -3274,7 +3273,6 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
 	 * <4-byte offset>
 	 * <20-byte hash>
 	 */
-	const char *mmap = mmap_;
 	const char *index, *eoie;
 	uint32_t extsize;
 	unsigned long offset, src_offset;
@@ -3327,8 +3325,7 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 
 		/* verify the extension size isn't so large it will wrap around */
 		if (src_offset + 8 + extsize < src_offset)
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-13 22:44     ` Junio C Hamano
  2018-09-15 10:02     ` Duy Nguyen
  1 sibling, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-13 22:44 UTC (permalink / raw)
  To: Ben Peart; +Cc: git\, pclouds\, Ben Peart

Ben Peart <benpeart@microsoft.com> writes:

> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index 39133bcbc8..f613dd72e3 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>  # We need total control of index splitting here
>  sane_unset GIT_TEST_SPLIT_INDEX
>  sane_unset GIT_FSMONITOR_TEST
> +export GIT_TEST_DISABLE_EOIE=true
>  
>  test_expect_success 'enable split index' '
>  	git config splitIndex.maxPercentChange 100 &&

It is safer to squash the following in; we may want to revisit the
decision test-lint makes on this issue later, though.

-- >8 --
Subject: [PATCH] SQUASH???

http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#export

specifies how "export name[=word]" ought to work, but writing
"name=word; export name" is not much more cumbersome, and some
older shells that do not understand the former do grok the
latter.  test-lint also recommends spelling it this way.
---
 t/t1700-split-index.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index f613dd72e3..dab97c2187 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,7 +7,7 @@ test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
-export GIT_TEST_DISABLE_EOIE=true
+GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.19.0


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-09-13 22:44     ` Junio C Hamano
@ 2018-09-15 10:02     ` Duy Nguyen
  2018-09-17 14:54       ` Ben Peart
  1 sibling, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-15 10:02 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> The End of Index Entry (EOIE) is used to locate the end of the variable
> length index entries and the beginning of the extensions. Code can take
> advantage of this to quickly locate the index extensions without having
> to parse through all of the index entries.
>
> Because it must be able to be loaded before the variable length cache
> entries and other index extensions, this extension must be written last.
> The signature for this extension is { 'E', 'O', 'I', 'E' }.
>
> The extension consists of:
>
> - 32-bit offset to the end of the index entries
>
> - 160-bit SHA-1 over the extension types and their sizes (but not
> their contents).  E.g. if we have "TREE" extension that is N-bytes
> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> then the hash would be:
>
> SHA-1("TREE" + <binary representation of N> +
>         "REUC" + <binary representation of M>)
>
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---
>  Documentation/technical/index-format.txt |  23 ++++
>  read-cache.c                             | 154 +++++++++++++++++++++--
>  t/README                                 |   5 +
>  t/t1700-split-index.sh                   |   1 +
>  4 files changed, 175 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
> index db3572626b..6bc2d90f7f 100644
> --- a/Documentation/technical/index-format.txt
> +++ b/Documentation/technical/index-format.txt
> @@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
>
>    - An ewah bitmap, the n-th bit indicates whether the n-th index entry
>      is not CE_FSMONITOR_VALID.
> +
> +== End of Index Entry
> +
> +  The End of Index Entry (EOIE) is used to locate the end of the variable
> +  length index entries and the begining of the extensions. Code can take
> +  advantage of this to quickly locate the index extensions without having
> +  to parse through all of the index entries.
> +
> +  Because it must be able to be loaded before the variable length cache
> +  entries and other index extensions, this extension must be written last.
> +  The signature for this extension is { 'E', 'O', 'I', 'E' }.
> +
> +  The extension consists of:
> +
> +  - 32-bit offset to the end of the index entries
> +
> +  - 160-bit SHA-1 over the extension types and their sizes (but not
> +       their contents).  E.g. if we have "TREE" extension that is N-bytes
> +       long, "REUC" extension that is M-bytes long, followed by "EOIE",
> +       then the hash would be:
> +
> +       SHA-1("TREE" + <binary representation of N> +
> +               "REUC" + <binary representation of M>)
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..858935f123 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -43,6 +43,7 @@
>  #define CACHE_EXT_LINK 0x6c696e6b        /* "link" */
>  #define CACHE_EXT_UNTRACKED 0x554E5452   /* "UNTR" */
>  #define CACHE_EXT_FSMONITOR 0x46534D4E   /* "FSMN" */
> +#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945 /* "EOIE" */
>
>  /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
>  #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
> @@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
>         case CACHE_EXT_FSMONITOR:
>                 read_fsmonitor_extension(istate, data, sz);
>                 break;
> +       case CACHE_EXT_ENDOFINDEXENTRIES:
> +               /* already handled in do_read_index() */
> +               break;

Perhaps catch this extension when it's not written at the end (e.g. by
some other git implementation) and warn.

>         default:
>                 if (*ext < 'A' || 'Z' < *ext)
>                         return error("index uses %.4s extension, which we do not understand",
> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>         return ondisk_size + entries * per_entry;
>  }
>
> +#ifndef NO_PTHREADS
> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
> +#endif

Keep functions unconditionally built as much as possible. I don't see
why this read_eoie_extension() must be built only on multithread
platforms.

> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
> +
>  /* remember to discard_cache() before reading a different cache! */
>  int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  {
> @@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
>         return 0;
>  }
>
> -static int write_index_ext_header(git_hash_ctx *context, int fd,
> -                                 unsigned int ext, unsigned int sz)
> +static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
> +                                 int fd, unsigned int ext, unsigned int sz)
>  {
>         ext = htonl(ext);
>         sz = htonl(sz);
> +       if (eoie_context) {
> +               the_hash_algo->update_fn(eoie_context, &ext, 4);
> +               the_hash_algo->update_fn(eoie_context, &sz, 4);
> +       }
>         return ((ce_write(context, fd, &ext, 4) < 0) ||
>                 (ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
>  }
> @@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>  {
>         uint64_t start = getnanotime();
>         int newfd = tempfile->fd;
> -       git_hash_ctx c;
> +       git_hash_ctx c, eoie_c;
>         struct cache_header hdr;
>         int i, err = 0, removed, extended, hdr_version;
>         struct cache_entry **cache = istate->cache;
> @@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>         struct ondisk_cache_entry_extended ondisk;
>         struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>         int drop_cache_tree = istate->drop_cache_tree;
> +       unsigned long offset;
>
>         for (i = removed = extended = 0; i < entries; i++) {
>                 if (cache[i]->ce_flags & CE_REMOVE)
> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 return err;
>
>         /* Write extension data here */
> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> +       the_hash_algo->init_fn(&eoie_c);

Don't write (or even calculate to write it) unless it's needed. Which
means only do this when parallel reading is enabled and the index size
large enough, or when a test variable is set so you can force writing
this extension.

I briefly wondered if we should continue writing the extension if it's
already written. This way you can manually enable it with "git
update-index". But I don't think it's worth the complexity.

>         if (!strip_extensions && istate->split_index) {
>                 struct strbuf sb = STRBUF_INIT;
>
>                 err = write_link_extension(&sb, istate) < 0 ||
> -                       write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
> +                       write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
>                                                sb.len) < 0 ||
>                         ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2535,7 +2551,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 cache_tree_write(&sb, istate->cache_tree);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
>                 if (err)
> @@ -2545,7 +2561,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 resolve_undo_write(&sb, istate->resolve_undo);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
>                                              sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2556,7 +2572,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 write_untracked_extension(&sb, istate->untracked);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
>                                              sb.len) < 0 ||
>                         ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2567,7 +2583,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 write_fsmonitor_extension(&sb, istate);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
> +                       || ce_write(&c, newfd, sb.buf, sb.len) < 0;
> +               strbuf_release(&sb);
> +               if (err)
> +                       return -1;
> +       }
> +
> +       /*
> +        * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
> +        * so that it can be found and processed before all the index entries are
> +        * read.
> +        */
> +       if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
> +               struct strbuf sb = STRBUF_INIT;
> +
> +               write_eoie_extension(&sb, &eoie_c, offset);
> +               err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
>                 if (err)
> @@ -2978,3 +3010,109 @@ int should_validate_cache_entries(void)
>
>         return validate_index_cache_entries;
>  }
> +
> +#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
> +
> +#ifndef NO_PTHREADS
> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
> +{
> +       /*
> +        * The end of index entries (EOIE) extension is guaranteed to be last
> +        * so that it can be found by scanning backwards from the EOF.
> +        *
> +        * "EOIE"
> +        * <4-byte length>
> +        * <4-byte offset>
> +        * <20-byte hash>
> +        */
> +       const char *mmap = mmap_;
> +       const char *index, *eoie;
> +       uint32_t extsize;
> +       unsigned long offset, src_offset;
> +       unsigned char hash[GIT_MAX_RAWSZ];
> +       git_hash_ctx c;
> +
> +       /* ensure we have an index big enough to contain an EOIE extension */
> +       if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
> +               return 0;

All these "return 0" indicate an error in the EOIE extension. You
probably want to print some warning (it makes it much easier to track
down why parallel reading does not happen).

> +
> +       /* validate the extension signature */
> +       index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
> +       if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /* validate the extension size */
> +       extsize = get_be32(index);
> +       if (extsize != EOIE_SIZE)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /*
> +        * Validate the offset we're going to look for the first extension
> +        * signature is after the index header and before the eoie extension.
> +        */
> +       offset = get_be32(index);
> +       if (mmap + offset < mmap + sizeof(struct cache_header))
> +               return 0;
> +       if (mmap + offset >= eoie)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /*
> +        * The hash is computed over extension types and their sizes (but not
> +        * their contents).  E.g. if we have "TREE" extension that is N-bytes
> +        * long, "REUC" extension that is M-bytes long, followed by "EOIE",
> +        * then the hash would be:
> +        *
> +        * SHA-1("TREE" + <binary representation of N> +
> +        *               "REUC" + <binary representation of M>)
> +        */
> +       src_offset = offset;
> +       the_hash_algo->init_fn(&c);
> +       while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
> +               /* After an array of active_nr index entries,
> +                * there can be arbitrary number of extended
> +                * sections, each of which is prefixed with
> +                * extension name (4-byte) and section length
> +                * in 4-byte network byte order.
> +                */
> +               uint32_t extsize;
> +               memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
> +               extsize = ntohl(extsize);
> +
> +               /* verify the extension size isn't so large it will wrap around */
> +               if (src_offset + 8 + extsize < src_offset)
> +                       return 0;
> +
> +               the_hash_algo->update_fn(&c, mmap + src_offset, 8);
> +
> +               src_offset += 8;
> +               src_offset += extsize;
> +       }
> +       the_hash_algo->final_fn(hash, &c);
> +       if (hashcmp(hash, (const unsigned char *)index))
> +               return 0;
> +
> +       /* Validate that the extension offsets returned us back to the eoie extension. */
> +       if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
> +               return 0;
> +
> +       return offset;
> +}
> +#endif
> +
> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)

We normally just put function implementations before they are used to
avoid static forward declarations. Any special reason why it's not done
here?

> +{
> +       uint32_t buffer;
> +       unsigned char hash[GIT_MAX_RAWSZ];
> +
> +       /* offset */
> +       put_be32(&buffer, offset);
> +       strbuf_add(sb, &buffer, sizeof(uint32_t));
> +
> +       /* hash */
> +       the_hash_algo->final_fn(hash, eoie_context);
> +       strbuf_add(sb, hash, the_hash_algo->rawsz);
> +}
> diff --git a/t/README b/t/README
> index 9028b47d92..d8754dd23a 100644
> --- a/t/README
> +++ b/t/README
> @@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
>  path where deltas larger than this limit require extra memory
>  allocation for bookkeeping.
>
> +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
> +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed

I have a feeling that you won't have problems if you don't write eoie
extension by default in the first place. Then this could be switched
to GIT_TEST_ENABLE_EOIE instead. We may still have problem if both
eoie and split index are forced on when running through the test
suite, but that should be an easy fix.

> +as they currently hard code SHA values for the index which are no longer
> +valid due to the addition of the EOIE extension.
> +
>  Naming Tests
>  ------------
>
> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index 39133bcbc8..f613dd72e3 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>  # We need total control of index splitting here
>  sane_unset GIT_TEST_SPLIT_INDEX
>  sane_unset GIT_FSMONITOR_TEST
> +export GIT_TEST_DISABLE_EOIE=true
>
>  test_expect_success 'enable split index' '
>         git config splitIndex.maxPercentChange 100 &&
> --
> 2.18.0.windows.1
>


-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-12 16:18   ` [PATCH v5 2/5] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-15 10:22     ` Duy Nguyen
  2018-09-15 10:24       ` Duy Nguyen
                         ` (3 more replies)
  0 siblings, 4 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-09-15 10:22 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by loading
> the cache extensions on a worker thread in parallel with loading the cache
> entries.
>
> In some cases, loading the extensions takes longer than loading the
> cache entries so this patch utilizes the new EOIE to start the thread to
> load the extensions before loading all the cache entries in parallel.
>
> This is possible because the current extensions don't access the cache
> entries in the index_state structure so are OK that they don't all exist
> yet.
>
> The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
> extensions don't even get a pointer to the index so don't have access to the
> cache entries.
>
> CACHE_EXT_LINK only uses the index_state to initialize the split index.
> CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
> update and dirty flags.
>
> I used p0002-read-cache.sh to generate some performance data:
>
> Test w/100,000 files                Baseline         Parallel Extensions
> ---------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%
>
> Test w/1,000,000 files              Baseline         Parallel Extensions
> ------------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%
>
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---
>  Documentation/config.txt |  6 +++
>  config.c                 | 18 ++++++++
>  config.h                 |  1 +
>  read-cache.c             | 94 ++++++++++++++++++++++++++++++++--------
>  4 files changed, 102 insertions(+), 17 deletions(-)
>
> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 1c42364988..79f8296d9c 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -2391,6 +2391,12 @@ imap::
>         The configuration variables in the 'imap' section are described
>         in linkgit:git-imap-send[1].
>
> +index.threads::
> +       Specifies the number of threads to spawn when loading the index.
> +       This is meant to reduce index load time on multiprocessor machines.
> +       Specifying 0 or 'true' will cause Git to auto-detect the number of
> +       CPU's and set the number of threads accordingly. Defaults to 'true'.

I'd rather this variable defaults to 0. Spawning threads has an
associated cost and most projects out there are small enough that this
multi threading could just add more cost than gain. It only makes
sense to enable this on huge repos.

Wait, there's no way to disable this parallel reading? Does not sound
right. And if ordinary numbers mean the number of threads then 0
should mean no threading. Auto detection could have a new keyword,
like 'auto'.

> +
>  index.version::
>         Specify the version with which new index files should be
>         initialized.  This does not affect existing repositories.
> diff --git a/config.c b/config.c
> index 9a0b10d4bc..9bd79fb165 100644
> --- a/config.c
> +++ b/config.c
> @@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
>         return 0;
>  }
>
> +/*
> + * You can disable multi-threaded code by setting index.threads
> + * to 'false' (or 1)
> + */
> +int git_config_get_index_threads(void)
> +{
> +       int is_bool, val;
> +
> +       if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
> +               if (is_bool)
> +                       return val ? 0 : 1;
> +               else
> +                       return val;
> +       }
> +
> +       return 0; /* auto-detect */
> +}
> +
>  NORETURN
>  void git_die_config_linenr(const char *key, const char *filename, int linenr)
>  {
> diff --git a/config.h b/config.h
> index ab46e0165d..a06027e69b 100644
> --- a/config.h
> +++ b/config.h
> @@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
>  extern int git_config_get_split_index(void);
>  extern int git_config_get_max_percent_split_change(void);
>  extern int git_config_get_fsmonitor(void);
> +extern int git_config_get_index_threads(void);
>
>  /* This dies if the configured or default date is in the future */
>  extern int git_config_get_expiry(const char *key, const char **output);
> diff --git a/read-cache.c b/read-cache.c
> index 858935f123..b203eebb44 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -23,6 +23,10 @@
>  #include "split-index.h"
>  #include "utf8.h"
>  #include "fsmonitor.h"
> +#ifndef NO_PTHREADS
> +#include <pthread.h>
> +#include <thread-utils.h>
> +#endif

I don't think you're supposed to include system header files after
"cache.h". Including thread-utils.h should be enough (and it keeps the
exception of including pthread.h in just one place). Please use
"pthread-utils.h" instead of <pthread-utils.h> which is usually for
system header files. And include pthread-utils.h unconditionally.

>
>  /* Mask for the name length in ce_flags in the on-disk index */
>
> @@ -1898,6 +1902,46 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
>  #endif
>  static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
>
> +struct load_index_extensions
> +{
> +#ifndef NO_PTHREADS
> +       pthread_t pthread;
> +#endif
> +       struct index_state *istate;
> +       void *mmap;
> +       size_t mmap_size;
> +       unsigned long src_offset;
> +};
> +
> +static void *load_index_extensions(void *_data)
> +{
> +       struct load_index_extensions *p = _data;
> +       unsigned long src_offset = p->src_offset;
> +
> +       while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> +               /* After an array of active_nr index entries,
> +                * there can be arbitrary number of extended
> +                * sections, each of which is prefixed with
> +                * extension name (4-byte) and section length
> +                * in 4-byte network byte order.
> +                */
> +               uint32_t extsize;
> +               memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
> +               extsize = ntohl(extsize);
> +               if (read_index_extension(p->istate,
> +                       (const char *)p->mmap + src_offset,
> +                       (char *)p->mmap + src_offset + 8,
> +                       extsize) < 0) {
> +                       munmap(p->mmap, p->mmap_size);
> +                       die("index file corrupt");

_()

> +               }
> +               src_offset += 8;
> +               src_offset += extsize;
> +       }
> +
> +       return NULL;
> +}
> +
>  /* remember to discard_cache() before reading a different cache! */
>  int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  {
> @@ -1908,6 +1952,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>         void *mmap;
>         size_t mmap_size;
>         struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +       struct load_index_extensions p = { 0 };
> +       unsigned long extension_offset = 0;
> +#ifndef NO_PTHREADS
> +       int nr_threads;
> +#endif
>
>         if (istate->initialized)
>                 return istate->cache_nr;
> @@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>         istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
>         istate->initialized = 1;
>
> +       p.istate = istate;
> +       p.mmap = mmap;
> +       p.mmap_size = mmap_size;
> +
> +#ifndef NO_PTHREADS
> +       nr_threads = git_config_get_index_threads();
> +       if (!nr_threads)
> +               nr_threads = online_cpus();
> +
> +       if (nr_threads >= 2) {
> +               extension_offset = read_eoie_extension(mmap, mmap_size);
> +               if (extension_offset) {
> +                       /* create a thread to load the index extensions */

Pointless comment. It's pretty clear from the pthread_create() below
thanks to good function naming. Please remove.

> +                       p.src_offset = extension_offset;
> +                       if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
> +                               die(_("unable to create load_index_extensions_thread"));
> +               }
> +       }
> +#endif
> +
>         if (istate->version == 4) {
>                 previous_name = &previous_name_buf;
>                 mem_pool_init(&istate->ce_mem_pool,
> @@ -1970,23 +2039,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>         istate->timestamp.sec = st.st_mtime;
>         istate->timestamp.nsec = ST_MTIME_NSEC(st);
>
> -       while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
> -               /* After an array of active_nr index entries,
> -                * there can be arbitrary number of extended
> -                * sections, each of which is prefixed with
> -                * extension name (4-byte) and section length
> -                * in 4-byte network byte order.
> -                */
> -               uint32_t extsize;
> -               memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
> -               extsize = ntohl(extsize);
> -               if (read_index_extension(istate,
> -                                        (const char *) mmap + src_offset,
> -                                        (char *) mmap + src_offset + 8,
> -                                        extsize) < 0)
> -                       goto unmap;
> -               src_offset += 8;
> -               src_offset += extsize;
> +       /* if we created a thread, join it otherwise load the extensions on the primary thread */
> +#ifndef NO_PTHREADS
> +       if (extension_offset && pthread_join(p.pthread, NULL))
> +               die(_("unable to join load_index_extensions_thread"));

I guess the last _ is a typo and you wanted "unable to join
load_index_extensions thread". Please use die_errno() instead.

> +#endif
> +       if (!extension_offset) {
> +               p.src_offset = src_offset;
> +               load_index_extensions(&p);
>         }
>         munmap(mmap, mmap_size);
>         return istate->cache_nr;
> --
> 2.18.0.windows.1
>


-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:22     ` Duy Nguyen
@ 2018-09-15 10:24       ` Duy Nguyen
  2018-09-17 16:38         ` Ben Peart
  2018-09-15 16:23       ` Duy Nguyen
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-15 10:24 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Sat, Sep 15, 2018 at 12:22 PM Duy Nguyen <pclouds@gmail.com> wrote:
> > @@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
> >         istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
> >         istate->initialized = 1;
> >
> > +       p.istate = istate;
> > +       p.mmap = mmap;
> > +       p.mmap_size = mmap_size;
> > +
> > +#ifndef NO_PTHREADS
> > +       nr_threads = git_config_get_index_threads();
> > +       if (!nr_threads)
> > +               nr_threads = online_cpus();
> > +
> > +       if (nr_threads >= 2) {
> > +               extension_offset = read_eoie_extension(mmap, mmap_size);
> > +               if (extension_offset) {

One more thing I forgot. If the extension area is small enough, then
we should not need to create a thread to parse extensions in parallel.
We should know roughly how much work we need because we know the total
size of all extensions.

> > +                       /* create a thread to load the index extensions */
>
> Pointless comment. It's pretty clear from the pthread_create() below
> thanks to good function naming. Please remove.
>
> > +                       p.src_offset = extension_offset;
> > +                       if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
> > +                               die(_("unable to create load_index_extensions_thread"));
> > +               }
> > +       }
> > +#endif
> > +
> >         if (istate->version == 4) {
> >                 previous_name = &previous_name_buf;
> >                 mem_pool_init(&istate->ce_mem_pool,
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
@ 2018-09-15 10:31     ` Duy Nguyen
  2018-09-17 17:25       ` Ben Peart
  2018-09-15 11:07     ` Duy Nguyen
  2018-09-15 11:29     ` Duy Nguyen
  2 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-15 10:31 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by creating
> multiple threads to divide the work of loading and converting the cache
> entries across all available CPU cores.
>
> It accomplishes this by having the primary thread loop across the index file
> tracking the offset and (for V4 indexes) expanding the name. It creates a
> thread to process each block of entries as it comes to them.
>
> I used p0002-read-cache.sh to generate some performance data:
>
> Test w/100,000 files                Baseline         Parallel entries
> ---------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%
>
> Test w/1,000,000 files              Baseline         Parallel entries
> ------------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

The numbers here and the previous patch to load extensions in parallel
are exactly the same. What do these numbers mean? With both changes?
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
  2018-09-15 10:31     ` Duy Nguyen
@ 2018-09-15 11:07     ` Duy Nguyen
  2018-09-15 11:09       ` Duy Nguyen
  2018-09-15 11:29     ` Duy Nguyen
  2 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-15 11:07 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by creating
> multiple threads to divide the work of loading and converting the cache
> entries across all available CPU cores.
>
> It accomplishes this by having the primary thread loop across the index file
> tracking the offset and (for V4 indexes) expanding the name. It creates a
> thread to process each block of entries as it comes to them.

I added a couple trace_printf() to see how time is spent. This is with
a 1m entry index (basically my webkit.git index repeated 4 times)

12:50:00.084237 read-cache.c:1721       start loading index
12:50:00.119941 read-cache.c:1943       performance: 0.034778758 s:
loaded all extensions (1667075 bytes)
12:50:00.185352 read-cache.c:2029       performance: 0.100152079 s:
loaded 367110 entries
12:50:00.189683 read-cache.c:2126       performance: 0.104566615 s:
finished scanning all entries
12:50:00.217900 read-cache.c:2029       performance: 0.082309193 s:
loaded 367110 entries
12:50:00.259969 read-cache.c:2029       performance: 0.070257130 s:
loaded 367108 entries
12:50:00.263662 read-cache.c:2278       performance: 0.179344458 s:
read cache .git/index

Two observations:

- the extension thread finishes up quickly (this is with TREE
extension alone). We could use that spare core to parse some more
entries.

- the main "scanning and allocating" thread does hold up the two
remaining threads. You can see the first index entry thread is
finished even before the scanning thread. And this scanning thread
takes a lot of cpu.

If all index entry threads start at the same time, based on these
numbers we would be finished around 12:50:00.185352 mark, cutting
loading time by half.

Could you go back to your original solution? If you don't want to
spend more time on this, I offer to rewrite this patch.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-15 11:07     ` Duy Nguyen
@ 2018-09-15 11:09       ` Duy Nguyen
  2018-09-17 18:52         ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-15 11:09 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Sat, Sep 15, 2018 at 01:07:46PM +0200, Duy Nguyen wrote:
> 12:50:00.084237 read-cache.c:1721       start loading index
> 12:50:00.119941 read-cache.c:1943       performance: 0.034778758 s: loaded all extensions (1667075 bytes)
> 12:50:00.185352 read-cache.c:2029       performance: 0.100152079 s: loaded 367110 entries
> 12:50:00.189683 read-cache.c:2126       performance: 0.104566615 s: finished scanning all entries
> 12:50:00.217900 read-cache.c:2029       performance: 0.082309193 s: loaded 367110 entries
> 12:50:00.259969 read-cache.c:2029       performance: 0.070257130 s: loaded 367108 entries
> 12:50:00.263662 read-cache.c:2278       performance: 0.179344458 s: read cache .git/index

The previous mail wraps these lines and makes it a bit hard to read. Corrected now.

--
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
  2018-09-15 10:31     ` Duy Nguyen
  2018-09-15 11:07     ` Duy Nguyen
@ 2018-09-15 11:29     ` Duy Nguyen
  2 siblings, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-09-15 11:29 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>  #ifndef NO_PTHREADS
>         nr_threads = git_config_get_index_threads();
> -       if (!nr_threads)
> -               nr_threads = online_cpus();
> +       if (!nr_threads) {
> +               cpus = online_cpus();
> +               nr_threads = istate->cache_nr / THREAD_COST;
> +               if (nr_threads > cpus)
> +                       nr_threads = cpus;

It seems like overcommitting cpu does reduce time. With this patch
(and a 4 core system), I got

$ test-tool read-cache 100
real    0m36.270s
user    0m54.193s
sys     0m17.346s

if I force nr_threads to 9 (even though cpus is 4)

$ test-tool read-cache 100
real    0m33.592s
user    1m4.230s
sys     0m18.380s

Even though we use more cpus, real time is shorter. I guess these
threads still sleep a bit due to I/O and having more threads than
cores will utilize those idle cycles.
--
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:22     ` Duy Nguyen
  2018-09-15 10:24       ` Duy Nguyen
@ 2018-09-15 16:23       ` Duy Nguyen
  2018-09-17 17:19         ` Junio C Hamano
  2018-09-17 16:26       ` Ben Peart
  2018-09-17 21:32       ` Junio C Hamano
  3 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-15 16:23 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Sat, Sep 15, 2018 at 12:22 PM Duy Nguyen <pclouds@gmail.com> wrote:
> Wait there's no way to disable this parallel reading? Does not sound
> right. And  if ordinary numbers mean the number of threads then 0
> should mean no threading. Auto detection could have a new keyword,
> like 'auto'.

My bad. Disabling threading means _1_ thread. What was I thinking...
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-15 10:02     ` Duy Nguyen
@ 2018-09-17 14:54       ` Ben Peart
  2018-09-17 16:05         ` Duy Nguyen
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-17 14:54 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 6:02 AM, Duy Nguyen wrote:

>>          default:
>>                  if (*ext < 'A' || 'Z' < *ext)
>>                          return error("index uses %.4s extension, which we do not understand",
>> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>>          return ondisk_size + entries * per_entry;
>>   }
>>
>> +#ifndef NO_PTHREADS
>> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
>> +#endif
> 
> Keep functions unconditionally built as much as possible. I don't see
> why this read_eoie_extension() must be built only on multithread
> platforms.
> 

This is conditional to avoid generating a warning on single threaded 
platforms where the function is currently unused.  That seemed like a 
better choice than calling it and ignoring it on single threaded 
platforms just to avoid a compiler warning.

>> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
>> +
>>   /* remember to discard_cache() before reading a different cache! */
>>   int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>   {
>> @@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
>>          return 0;
>>   }
>>
>> -static int write_index_ext_header(git_hash_ctx *context, int fd,
>> -                                 unsigned int ext, unsigned int sz)
>> +static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
>> +                                 int fd, unsigned int ext, unsigned int sz)
>>   {
>>          ext = htonl(ext);
>>          sz = htonl(sz);
>> +       if (eoie_context) {
>> +               the_hash_algo->update_fn(eoie_context, &ext, 4);
>> +               the_hash_algo->update_fn(eoie_context, &sz, 4);
>> +       }
>>          return ((ce_write(context, fd, &ext, 4) < 0) ||
>>                  (ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
>>   }
>> @@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>   {
>>          uint64_t start = getnanotime();
>>          int newfd = tempfile->fd;
>> -       git_hash_ctx c;
>> +       git_hash_ctx c, eoie_c;
>>          struct cache_header hdr;
>>          int i, err = 0, removed, extended, hdr_version;
>>          struct cache_entry **cache = istate->cache;
>> @@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>          struct ondisk_cache_entry_extended ondisk;
>>          struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>>          int drop_cache_tree = istate->drop_cache_tree;
>> +       unsigned long offset;
>>
>>          for (i = removed = extended = 0; i < entries; i++) {
>>                  if (cache[i]->ce_flags & CE_REMOVE)
>> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>                  return err;
>>
>>          /* Write extension data here */
>> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
>> +       the_hash_algo->init_fn(&eoie_c);
> 
> Don't write (or even calculate to write it) unless it's needed. Which
> means only do this when parallel reading is enabled and the index size
> large enough, or when a test variable is set so you can force writing
> this extension.

I made the logic always write the extension based on the earlier 
discussion [1] where it was suggested this should have been part of the 
original index format for extensions from the beginning.  This helps 
ensure it is available for current and future uses we haven't even 
discovered yet.

[1] 
https://public-inbox.org/git/xmqqwp2s1h1x.fsf@gitster.mtv.corp.google.com/


>> +
>> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
> 
> We normally just put function implementations before it's used to
> avoid static forward declaration. Any special reason why it's not done
> here?
> 

This was done to promote readability of the (already large) read-cache.c 
file.  I first considered moving the EOIE read/write functions into a 
separate file entirely but they need access to information only 
available within read-cache.c so I compromised and moved them to the end 
of the file instead.

>> +{
>> +       uint32_t buffer;
>> +       unsigned char hash[GIT_MAX_RAWSZ];
>> +
>> +       /* offset */
>> +       put_be32(&buffer, offset);
>> +       strbuf_add(sb, &buffer, sizeof(uint32_t));
>> +
>> +       /* hash */
>> +       the_hash_algo->final_fn(hash, eoie_context);
>> +       strbuf_add(sb, hash, the_hash_algo->rawsz);
>> +}
>> diff --git a/t/README b/t/README
>> index 9028b47d92..d8754dd23a 100644
>> --- a/t/README
>> +++ b/t/README
>> @@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
>>   path where deltas larger than this limit require extra memory
>>   allocation for bookkeeping.
>>
>> +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
>> +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
> 
> I have a feeling that you won't have problems if you don't write eoie
> extension by default in the first place. Then this could be switched
> to GIT_TEST_ENABLE_EOIE instead. We may still have problem if both
> eoie and split index are forced on when running through the test
> suite, but that should be an easy fix.
> 
>> +as they currently hard code SHA values for the index which are no longer
>> +valid due to the addition of the EOIE extension.
>> +
>>   Naming Tests
>>   ------------
>>
>> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
>> index 39133bcbc8..f613dd72e3 100755
>> --- a/t/t1700-split-index.sh
>> +++ b/t/t1700-split-index.sh
>> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>>   # We need total control of index splitting here
>>   sane_unset GIT_TEST_SPLIT_INDEX
>>   sane_unset GIT_FSMONITOR_TEST
>> +export GIT_TEST_DISABLE_EOIE=true
>>
>>   test_expect_success 'enable split index' '
>>          git config splitIndex.maxPercentChange 100 &&
>> --
>> 2.18.0.windows.1
>>
> 
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-17 14:54       ` Ben Peart
@ 2018-09-17 16:05         ` Duy Nguyen
  2018-09-17 17:31           ` Junio C Hamano
  0 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-17 16:05 UTC (permalink / raw)
  To: Ben Peart; +Cc: Ben Peart, Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Sep 17, 2018 at 4:55 PM Ben Peart <peartben@gmail.com> wrote:
> On 9/15/2018 6:02 AM, Duy Nguyen wrote:
>
> >>          default:
> >>                  if (*ext < 'A' || 'Z' < *ext)
> >>                          return error("index uses %.4s extension, which we do not understand",
> >> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
> >>          return ondisk_size + entries * per_entry;
> >>   }
> >>
> >> +#ifndef NO_PTHREADS
> >> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
> >> +#endif
> >
> > Keep functions unconditionally built as much as possible. I don't see
> > why this read_eoie_extension() must be built only on multithread
> > platforms.
> >
>
> This is conditional to avoid generating a warning on single threaded
> platforms where the function is currently unused.  That seemed like a
> better choice than calling it and ignoring it on single threaded
> platforms just to avoid a compiler warning.

The third option is to ignore the compiler. I consider that warning a
helpful suggestion, not a strict rule.

Most devs don't run single thread builds (I think) so if this function
is updated in a way that breaks single thread mode, it can only be
found out when this function is used in single thread mode. At that
point the function may have changed a lot. If it's built
unconditionally, at least single thread users will yell up much sooner
and we could fix it much earlier.

> >> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
> >>                  return err;
> >>
> >>          /* Write extension data here */
> >> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> >> +       the_hash_algo->init_fn(&eoie_c);
> >
> > Don't write (or even calculate to write it) unless it's needed. Which
> > means only do this when parallel reading is enabled and the index size
> > large enough, or when a test variable is set so you can force writing
> > this extension.
>
> I made the logic always write the extension based on the earlier
> discussion [1] where it was suggested this should have been part of the
> original index format for extensions from the beginning.  This helps
> ensure it is available for current and future uses we haven't even
> discovered yet.

But it _is_ available now. If you need it, you write the extension
out. If we make this part of index version 5 (and make it not an
extension anymore) then I buy that argument. As it is, it's an
optional extension.

> [1] https://public-inbox.org/git/xmqqwp2s1h1x.fsf@gitster.mtv.corp.google.com/
>
>
> >> +
> >> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
> >
> > We normally just put function implementations before it's used to
> > avoid static forward declaration. Any special reason why it's not done
> > here?
> >
>
> This was done to promote readability of the (already large) read-cache.c
> file.  I first considered moving the EOIE read/write functions into a
> separate file entirely but they need access to information only
> available within read-cache.c so I compromised and moved them to the end
> of the file instead.

I consider grouping extension related functions closer to
read_index_extension gives better readability, or at least better than
just putting new functions at the end in no particular order. But I
guess this is personal view.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:22     ` Duy Nguyen
  2018-09-15 10:24       ` Duy Nguyen
  2018-09-15 16:23       ` Duy Nguyen
@ 2018-09-17 16:26       ` Ben Peart
  2018-09-17 16:45         ` Duy Nguyen
  2018-09-17 21:32       ` Junio C Hamano
  3 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-17 16:26 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 6:22 AM, Duy Nguyen wrote:
>> +index.threads::
>> +       Specifies the number of threads to spawn when loading the index.
>> +       This is meant to reduce index load time on multiprocessor machines.
>> +       Specifying 0 or 'true' will cause Git to auto-detect the number of
>> +       CPU's and set the number of threads accordingly. Defaults to 'true'.
> 
> I'd rather this variable defaults to 0. Spawning threads have
> associated cost and most projects out there are small enough that this
> multi threading could just add more cost than gain. It only makes
> sense to enable this on huge repos.
> 
> Wait there's no way to disable this parallel reading? Does not sound
> right. And  if ordinary numbers mean the number of threads then 0
> should mean no threading. Auto detection could have a new keyword,
> like 'auto'.
> 

The index.threads setting is patterned after the pack.threads setting 
for consistency.  Specifying 1 (or 'false') will disable multithreading 
but I will call that out explicitly in the documentation to make it more 
obvious.

The THREAD_COST logic is designed to ensure small repos don't incur more 
cost than gain.  If you have data on that logic that shows it isn't 
working properly, I'm happy to change the logic as necessary.

>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -23,6 +23,10 @@
>>   #include "split-index.h"
>>   #include "utf8.h"
>>   #include "fsmonitor.h"
>> +#ifndef NO_PTHREADS
>> +#include <pthread.h>
>> +#include <thread-utils.h>
>> +#endif
> 
> I don't think you're supposed to include system header files after
> "cache.h". Including thread-utils.h should be enough (and it keeps the
> exception of inclduing pthread.h in just one place). Please use
> "pthread-utils.h" instead of <pthread-utils.h> which is usually for
system header files. And include pthread-utils.h unconditionally.
> 

Thanks, I'll fix that.

>>
>>   /* Mask for the name length in ce_flags in the on-disk index */
>>
>> @@ -1898,6 +1902,46 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
>>   #endif
>>   static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
>>
>> +struct load_index_extensions
>> +{
>> +#ifndef NO_PTHREADS
>> +       pthread_t pthread;
>> +#endif
>> +       struct index_state *istate;
>> +       void *mmap;
>> +       size_t mmap_size;
>> +       unsigned long src_offset;
>> +};
>> +
>> +static void *load_index_extensions(void *_data)
>> +{
>> +       struct load_index_extensions *p = _data;
>> +       unsigned long src_offset = p->src_offset;
>> +
>> +       while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
>> +               /* After an array of active_nr index entries,
>> +                * there can be arbitrary number of extended
>> +                * sections, each of which is prefixed with
>> +                * extension name (4-byte) and section length
>> +                * in 4-byte network byte order.
>> +                */
>> +               uint32_t extsize;
>> +               memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
>> +               extsize = ntohl(extsize);
>> +               if (read_index_extension(p->istate,
>> +                       (const char *)p->mmap + src_offset,
>> +                       (char *)p->mmap + src_offset + 8,
>> +                       extsize) < 0) {
>> +                       munmap(p->mmap, p->mmap_size);
>> +                       die("index file corrupt");
> 
> _()
> 

Your feedback style can be a bit abrupt and terse.  I _think_ what you 
are trying to say here is that the "die" call should use the _() macro 
around the string.

This is an edit of the previous code that loaded index extensions and 
doesn't change the use of _(). I don't know the rules for when _() 
should be used and didn't have any luck finding where it was documented 
so left it unchanged.

FWIW, in this file alone there are 20 existing instances of die() or 
die_errno() and only two that use the _() macro.  A quick grep through 
the source code shows thousands of die() calls the vast majority of 
which do not use the _() macro.  This appears to be an area that is 
unclear and inconsistent and could use some attention in a separate patch.


>> +       /* if we created a thread, join it otherwise load the extensions on the primary thread */
>> +#ifndef NO_PTHREADS
>> +       if (extension_offset && pthread_join(p.pthread, NULL))
>> +               die(_("unable to join load_index_extensions_thread"));
> 
> I guess the last _ is a typo and you wanted "unable to join
> load_index_extensions thread". Please use die_errno() instead.
> 

Why should this be die_errno() here?  All other instances of 
pthread_join() failing in a fatal way use die(), not die_errno().

>> +#endif
>> +       if (!extension_offset) {
>> +               p.src_offset = src_offset;
>> +               load_index_extensions(&p);
>>          }
>>          munmap(mmap, mmap_size);
>>          return istate->cache_nr;
>> --
>> 2.18.0.windows.1
>>
> 
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:24       ` Duy Nguyen
@ 2018-09-17 16:38         ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-17 16:38 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 6:24 AM, Duy Nguyen wrote:
> On Sat, Sep 15, 2018 at 12:22 PM Duy Nguyen <pclouds@gmail.com> wrote:
>>> @@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>>          istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
>>>          istate->initialized = 1;
>>>
>>> +       p.istate = istate;
>>> +       p.mmap = mmap;
>>> +       p.mmap_size = mmap_size;
>>> +
>>> +#ifndef NO_PTHREADS
>>> +       nr_threads = git_config_get_index_threads();
>>> +       if (!nr_threads)
>>> +               nr_threads = online_cpus();
>>> +
>>> +       if (nr_threads >= 2) {
>>> +               extension_offset = read_eoie_extension(mmap, mmap_size);
>>> +               if (extension_offset) {
> 
> One more thing I forgot. If the extension area is small enough, then
> we should not need to create a thread to parse extensions in parallel.
> We should know roughly how much work we need because we know the total
> size of all extensions.
> 

The only extensions I found to be significant enough to be helped by a 
separate thread was the cache tree.  Since the size of the cache tree is 
driven by the number of files in the repo, I think the existing 
THREAD_COST logic (that comes in the next patch of the series) is a 
sufficient proxy.  Basically, if you have enough cache entries to be 
benefited by threading, your extensions (driven by the cache tree) are 
probably also big enough to warrant a thread.

>>> +                       /* create a thread to load the index extensions */
>>
>> Pointless comment. It's pretty clear from the pthread_create() below
>> thanks to good function naming. Please remove.
>>
>>> +                       p.src_offset = extension_offset;
>>> +                       if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
>>> +                               die(_("unable to create load_index_extensions_thread"));
>>> +               }
>>> +       }
>>> +#endif
>>> +
>>>          if (istate->version == 4) {
>>>                  previous_name = &previous_name_buf;
>>>                  mem_pool_init(&istate->ce_mem_pool,

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-17 16:26       ` Ben Peart
@ 2018-09-17 16:45         ` Duy Nguyen
  0 siblings, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-09-17 16:45 UTC (permalink / raw)
  To: Ben Peart; +Cc: Ben Peart, Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Sep 17, 2018 at 6:26 PM Ben Peart <peartben@gmail.com> wrote:
>
>
>
> On 9/15/2018 6:22 AM, Duy Nguyen wrote:
> >> +index.threads::
> >> +       Specifies the number of threads to spawn when loading the index.
> >> +       This is meant to reduce index load time on multiprocessor machines.
> >> +       Specifying 0 or 'true' will cause Git to auto-detect the number of
> >> +       CPU's and set the number of threads accordingly. Defaults to 'true'.
> >
> > I'd rather this variable defaults to 0. Spawning threads have
> > associated cost and most projects out there are small enough that this
> > multi threading could just add more cost than gain. It only makes
> > sense to enable this on huge repos.
> >
> > Wait there's no way to disable this parallel reading? Does not sound
> > right. And  if ordinary numbers mean the number of threads then 0
> > should mean no threading. Auto detection could have a new keyword,
> > like 'auto'.
> >
>
> The index.threads setting is patterned after the pack.threads setting
> for consistency.  Specifying 1 (or 'false') will disable multithreading
> but I will call that out explicitly in the documentation to make it more
> obvious.
>
> The THREAD_COST logic is designed to ensure small repos don't incur more
> cost than gain.  If you have data on that logic that shows it isn't
> working properly, I'm happy to change the logic as necessary.

THREAD_COST does not apply to this extension thread if I remember correctly.

> >> +static void *load_index_extensions(void *_data)
> >> +{
> >> +       struct load_index_extensions *p = _data;
> >> +       unsigned long src_offset = p->src_offset;
> >> +
> >> +       while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> >> +               /* After an array of active_nr index entries,
> >> +                * there can be arbitrary number of extended
> >> +                * sections, each of which is prefixed with
> >> +                * extension name (4-byte) and section length
> >> +                * in 4-byte network byte order.
> >> +                */
> >> +               uint32_t extsize;
> >> +               memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
> >> +               extsize = ntohl(extsize);
> >> +               if (read_index_extension(p->istate,
> >> +                       (const char *)p->mmap + src_offset,
> >> +                       (char *)p->mmap + src_offset + 8,
> >> +                       extsize) < 0) {
> >> +                       munmap(p->mmap, p->mmap_size);
> >> +                       die("index file corrupt");
> >
> > _()
> >
>
> You're feedback style can be a bit abrupt and terse.  I _think_ what you
> are trying to say here is that the "die" call should use the _() macro
> around the string.

Yes. Sorry I should have explained a bit better.

> This is an edit of the previous code that loaded index extensions and
> doesn't change the use of _(). I don't know the rules for when _()
> should be used and didn't have any luck finding where it was documented
> so left it unchanged.
>
> FWIW, in this file alone there are 20 existing instances of die() or
> die_errorno() and only two that use the _() macro.  A quick grep through
> the source code shows thousands of die() calls the vast majority of
> which do not use the _() macro.  This appears to be an area that is
> unclear and inconsistent and could use some attention in a separate patch.

This is one of the gray areas where we have to determine if the
message should be translated or not. And it should be translated
unless it's part of the plumbing output, to be consumed by scripts.

I know there's lots of messages still untranslated. I'm trying to do
something about that. But I cannot just go fix up all strings when you
all keep adding more strings for me to go fix. When you add a new
string, please consider if it should be translated or not. In this
case since it already receives reviewer attention we should be able to
determine it now, instead of delaying it for later.

> >> +       /* if we created a thread, join it otherwise load the extensions on the primary thread */
> >> +#ifndef NO_PTHREADS
> >> +       if (extension_offset && pthread_join(p.pthread, NULL))
> >> +               die(_("unable to join load_index_extensions_thread"));
> >
> > I guess the last _ is a typo and you wanted "unable to join
> > load_index_extensions thread". Please use die_errno() instead.
> >
>
> Why should this be die_errorno() here?  All other instances of
> pthread_join() failing in a fatal way use die(), not die_errorno().

That argument does not fly well in my opinion. I read the man page and
it listed the error codes, which made me think that we need to use
die_errno() to show the error. My mistake though is the error is
returned as the return value, not in errno, so die_errno() would not
catch it. But we could still do something like

    int ret = pthread_join();
    die(_("blah blah: %s"), strerror(ret));

Other code can also be improved, but that's a separate issue.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 16:23       ` Duy Nguyen
@ 2018-09-17 17:19         ` Junio C Hamano
  0 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-17 17:19 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> On Sat, Sep 15, 2018 at 12:22 PM Duy Nguyen <pclouds@gmail.com> wrote:
>> Wait there's no way to disable this parallel reading? Does not sound
>> right. And  if ordinary numbers mean the number of threads then 0
>> should mean no threading. Auto detection could have a new keyword,
>> like 'auto'.
>
> My bad. Disabling threading means _1_ thread. What was I thinking...

I did the same during my earlier review.  It seems that it somehow
is unintuitive to us that we do not specify how many _extra_ threads
of control we dedicate to ;-).

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-15 10:31     ` Duy Nguyen
@ 2018-09-17 17:25       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-17 17:25 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 6:31 AM, Duy Nguyen wrote:
> On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>>
>> This patch helps address the CPU cost of loading the index by creating
>> multiple threads to divide the work of loading and converting the cache
>> entries across all available CPU cores.
>>
>> It accomplishes this by having the primary thread loop across the index file
>> tracking the offset and (for V4 indexes) expanding the name. It creates a
>> thread to process each block of entries as it comes to them.
>>
>> I used p0002-read-cache.sh to generate some performance data:
>>
>> Test w/100,000 files                Baseline         Parallel entries
>> ---------------------------------------------------------------------------
>> read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%
>>
>> Test w/1,000,000 files              Baseline         Parallel entries
>> ------------------------------------------------------------------------------
>> read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%
> 
> The numbers here and the previous patch to load extensions in parallel
> are exactly the same. What do these numbers mean? With both changes?
> 

It means I messed up when creating my commit message for the extension 
patch and copy/pasted the wrong numbers.  Yes, these numbers are with 
both changes (the correct numbers for the extension only are not as good).

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-17 16:05         ` Duy Nguyen
@ 2018-09-17 17:31           ` Junio C Hamano
  2018-09-17 17:38             ` Duy Nguyen
  0 siblings, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-09-17 17:31 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> But it _is_ available now. If you need it, you write the extension
> out.

Are you arguing for making it omitted when it is not needed (e.g.
small enough index file)?  IOW, did you mean "If you do not need it,
you do not write it out" by the above?

I do not think overhead of writing (or preparing to write) the
extension for a small index file is by definition small enough ;-).

I do not think the configuration that decides if the reader side
uses parallel reading should have any say in the decision to write
(or omit) the extension, by the way.



^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-17 17:31           ` Junio C Hamano
@ 2018-09-17 17:38             ` Duy Nguyen
  2018-09-17 19:08               ` Junio C Hamano
  0 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-17 17:38 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben Peart, Ben Peart, Git Mailing List, Ben Peart

On Mon, Sep 17, 2018 at 7:31 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Duy Nguyen <pclouds@gmail.com> writes:
>
> > But it _is_ available now. If you need it, you write the extension
> > out.
>
> Are you arguing for making it omitted when it is not needed (e.g.
> small enough index file)?  IOW, did you mean "If you do not need it,
> you do not write it out" by the above?

Yes I did.

> I do not think overhead of writing (or preparing to write) the
> extension for a small index file is by definition small enough ;-).

Good point.

I get annoyed by the "ignoring unknown extension xxx" messages while
testing though (not just this extension) and I think it will be the
same for other git implementations. But perhaps other implementations
just silently drop the extension. Most of the extensions we have added
so far (except the ancient 'TREE') are optional and are probably not
present 99% of time when a different git impl reads an index created
by C Git. This 'EOIE' may be a good test then to see if they follow
the "ignore optional extensions" rule since it will always appear in
new C Git releases.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-15 11:09       ` Duy Nguyen
@ 2018-09-17 18:52         ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-17 18:52 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 7:09 AM, Duy Nguyen wrote:
> On Sat, Sep 15, 2018 at 01:07:46PM +0200, Duy Nguyen wrote:
>> 12:50:00.084237 read-cache.c:1721       start loading index
>> 12:50:00.119941 read-cache.c:1943       performance: 0.034778758 s: loaded all extensions (1667075 bytes)
>> 12:50:00.185352 read-cache.c:2029       performance: 0.100152079 s: loaded 367110 entries
>> 12:50:00.189683 read-cache.c:2126       performance: 0.104566615 s: finished scanning all entries
>> 12:50:00.217900 read-cache.c:2029       performance: 0.082309193 s: loaded 367110 entries
>> 12:50:00.259969 read-cache.c:2029       performance: 0.070257130 s: loaded 367108 entries
>> 12:50:00.263662 read-cache.c:2278       performance: 0.179344458 s: read cache .git/index
> 
> The previous mail wraps these lines and make it a bit hard to read. Corrected now.
> 
> --
> Duy
> 

Interesting!  Clearly the data shape makes a big difference here as I 
had run a similar test but in my case, the extensions thread actually 
finished last (and it's cost is what drove me to move that onto a 
separate thread that starts first).

Purpose	    			First	Last	Duration
load_index_extensions_thread	719.40	968.50	249.10
load_cache_entries_thread	718.89	738.65	19.76
load_cache_entries_thread	730.39	753.83	23.43
load_cache_entries_thread	741.23	751.23	10.00
load_cache_entries_thread	751.93	780.88	28.95
load_cache_entries_thread	763.60	791.31	27.72
load_cache_entries_thread	773.46	783.46	10.00
load_cache_entries_thread	783.96	794.28	10.32
load_cache_entries_thread	795.61	805.52	9.91
load_cache_entries_thread	805.99	827.21	21.22
load_cache_entries_thread	816.85	826.85	10.00
load_cache_entries_thread	827.03	837.96	10.93

In my tests, the scanning thread clearly delayed the later ce threads 
but given the extension was so slow, it didn't impact the overall time 
nearly as much as your case.

I completely agree that the optimal solution would be to go back to my 
original patch/design.  It eliminates the overhead of the scanning 
thread entirely and allows all threads to start at the same time. This 
would ensure the best performance whether the extensions were the 
longest thread or the cache entry threads took the longest.

I ran out of time and energy last year so dropped it to work on other 
tasks.  I appreciate your offer of help. Perhaps between the two of us 
we could successfully get it through the mailing list this time. :-) 
Let me go back and see what it would take to combine the current EOIE 
patch with the older IEOT patch.

I'm also intrigued with your observation that over committing the cpu 
actually results in time savings.  I hadn't tested that.  It looks like 
that could have a positive impact on the overall time and warrant a 
change to the default nr_threads logic.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-17 17:38             ` Duy Nguyen
@ 2018-09-17 19:08               ` Junio C Hamano
  0 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-17 19:08 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> I get annoyed by the "ignoring unknown extension xxx" messages while
> testing though (not just this extension) and I think it will be the
> same for other git implementations. But perhaps other implementations
> just silently drop the extension. Most of the extensions we have added
> so far (except the ancient 'TREE') are optional and are probably not

Most of the index extensions are optional, including TREE.  I think
"link" is the only one that the readers that do not understand it
are told to abort without causing damage.

> present 99% of time when a different git impl reads an index created
> by C Git. This 'EIOE' may be a good test then to see if they follow
> the "ignore optional extensions" rule since it will always appear in
> new C Git releases.

I think we probably should squelch "ignoring unknown" unless some
sort of GIT_TRACE/DEBUG switch is set.

Patches welcome ;-)

Thanks.


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:22     ` Duy Nguyen
                         ` (2 preceding siblings ...)
  2018-09-17 16:26       ` Ben Peart
@ 2018-09-17 21:32       ` Junio C Hamano
  3 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-17 21:32 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

>> diff --git a/read-cache.c b/read-cache.c
>> index 858935f123..b203eebb44 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -23,6 +23,10 @@
>>  #include "split-index.h"
>>  #include "utf8.h"
>>  #include "fsmonitor.h"
>> +#ifndef NO_PTHREADS
>> +#include <pthread.h>
>> +#include <thread-utils.h>
>> +#endif
>
> I don't think you're supposed to include system header files after
> "cache.h". Including thread-utils.h should be enough (and it keeps the
> exception of including pthread.h in just one place). Please use
> "pthread-utils.h" instead of <pthread-utils.h> which is usually for
> system header files. And include ptherad-utils.h unconditionally.

All correct except for s/p\(thread-utils\)/\1/g;
Sorry for missing this during my earlier review.

Thanks.



^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v6 0/7] speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (5 preceding siblings ...)
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
@ 2018-09-26 19:54 ` Ben Peart
  2018-09-26 19:54   ` [PATCH v6 1/7] read-cache.c: optimize reading index format v4 Ben Peart
                     ` (8 more replies)
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
  8 siblings, 9 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-26 19:54 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart


Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/a0300882d4
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v6 && git checkout a0300882d4


This iteration brings back the Index Entry Offset Table (IEOT) extension
which enables us to multi-thread the cache entry parsing without having
the primary thread have to scan all the entries first.  In cases where the
cache entry parsing is the most expensive part, this yields some additional
savings.

Using p0002-read-cache.sh to generate some performance numbers shows how
each of the various patches contribute to the overall performance win.


Test w/100,000 files    Baseline  Optimize V4    Extensions     Entries
----------------------------------------------------------------------------
0002.1: read_cache      22.36     18.74 -16.2%   18.64 -16.6%   12.63 -43.5%

Test w/1,000,000 files  Baseline  Optimize V4    Extensions     Entries
-----------------------------------------------------------------------------
0002.1: read_cache      304.40    270.70 -11.1%  195.50 -35.8%  204.82 -32.7%

Note that on the 1,000,000 files case, multi-threading the cache entry parsing
does not yield a performance win.  This is because the cost to parse the
index extensions in this repo far outweighs the cost of loading the cache
entries.

Name                            First    Last	  Elapsed	
load_index_extensions()		629.001  870.244  241.243	
load_cache_entries_thread()	683.911  723.199  39.288	
load_cache_entries_thread()	686.206  723.512  37.306	
load_cache_entries_thread()	686.43   722.596  36.166	
load_cache_entries_thread()	684.998  718.74   33.742	
load_cache_entries_thread()	685.035  718.698  33.663	
load_cache_entries_thread()	686.557  709.545  22.988	
load_cache_entries_thread()	684.533  703.536  19.003	
load_cache_entries_thread()	684.537  703.521  18.984	
load_cache_entries_thread()	685.062  703.774  18.712	
load_cache_entries_thread()	685.42   703.416  17.996	
load_cache_entries_thread()	648.604  664.496  15.892	
				
293.74 Total load_cache_entries_thread()

The high cost of parsing the index extensions is driven by the cache tree
and the untracked cache extensions. As this is currently the longest pole,
any reduction in this time will reduce the overall index load times so is
worth further investigation in another patch series.

Name                                    First    Last     Elapsed
|   + git!read_index_extension     	684.052  870.244  186.192
|    + git!cache_tree_read         	684.052  797.801  113.749
|    + git!read_untracked_extension	797.801  870.244  72.443

One option would be to load each extension on a separate thread but I
believe that is overkill for the vast majority of repos.  Instead, some
optimization of the loading code for these two extensions is probably worth
looking into as a quick examination shows that the bulk of the time for both
of them is spent in xcalloc().


### Patches

Ben Peart (6):
  read-cache: clean up casting and byte decoding
  eoie: add End of Index Entry (EOIE) extension
  config: add new index.threads config setting
  read-cache: load cache extensions on a worker thread
  ieot: add Index Entry Offset Table (IEOT) extension
  read-cache: load cache entries on worker threads

Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 Documentation/config.txt                 |   7 +
 Documentation/technical/index-format.txt |  41 ++
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 741 +++++++++++++++++++----
 t/README                                 |  10 +
 t/t1700-split-index.sh                   |   2 +
 7 files changed, 705 insertions(+), 115 deletions(-)


base-commit: fe8321ec057f9231c26c29b364721568e58040f7
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v6 1/7] read-cache.c: optimize reading index format v4
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
@ 2018-09-26 19:54   ` Ben Peart
  2018-09-26 19:54   ` [PATCH v6 2/7] read-cache: clean up casting and byte decoding Ben Peart
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-26 19:54 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds

From: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 128 ++++++++++++++++++++++++---------------------------
 1 file changed, 60 insertions(+), 68 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 8d04d78a58..583a4fb1f8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,63 +1713,24 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
-/*
- * Adjacent cache entries tend to share the leading paths, so it makes
- * sense to only store the differences in later entries.  In the v4
- * on-disk format of the index, each on-disk cache entry stores the
- * number of bytes to be stripped from the end of the previous name,
- * and the bytes to append to the result, to come up with its name.
- */
-static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
-{
-	const unsigned char *ep, *cp = (const unsigned char *)cp_;
-	size_t len = decode_varint(&cp);
-
-	if (name->len < len)
-		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
-	strbuf_add(name, cp, ep - cp);
-	return (const char *)ep + 1 - cp_;
-}
-
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct index_state *istate,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len;
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	int expand_name_field = istate->version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1789,21 +1750,54 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (expand_name_field)
+			len += copy_len;
+	}
+
+	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1898,7 +1892,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	const struct cache_entry *previous_ce = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,11 +1930,9 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
@@ -1952,12 +1944,12 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v6 2/7] read-cache: clean up casting and byte decoding
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
  2018-09-26 19:54   ` [PATCH v6 1/7] read-cache.c: optimize reading index format v4 Ben Peart
@ 2018-09-26 19:54   ` Ben Peart
  2018-09-26 19:54   ` [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-26 19:54 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch does a clean up pass to minimize the casting required to work
with the memory mapped index (mmap).

It also makes the decoding of network byte order more consistent by using
get_be32() where possible.

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 583a4fb1f8..6ba99e2c96 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1650,7 +1650,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1674,7 +1674,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1889,8 +1889,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd, i;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	const struct cache_entry *previous_ce = NULL;
 
@@ -1918,7 +1918,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -1943,7 +1943,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
 		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
@@ -1961,21 +1961,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
+					 mmap + src_offset,
+					 mmap + src_offset + 8,
 					 extsize) < 0)
 			goto unmap;
 		src_offset += 8;
 		src_offset += extsize;
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
  2018-09-26 19:54   ` [PATCH v6 1/7] read-cache.c: optimize reading index format v4 Ben Peart
  2018-09-26 19:54   ` [PATCH v6 2/7] read-cache: clean up casting and byte decoding Ben Peart
@ 2018-09-26 19:54   ` Ben Peart
  2018-09-28  0:19     ` SZEDER Gábor
                       ` (2 more replies)
  2018-09-26 19:54   ` [PATCH v6 4/7] config: add new index.threads config setting Ben Peart
                     ` (5 subsequent siblings)
  8 siblings, 3 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-26 19:54 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 151 +++++++++++++++++++++--
 t/README                                 |   5 +
 t/t1700-split-index.sh                   |   1 +
 4 files changed, 172 insertions(+), 8 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the beginning of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 6ba99e2c96..80255d3088 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1883,6 +1887,9 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2190,11 +2197,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2437,7 +2448,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2446,6 +2457,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	off_t offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2479,6 +2491,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
 		return -1;
 
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
 	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
 
 	for (i = 0; i < entries; i++) {
@@ -2512,11 +2525,14 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
+
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2527,7 +2543,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2537,7 +2553,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2548,7 +2564,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2559,7 +2575,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.
+	 */
+	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2975,3 +3007,106 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *index, *eoie;
+	uint32_t extsize;
+	size_t offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
+	/* validate the extension signature */
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if (mmap + offset < mmap + sizeof(struct cache_header))
+		return 0;
+	if (mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *	 "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (hashcmp(hash, (const unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
diff --git a/t/README b/t/README
index 3ea6c85460..aa33ac4f26 100644
--- a/t/README
+++ b/t/README
@@ -327,6 +327,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to
 be written after every 'git commit' command, and overrides the
 'core.commitGraph' setting to true.
 
+GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
+This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
+as they currently hard code SHA values for the index which are no longer
+valid due to the addition of the EOIE extension.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index be22398a85..1f168378c8 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,6 +7,7 @@ test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
+GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v6 4/7] config: add new index.threads config setting
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
                     ` (2 preceding siblings ...)
  2018-09-26 19:54   ` [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-26 19:54   ` Ben Peart
  2018-09-28  0:26     ` SZEDER Gábor
  2018-09-26 19:54   ` [PATCH v6 5/7] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (4 subsequent siblings)
  8 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-26 19:54 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

Add support for a new index.threads config setting which will be used to
control the threading code in do_read_index().  A value of 0 will tell the
index code to automatically determine the correct number of threads to use.
A value of 1 will make the code single threaded.  A value greater than 1
will set the maximum number of threads to use.

For testing purposes, this setting can be overwritten by setting the
GIT_TEST_INDEX_THREADS=<n> environment variable to a value greater than 0.

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |  7 +++++++
 config.c                 | 18 ++++++++++++++++++
 config.h                 |  1 +
 t/README                 |  5 +++++
 t/t1700-split-index.sh   |  1 +
 5 files changed, 32 insertions(+)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index ad0f4510c3..8fd973b76b 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2413,6 +2413,13 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+CPUs and set the number of threads accordingly. Specifying 1 or
+	'false' will disable multithreading. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 3461993f0a..2ee29f6f86 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+int git_config_get_index_threads(void)
+{
+	int is_bool, val = 0;
+
+	val = git_env_ulong("GIT_TEST_INDEX_THREADS", 0);
+	if (val)
+		return val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/t/README b/t/README
index aa33ac4f26..0fcecf4500 100644
--- a/t/README
+++ b/t/README
@@ -332,6 +332,11 @@ This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
 as they currently hard code SHA values for the index which are no longer
 valid due to the addition of the EOIE extension.
 
+GIT_TEST_INDEX_THREADS=<n> enables exercising the multi-threaded loading
+of the index for the whole test suite by bypassing the default number of
+cache entries and thread minimums. Setting this to 1 will make the
+index loading single threaded.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 1f168378c8..ab205954cf 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -8,6 +8,7 @@ test_description='split index mode tests'
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
 GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
+GIT_TEST_INDEX_THREADS=1; export GIT_TEST_INDEX_THREADS
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v6 5/7] read-cache: load cache extensions on a worker thread
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
                     ` (3 preceding siblings ...)
  2018-09-26 19:54   ` [PATCH v6 4/7] config: add new index.threads config setting Ben Peart
@ 2018-09-26 19:54   ` Ben Peart
  2018-09-26 19:54   ` [PATCH v6 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-26 19:54 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

	Test w/100,000 files reduced the time by 0.53%
	Test w/1,000,000 files reduced the time by 27.78%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 97 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 81 insertions(+), 16 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 80255d3088..8da21c9273 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,7 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#include "thread-utils.h"
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1890,6 +1891,46 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	const char *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+};
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+			p->mmap + src_offset,
+			p->mmap + src_offset + 8,
+			extsize) < 0) {
+			munmap((void *)p->mmap, p->mmap_size);
+			die(_("index file corrupt"));
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1900,6 +1941,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	const char *mmap;
 	size_t mmap_size;
 	const struct cache_entry *previous_ce = NULL;
+	struct load_index_extensions p;
+	size_t extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,6 +1982,30 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads > 1) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			int err;
+
+			p.src_offset = extension_offset;
+			err = pthread_create(&p.pthread, NULL, load_index_extensions, &p);
+			if (err)
+				die(_("unable to create load_index_extensions thread: %s"), strerror(err));
+
+			nr_threads--;
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
@@ -1960,22 +2030,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		extsize = get_be32(mmap + src_offset + 4);
-		if (read_index_extension(istate,
-					 mmap + src_offset,
-					 mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset) {
+		int ret = pthread_join(p.pthread, NULL);
+		if (ret)
+			die(_("unable to join load_index_extensions thread: %s"), strerror(ret));
+	}
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v6 6/7] ieot: add Index Entry Offset Table (IEOT) extension
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
                     ` (4 preceding siblings ...)
  2018-09-26 19:54   ` [PATCH v6 5/7] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-26 19:54   ` Ben Peart
  2018-09-26 19:54   ` [PATCH v6 7/7] read-cache: load cache entries on worker threads Ben Peart
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-26 19:54 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

This patch enables addressing the CPU cost of loading the index by adding
additional data to the index that will allow us to efficiently multi-
thread the loading and conversion of cache entries.

It accomplishes this by adding an (optional) index extension that is a
table of offsets to blocks of cache entries in the index file.  To make
this work for V4 indexes, when writing the cache entries, it periodically
"resets" the prefix-compression by encoding the current entry as if the
path name for the previous entry is completely different and saves the
offset of that entry in the IEOT.  Basically, with V4 indexes, it
generates offsets into blocks of prefix-compressed entries.

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 Documentation/technical/index-format.txt |  18 +++
 read-cache.c                             | 166 +++++++++++++++++++++++
 2 files changed, 184 insertions(+)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index 6bc2d90f7f..7c4d67aa6a 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -337,3 +337,21 @@ The remaining data of each directory block is grouped by type:
 
 	SHA-1("TREE" + <binary representation of N> +
 		"REUC" + <binary representation of M>)
+
+== Index Entry Offset Table
+
+  The Index Entry Offset Table (IEOT) is used to help address the CPU
+  cost of loading the index by enabling multi-threading the process of
+  converting cache entries from the on-disk format to the in-memory format.
+  The signature for this extension is { 'I', 'E', 'O', 'T' }.
+
+  The extension consists of:
+
+  - 32-bit version (currently 1)
+
+  - A number of index offset entries each consisting of:
+
+    - 32-bit offset from the beginning of the file to the first cache entry
+	in this block of entries.
+
+    - 32-bit count of cache entries in this block
diff --git a/read-cache.c b/read-cache.c
index 8da21c9273..9b0554d4e6 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -45,6 +45,7 @@
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
 #define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
+#define CACHE_EXT_INDEXENTRYOFFSETTABLE 0x49454F54 /* "IEOT" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1696,6 +1697,7 @@ static int read_index_extension(struct index_state *istate,
 		read_fsmonitor_extension(istate, data, sz);
 		break;
 	case CACHE_EXT_ENDOFINDEXENTRIES:
+	case CACHE_EXT_INDEXENTRYOFFSETTABLE:
 		/* already handled in do_read_index() */
 		break;
 	default:
@@ -1888,6 +1890,23 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+struct index_entry_offset
+{
+	/* starting byte offset into index file, count of index entries in this block */
+	int offset, nr;
+};
+
+struct index_entry_offset_table
+{
+	int nr;
+	struct index_entry_offset entries[0];
+};
+
+#ifndef NO_PTHREADS
+static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset);
+static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot);
+#endif
+
 static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
 
@@ -1931,6 +1950,15 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+
+#define THREAD_COST		(10000)
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2523,6 +2551,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
 	off_t offset;
+	int ieot_work = 1;
+	struct index_entry_offset_table *ieot = NULL;
+	int nr;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2556,7 +2587,33 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
 		return -1;
 
+#ifndef NO_PTHREADS
+	if (!strip_extensions && (nr = git_config_get_index_threads()) != 1) {
+		int ieot_blocks, cpus;
+
+		/*
+		 * ensure default number of ieot blocks maps evenly to the
+		 * default number of threads that will process them
+		 */
+		if (!nr) {
+			ieot_blocks = istate->cache_nr / THREAD_COST;
+			if (ieot_blocks < 1)
+				ieot_blocks = 1;
+			cpus = online_cpus();
+			if (ieot_blocks > cpus - 1)
+				ieot_blocks = cpus - 1;
+		} else {
+			ieot_blocks = nr;
+		}
+		ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
+			+ (ieot_blocks * sizeof(struct index_entry_offset)));
+		ieot->nr = 0;
+		ieot_work = DIV_ROUND_UP(entries, ieot_blocks);
+	}
+#endif
+
 	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	nr = 0;
 	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
 
 	for (i = 0; i < entries; i++) {
@@ -2578,11 +2635,31 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 
 			drop_cache_tree = 1;
 		}
+		if (ieot && i && (i % ieot_work == 0)) {
+			ieot->entries[ieot->nr].nr = nr;
+			ieot->entries[ieot->nr].offset = offset;
+			ieot->nr++;
+			/*
+			 * If we have a V4 index, set the first byte to an invalid
+			 * character to ensure there is nothing common with the previous
+			 * entry
+			 */
+			if (previous_name)
+				previous_name->buf[0] = 0;
+			nr = 0;
+			offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+		}
 		if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
 			err = -1;
 
 		if (err)
 			break;
+		nr++;
+	}
+	if (ieot && nr) {
+		ieot->entries[ieot->nr].nr = nr;
+		ieot->entries[ieot->nr].offset = offset;
+		ieot->nr++;
 	}
 	strbuf_release(&previous_name_buf);
 
@@ -2593,6 +2670,24 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
 	the_hash_algo->init_fn(&eoie_c);
 
+	/*
+	 * Let's write out CACHE_EXT_INDEXENTRYOFFSETTABLE first so that we
+	 * can minimize the number of extensions we have to scan through to
+	 * find it during load.
+	 */
+#ifndef NO_PTHREADS
+	if (!strip_extensions && ieot) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_ieot_extension(&sb, ieot);
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+#endif
+
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
@@ -3175,3 +3270,74 @@ static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context,
 	the_hash_algo->final_fn(hash, eoie_context);
 	strbuf_add(sb, hash, the_hash_algo->rawsz);
 }
+
+#ifndef NO_PTHREADS
+#define IEOT_VERSION	(1)
+
+static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset)
+{
+       const char *index = NULL;
+       uint32_t extsize, ext_version;
+       struct index_entry_offset_table *ieot;
+       int i, nr;
+
+       /* find the IEOT extension */
+       if (!offset)
+	       return NULL;
+       while (offset <= mmap_size - the_hash_algo->rawsz - 8) {
+	       extsize = get_be32(mmap + offset + 4);
+	       if (CACHE_EXT((mmap + offset)) == CACHE_EXT_INDEXENTRYOFFSETTABLE) {
+		       index = mmap + offset + 4 + 4;
+		       break;
+	       }
+	       offset += 8;
+	       offset += extsize;
+       }
+       if (!index)
+	       return NULL;
+
+       /* validate the version is IEOT_VERSION */
+       ext_version = get_be32(index);
+       if (ext_version != IEOT_VERSION)
+	       return NULL;
+       index += sizeof(uint32_t);
+
+       /* extension size - version bytes / bytes per entry */
+       nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));
+       if (!nr)
+	       return NULL;
+       ieot = xmalloc(sizeof(struct index_entry_offset_table)
+	       + (nr * sizeof(struct index_entry_offset)));
+       ieot->nr = nr;
+       for (i = 0; i < nr; i++) {
+	       ieot->entries[i].offset = get_be32(index);
+	       index += sizeof(uint32_t);
+	       ieot->entries[i].nr = get_be32(index);
+	       index += sizeof(uint32_t);
+       }
+
+       return ieot;
+}
+
+static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot)
+{
+       uint32_t buffer;
+       int i;
+
+       /* version */
+       put_be32(&buffer, IEOT_VERSION);
+       strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+       /* ieot */
+       for (i = 0; i < ieot->nr; i++) {
+
+	       /* offset */
+	       put_be32(&buffer, ieot->entries[i].offset);
+	       strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	       /* count */
+	       put_be32(&buffer, ieot->entries[i].nr);
+	       strbuf_add(sb, &buffer, sizeof(uint32_t));
+       }
+}
+#endif
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v6 7/7] read-cache: load cache entries on worker threads
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
                     ` (5 preceding siblings ...)
  2018-09-26 19:54   ` [PATCH v6 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
@ 2018-09-26 19:54   ` Ben Peart
  2018-09-26 22:06   ` [PATCH v6 0/7] speed up index load through parallelization Junio C Hamano
  2018-09-27 17:13   ` Duy Nguyen
  8 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-26 19:54 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by utilizing
the Index Entry Offset Table (IEOT) to divide loading and conversion of
the cache entries across multiple threads in parallel.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files reduced the time by 32.24%
Test w/1,000,000 files reduced the time by -4.77%

Note that on the 1,000,000 files case, multi-threading the cache entry parsing
does not yield a performance win.  This is because the cost to parse the
index extensions in this repo far outweighs the cost of loading the cache
entries.

The high cost of parsing the index extensions is driven by the cache tree
and the untracked cache extensions. As this is currently the longest pole,
any reduction in this time will reduce the overall index load times so is
worth further investigation in another patch series.

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 224 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 189 insertions(+), 35 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 9b0554d4e6..f5d766088d 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1720,7 +1720,8 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *create_from_disk(struct index_state *istate,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
 					    const struct cache_entry *previous_ce)
@@ -1737,7 +1738,7 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 	 * number of bytes to be stripped from the end of the previous name,
 	 * and the bytes to append to the result, to come up with its name.
 	 */
-	int expand_name_field = istate->version == 4;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1761,16 +1762,17 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 		const unsigned char *cp = (const unsigned char *)name;
 		size_t strip_len, previous_len;
 
-		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		/* If we're at the beginning of a block, ignore the previous name */
 		strip_len = decode_varint(&cp);
-		if (previous_len < strip_len) {
-			if (previous_ce)
+		if (previous_ce) {
+			previous_len = previous_ce->ce_namelen;
+			if (previous_len < strip_len)
 				die(_("malformed name field in the index, near path '%s'"),
-				    previous_ce->name);
-			else
-				die(_("malformed name field in the index in the first path"));
+					previous_ce->name);
+			copy_len = previous_len - strip_len;
+		} else {
+			copy_len = 0;
 		}
-		copy_len = previous_len - strip_len;
 		name = (const char *)cp;
 	}
 
@@ -1780,7 +1782,7 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 			len += copy_len;
 	}
 
-	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
 
 	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
 	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
@@ -1950,6 +1952,52 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
+			unsigned long start_offset, const struct cache_entry *previous_ce)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+		previous_ce = ce;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, NULL);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
 /*
  * Mostly randomly chosen maximum thread counts: we
  * cap the parallelism to online_cpus() threads, and we want
@@ -1959,20 +2007,125 @@ static void *load_index_extensions(void *_data)
 
 #define THREAD_COST		(10000)
 
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset;
+	const char *mmap;
+	struct index_entry_offset_table *ieot;
+	int ieot_offset;        /* starting index into the ieot array */
+	int ieot_work;          /* count of ieot entries to process */
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+	int i;
+
+	/* iterate across all ieot blocks assigned to this thread */
+	for (i = p->ieot_offset; i < p->ieot_offset + p->ieot_work; i++) {
+		p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->ieot->entries[i].nr, p->mmap, p->ieot->entries[i].offset, NULL);
+		p->offset += p->ieot->entries[i].nr;
+	}
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
+			unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)
+{
+	int i, offset, ieot_work, ieot_offset, err;
+	struct load_cache_entries_thread_data *data;
+	unsigned long consumed = 0;
+	int nr;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/* ensure we have no more threads than we have blocks to process */
+	if (nr_threads > ieot->nr)
+		nr_threads = ieot->nr;
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	offset = ieot_offset = 0;
+	ieot_work = DIV_ROUND_UP(ieot->nr, nr_threads);
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = &data[i];
+		int j;
+
+		if (ieot_offset + ieot_work > ieot->nr)
+			ieot_work = ieot->nr - ieot_offset;
+
+		p->istate = istate;
+		p->offset = offset;
+		p->mmap = mmap;
+		p->ieot = ieot;
+		p->ieot_offset = ieot_offset;
+		p->ieot_work = ieot_work;
+
+		/* create a mem_pool for each thread */
+		nr = 0;
+		for (j = p->ieot_offset; j < p->ieot_offset + p->ieot_work; j++)
+			nr += p->ieot->entries[j].nr;
+		if (istate->version == 4) {
+			mem_pool_init(&p->ce_mem_pool,
+				estimate_cache_size_from_compressed(nr));
+		}
+		else {
+			mem_pool_init(&p->ce_mem_pool,
+				estimate_cache_size(mmap_size, nr));
+		}
+
+		err = pthread_create(&p->pthread, NULL, load_cache_entries_thread, p);
+		if (err)
+			die(_("unable to create load_cache_entries thread: %s"), strerror(err));
+
+		/* increment by the number of cache entries in the ieot block being processed */
+		for (j = 0; j < ieot_work; j++)
+			offset += ieot->entries[ieot_offset + j].nr;
+		ieot_offset += ieot_work;
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = &data[i];
+
+		err = pthread_join(p->pthread, NULL);
+		if (err)
+			die(_("unable to join load_cache_entries thread: %s"), strerror(err));
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		consumed += p->consumed;
+	}
+
+	free(data);
+
+	return consumed;
+}
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	const struct cache_header *hdr;
 	const char *mmap;
 	size_t mmap_size;
-	const struct cache_entry *previous_ce = NULL;
 	struct load_index_extensions p;
 	size_t extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int nr_threads, cpus;
+	struct index_entry_offset_table *ieot = 0;
 #endif
 
 	if (istate->initialized)
@@ -2014,10 +2167,18 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+
+	/* TODO: does creating more threads than cores help? */
+	if (!nr_threads) {
+		nr_threads = istate->cache_nr / THREAD_COST;
+		cpus = online_cpus();
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
 
 	if (nr_threads > 1) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2032,29 +2193,22 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			nr_threads--;
 		}
 	}
-#endif
-
-	if (istate->version == 4) {
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
 
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
+	/*
+	 * Locate and read the index entry offset table so that we can use it
+	 * to multi-thread the reading of the cache entries.
+	 */
+	if (extension_offset && nr_threads > 1)
+		ieot = read_ieot_extension(mmap, mmap_size, extension_offset);
 
-		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
-		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
-		set_index_entry(istate, i, ce);
+	if (ieot)
+		src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, src_offset, nr_threads, ieot);
+	else
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#endif
 
-		src_offset += consumed;
-		previous_ce = ce;
-	}
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 0/7] speed up index load through parallelization
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
                     ` (6 preceding siblings ...)
  2018-09-26 19:54   ` [PATCH v6 7/7] read-cache: load cache entries on worker threads Ben Peart
@ 2018-09-26 22:06   ` Junio C Hamano
  2018-09-27 17:13   ` Duy Nguyen
  8 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-26 22:06 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, pclouds, Ben Peart

Ben Peart <peartben@gmail.com> writes:

> Base Ref: master
> Web-Diff: https://github.com/benpeart/git/commit/a0300882d4
> Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v6 && git checkout a0300882d4
>
>
> This iteration brings back the Index Entry Offset Table (IEOT) extension
> which enables us to multi-thread the cache entry parsing without having
> the primary thread have to scan all the entries first.  In cases where the
> cache entry parsing is the most expensive part, this yields some additional
> savings.

Nice.

> Test w/100,000 files    Baseline  Optimize V4    Extensions     Entries
> ----------------------------------------------------------------------------
> 0002.1: read_cache      22.36     18.74 -16.2%   18.64 -16.6%   12.63 -43.5%
>
> Test w/1,000,000 files  Baseline  Optimize V4    Extensions     Entries
> -----------------------------------------------------------------------------
> 0002.1: read_cache      304.40    270.70 -11.1%  195.50 -35.8%  204.82 -32.7%
>
> Note that on the 1,000,000 files case, multi-threading the cache entry parsing
> does not yield a performance win.  This is because the cost to parse the
> index extensions in this repo, far outweigh the cost of loading the cache
> entries.
> ...
> The high cost of parsing the index extensions is driven by the cache tree
> and the untracked cache extensions. As this is currently the longest pole,
> any reduction in this time will reduce the overall index load times so is
> worth further investigation in another patch series.

Interesting.

> One option would be to load each extension on a separate thread but I
> believe that is overkill for the vast majority of repos.  Instead, some
> optimization of the loading code for these two extensions is probably worth
> looking into as a quick examination shows that the bulk of the time for both
> of them is spent in xcalloc().

Thanks.  Looking forward to block some quality time off to read this
through, but from the cursory look (read: diff between the previous
round), this looks quite promising.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 0/7] speed up index load through parallelization
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
                     ` (7 preceding siblings ...)
  2018-09-26 22:06   ` [PATCH v6 0/7] speed up index load through parallelization Junio C Hamano
@ 2018-09-27 17:13   ` Duy Nguyen
  8 siblings, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-09-27 17:13 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 26, 2018 at 9:54 PM Ben Peart <peartben@gmail.com> wrote:
> The high cost of parsing the index extensions is driven by the cache tree
> and the untracked cache extensions. As this is currently the longest pole,
> any reduction in this time will reduce the overall index load times so is
> worth further investigation in another patch series.
>
> Name                                    First    Last     Elapsed
> |   + git!read_index_extension          684.052  870.244  186.192
> |    + git!cache_tree_read              684.052  797.801  113.749
> |    + git!read_untracked_extension     797.801  870.244  72.443
>
> One option would be to load each extension on a separate thread but I
> believe that is overkill for the vast majority of repos.

They both grow proportional to the number of trees in worktree, which
probably also scales to the worktree size. Frankly I think the
parallel index loading is already overkill for the majority of repos,
so speeding up the 1% giant repos even further does not sound that bad.
And I think you already lay the foundation for loading index stuff in
parallel with this series.

> Instead, some
> optimization of the loading code for these two extensions is probably worth
> looking into as a quick examination shows that the bulk of the time for both
> of them is spent in xcalloc().

Another easy "optimization" is delaying loading these until we need
them (or load them in background, read_index() returns even before
these extensions are finished, but this is of course trickier).

UNTR extension for example is only useful for "git status" (and maybe
one or two other use cases). Not having to load them all the time is
likely a win. The role of TREE extension has grown bigger these days
so it's still maybe worth putting more effort into making it load
faster rather than just hiding the cost.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-09-26 19:54   ` [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-28  0:19     ` SZEDER Gábor
  2018-09-28 18:38       ` Ben Peart
  2018-09-29  0:51     ` SZEDER Gábor
  2018-09-29  5:45     ` Duy Nguyen
  2 siblings, 1 reply; 153+ messages in thread
From: SZEDER Gábor @ 2018-09-28  0:19 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart, Ben Peart

On Wed, Sep 26, 2018 at 03:54:38PM -0400, Ben Peart wrote:
> The End of Index Entry (EOIE) is used to locate the end of the variable

Nit: perhaps start with: 

  The End of Index Entry (EOIE) optional extension can be used to ...

to make it clearer for those who don't immediately realize the
significance of the upper case 'E' in the extension's signature.

> length index entries and the beginning of the extensions. Code can take
> advantage of this to quickly locate the index extensions without having
> to parse through all of the index entries.
> 
> Because it must be able to be loaded before the variable length cache
> entries and other index extensions, this extension must be written last.
> The signature for this extension is { 'E', 'O', 'I', 'E' }.
> 
> The extension consists of:
> 
> - 32-bit offset to the end of the index entries
> 
> - 160-bit SHA-1 over the extension types and their sizes (but not
> their contents).  E.g. if we have "TREE" extension that is N-bytes
> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> then the hash would be:
> 
> SHA-1("TREE" + <binary representation of N> +
> 	"REUC" + <binary representation of M>)
> 
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---
>  Documentation/technical/index-format.txt |  23 ++++
>  read-cache.c                             | 151 +++++++++++++++++++++--
>  t/README                                 |   5 +
>  t/t1700-split-index.sh                   |   1 +
>  4 files changed, 172 insertions(+), 8 deletions(-)
> 

> diff --git a/t/README b/t/README
> index 3ea6c85460..aa33ac4f26 100644
> --- a/t/README
> +++ b/t/README
> @@ -327,6 +327,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to
>  be written after every 'git commit' command, and overrides the
>  'core.commitGraph' setting to true.
>  
> +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
> +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
> +as they currently hard code SHA values for the index which are no longer
> +valid due to the addition of the EOIE extension.

Is this extension enabled by default?  The commit message doesn't
explicitly say so, but I don't see any way to turn it on or off, while
there is this new GIT_TEST environment variable to disable it for one
particular test, so it seems so.  If that's indeed the case, then
wouldn't it be better to update those hard-coded SHA1 values in t1700
instead?

>  Naming Tests
>  ------------
>  
> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index be22398a85..1f168378c8 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>  # We need total control of index splitting here
>  sane_unset GIT_TEST_SPLIT_INDEX
>  sane_unset GIT_FSMONITOR_TEST
> +GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
>  
>  test_expect_success 'enable split index' '
>  	git config splitIndex.maxPercentChange 100 &&
> -- 
> 2.18.0.windows.1
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 4/7] config: add new index.threads config setting
  2018-09-26 19:54   ` [PATCH v6 4/7] config: add new index.threads config setting Ben Peart
@ 2018-09-28  0:26     ` SZEDER Gábor
  2018-09-28 13:39       ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: SZEDER Gábor @ 2018-09-28  0:26 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart, Ben Peart

On Wed, Sep 26, 2018 at 03:54:39PM -0400, Ben Peart wrote:
> Add support for a new index.threads config setting which will be used to
> control the threading code in do_read_index().  A value of 0 will tell the
> index code to automatically determine the correct number of threads to use.
> A value of 1 will make the code single threaded.  A value greater than 1
> will set the maximum number of threads to use.
> 
> For testing purposes, this setting can be overwritten by setting the
> GIT_TEST_INDEX_THREADS=<n> environment variable to a value greater than 0.
> 
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---

> diff --git a/t/README b/t/README
> index aa33ac4f26..0fcecf4500 100644
> --- a/t/README
> +++ b/t/README
> @@ -332,6 +332,11 @@ This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
>  as they currently hard code SHA values for the index which are no longer
>  valid due to the addition of the EOIE extension.
>  
> +GIT_TEST_INDEX_THREADS=<n> enables exercising the multi-threaded loading
> +of the index for the whole test suite by bypassing the default number of
> +cache entries and thread minimums. Settting this to 1 will make the

s/ttt/tt/

> +index loading single threaded.
> +
>  Naming Tests
>  ------------
>  
> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index 1f168378c8..ab205954cf 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -8,6 +8,7 @@ test_description='split index mode tests'
>  sane_unset GIT_TEST_SPLIT_INDEX
>  sane_unset GIT_FSMONITOR_TEST
>  GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
> +GIT_TEST_INDEX_THREADS=1; export GIT_TEST_INDEX_THREADS

Why does multithreading have to be disabled in this test?

>  test_expect_success 'enable split index' '
>  	git config splitIndex.maxPercentChange 100 &&
> -- 
> 2.18.0.windows.1
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 4/7] config: add new index.threads config setting
  2018-09-28  0:26     ` SZEDER Gábor
@ 2018-09-28 13:39       ` Ben Peart
  2018-09-28 17:07         ` Junio C Hamano
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-28 13:39 UTC (permalink / raw)
  To: SZEDER Gábor; +Cc: git, gitster, pclouds, Ben Peart, Ben Peart



On 9/27/2018 8:26 PM, SZEDER Gábor wrote:
> On Wed, Sep 26, 2018 at 03:54:39PM -0400, Ben Peart wrote:
>> Add support for a new index.threads config setting which will be used to
>> control the threading code in do_read_index().  A value of 0 will tell the
>> index code to automatically determine the correct number of threads to use.
>> A value of 1 will make the code single threaded.  A value greater than 1
>> will set the maximum number of threads to use.
>>
>> For testing purposes, this setting can be overwritten by setting the
>> GIT_TEST_INDEX_THREADS=<n> environment variable to a value greater than 0.
>>
>> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
>> ---
> 
>> diff --git a/t/README b/t/README
>> index aa33ac4f26..0fcecf4500 100644
>> --- a/t/README
>> +++ b/t/README
>> @@ -332,6 +332,11 @@ This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
>>   as they currently hard code SHA values for the index which are no longer
>>   valid due to the addition of the EOIE extension.
>>   
>> +GIT_TEST_INDEX_THREADS=<n> enables exercising the multi-threaded loading
>> +of the index for the whole test suite by bypassing the default number of
>> +cache entries and thread minimums. Settting this to 1 will make the
> 
> s/ttt/tt/
> 
>> +index loading single threaded.
>> +
>>   Naming Tests
>>   ------------
>>   
>> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
>> index 1f168378c8..ab205954cf 100755
>> --- a/t/t1700-split-index.sh
>> +++ b/t/t1700-split-index.sh
>> @@ -8,6 +8,7 @@ test_description='split index mode tests'
>>   sane_unset GIT_TEST_SPLIT_INDEX
>>   sane_unset GIT_FSMONITOR_TEST
>>   GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
>> +GIT_TEST_INDEX_THREADS=1; export GIT_TEST_INDEX_THREADS
> 
> Why does multithreading have to be disabled in this test?
> 

If multi-threading is enabled, it will write out the IEOT extension 
which changes the SHA and causes the test to fail.  I will update the 
logic in this case to not write out the IEOT extension as it isn't needed.

>>   test_expect_success 'enable split index' '
>>   	git config splitIndex.maxPercentChange 100 &&
>> -- 
>> 2.18.0.windows.1
>>

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 4/7] config: add new index.threads config setting
  2018-09-28 13:39       ` Ben Peart
@ 2018-09-28 17:07         ` Junio C Hamano
  2018-09-28 19:41           ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-09-28 17:07 UTC (permalink / raw)
  To: Ben Peart; +Cc: SZEDER Gábor, git, pclouds, Ben Peart, Ben Peart

Ben Peart <peartben@gmail.com> writes:

>> Why does multithreading have to be disabled in this test?
>
> If multi-threading is enabled, it will write out the IEOT extension
> which changes the SHA and causes the test to fail.

I think it is a design mistake to let the writing processes's
capability decide what is written in the file to be read later by a
different process, which possibly may have different capability.  If
you are not writing with multiple threads, it should not matter if
that writer process is capable of and configured to spawn 8 threads
if the process were reading the file---as it is not reading the file
it is writing right now.

I can understand if the design is to write IEOT only if the
resulting index is expected to become large enough (above an
arbitrary threshold like 100k entries) to matter.  I also can
understand if IEOT is omitted when the repository configuration says
that no process is allowed to read the index with multi-threaded
codepath in that repository.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-09-28  0:19     ` SZEDER Gábor
@ 2018-09-28 18:38       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-09-28 18:38 UTC (permalink / raw)
  To: SZEDER Gábor; +Cc: git, gitster, pclouds, Ben Peart, Ben Peart



On 9/27/2018 8:19 PM, SZEDER Gábor wrote:
> On Wed, Sep 26, 2018 at 03:54:38PM -0400, Ben Peart wrote:
>> The End of Index Entry (EOIE) is used to locate the end of the variable
> 
> Nit: perhaps start with:
> 
>    The End of Index Entry (EOIE) optional extension can be used to ...
> 
> to make it clearer for those who don't immediately realize the
> significance of the upper case 'E' in the extension's signature.
> 
>> length index entries and the beginning of the extensions. Code can take
>> advantage of this to quickly locate the index extensions without having
>> to parse through all of the index entries.
>>
>> Because it must be able to be loaded before the variable length cache
>> entries and other index extensions, this extension must be written last.
>> The signature for this extension is { 'E', 'O', 'I', 'E' }.
>>
>> The extension consists of:
>>
>> - 32-bit offset to the end of the index entries
>>
>> - 160-bit SHA-1 over the extension types and their sizes (but not
>> their contents).  E.g. if we have "TREE" extension that is N-bytes
>> long, "REUC" extension that is M-bytes long, followed by "EOIE",
>> then the hash would be:
>>
>> SHA-1("TREE" + <binary representation of N> +
>> 	"REUC" + <binary representation of M>)
>>
>> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
>> ---
>>   Documentation/technical/index-format.txt |  23 ++++
>>   read-cache.c                             | 151 +++++++++++++++++++++--
>>   t/README                                 |   5 +
>>   t/t1700-split-index.sh                   |   1 +
>>   4 files changed, 172 insertions(+), 8 deletions(-)
>>
> 
>> diff --git a/t/README b/t/README
>> index 3ea6c85460..aa33ac4f26 100644
>> --- a/t/README
>> +++ b/t/README
>> @@ -327,6 +327,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to
>>   be written after every 'git commit' command, and overrides the
>>   'core.commitGraph' setting to true.
>>   
>> +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
>> +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
>> +as they currently hard code SHA values for the index which are no longer
>> +valid due to the addition of the EOIE extension.
> 
> Is this extension enabled by default?  The commit message doesn't
> explicitly say so, but I don't see any way to turn it on or off, while
> there is this new GIT_TEST environment variable to disable it for one
> particular test, so it seems so.  If that's indeed the case, then
> wouldn't it be better to update those hard-coded SHA1 values in t1700
> instead?
> 

Yes, it is enabled by default and the only way to disable it is the 
GIT_TEST_DISABLE_EOIE environment variable.

The tests in t1700-split-index.sh assume that there are no extensions in 
the index file, so anything that adds an extension will break one or 
more of the tests.

First in 'enable split index', they hard code SHA values assuming there 
are no extensions. If some option adds an extension, these hard coded 
values no longer match and the test fails.

Later in 'disable split index' they save off the SHA of the index with 
split-index turned off and then in later tests, compare it to the SHA of 
the shared index.  Because extensions are stripped when the shared index 
is written out this only works if there were not extensions in the 
original index.

I'll document this behavior and reasoning in the test directly.

This did cause me to reexamine how EOIE and IEOT behave when split index 
is turned on.  These two extensions help most with a large index.  When 
split index is turned on, the large index is actually the shared index 
as the index is now the smaller set of deltas.

Currently, the extensions are stripped out of the shared index which 
means they are not available when they are needed to quickly load the 
shared index.  I'll see if I can update the patch so that these 
extensions are still written out and available in the shared index to 
speed up when it is loaded.

Thanks!

>>   Naming Tests
>>   ------------
>>   
>> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
>> index be22398a85..1f168378c8 100755
>> --- a/t/t1700-split-index.sh
>> +++ b/t/t1700-split-index.sh
>> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>>   # We need total control of index splitting here
>>   sane_unset GIT_TEST_SPLIT_INDEX
>>   sane_unset GIT_FSMONITOR_TEST
>> +GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
>>   
>>   test_expect_success 'enable split index' '
>>   	git config splitIndex.maxPercentChange 100 &&
>> -- 
>> 2.18.0.windows.1
>>

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 4/7] config: add new index.threads config setting
  2018-09-28 17:07         ` Junio C Hamano
@ 2018-09-28 19:41           ` Ben Peart
  2018-09-28 20:30             ` Ramsay Jones
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-09-28 19:41 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: SZEDER Gábor, git, pclouds, Ben Peart, Ben Peart



On 9/28/2018 1:07 PM, Junio C Hamano wrote:
> Ben Peart <peartben@gmail.com> writes:
> 
>>> Why does multithreading have to be disabled in this test?
>>
>> If multi-threading is enabled, it will write out the IEOT extension
>> which changes the SHA and causes the test to fail.
> 
> I think it is a design mistake to let the writing processes's
> capability decide what is written in the file to be read later by a
> different process, which possibly may have different capability.  If
> you are not writing with multiple threads, it should not matter if
> that writer process is capable of and configured to spawn 8 threads
> if the process were reading the file---as it is not reading the file
> it is writing right now.
> 
> I can understand if the design is to write IEOT only if the
> resulting index is expected to become large enough (above an
> arbitrary threshold like 100k entries) to matter.  I also can
> understand if IEOT is omitted when the repository configuration says
> that no process is allowed to read the index with multi-threaded
> codepath in that repository.
> 

There are two different paths which determine how many blocks are 
written to the IEOT.  The first is the default path.  On this path, the 
number of blocks is determined by the number of cache entries divided by 
the THREAD_COST.  If there are sufficient entries to make it faster to 
use threading, then it will automatically use enough blocks to optimize 
the performance of reading the entries across multiple threads.

I currently cap the maximum number of blocks to be the number of cores 
that would be available to process them on that same machine purely as 
an optimization.  The majority of the time, the index will be read from 
the same machine that it was written on so this works well.  Before I 
added that logic, you would usually end up with more blocks than 
available threads which meant some threads had more to do than the other 
threads and resulted in worse performance.  For example, 4 blocks across 
3 threads results in the 1st thread having twice as much work to do as 
the other threads.

If the index is copied to a machine with a different number of cores, it 
will still all work - it just may not be optimal for that machine.  This 
is self correcting because as soon as the index is written out, it will 
be optimized for that machine.

If the "automatically try to make it perform optimally" logic doesn't 
work for some reason, we have path #2.

The second path is when the user specifies a specific number of blocks 
via the GIT_TEST_INDEX_THREADS=<n> environment variable or the 
index.threads=<n> config setting.  If they ask for n blocks, they will 
get n blocks.  This is the "I know what I'm doing and want to control 
the behavior" path.

I just added one additional test (see patch below) to avoid a divide by 
zero bug and simplify things a bit.  With this change, if there are 
fewer than two blocks, the IEOT extension is not written out as it isn't 
needed.  The load would be single threaded anyway so there is no reason 
to write out an IEOT extension that won't be used.



diff --git a/read-cache.c b/read-cache.c
index f5d766088d..a1006fa824 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -2751,18 +2751,23 @@ static int do_write_index(struct index_state 
*istate, struct tempfile *tempfil
e,
                  */
                 if (!nr) {
                         ieot_blocks = istate->cache_nr / THREAD_COST;
-                       if (ieot_blocks < 1)
-                               ieot_blocks = 1;
                         cpus = online_cpus();
                         if (ieot_blocks > cpus - 1)
                                 ieot_blocks = cpus - 1;
                 } else {
                         ieot_blocks = nr;
                 }
-               ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
-                       + (ieot_blocks * sizeof(struct 
index_entry_offset)));
-               ieot->nr = 0;
-               ieot_work = DIV_ROUND_UP(entries, ieot_blocks);
+
+               /*
+                * no reason to write out the IEOT extension if we don't
+                * have enough blocks to utilize multi-threading
+                */
+               if (ieot_blocks > 1) {
+                       ieot = xcalloc(1, sizeof(struct 
index_entry_offset_table)
+                               + (ieot_blocks * sizeof(struct 
index_entry_offset)));
+                       ieot->nr = 0;
+                       ieot_work = DIV_ROUND_UP(entries, ieot_blocks);
+               }
         }
  #endif


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 4/7] config: add new index.threads config setting
  2018-09-28 19:41           ` Ben Peart
@ 2018-09-28 20:30             ` Ramsay Jones
  2018-09-28 22:15               ` Junio C Hamano
  0 siblings, 1 reply; 153+ messages in thread
From: Ramsay Jones @ 2018-09-28 20:30 UTC (permalink / raw)
  To: Ben Peart, Junio C Hamano
  Cc: SZEDER Gábor, git, pclouds, Ben Peart, Ben Peart



On 28/09/18 20:41, Ben Peart wrote:
> 
> 
> On 9/28/2018 1:07 PM, Junio C Hamano wrote:
>> Ben Peart <peartben@gmail.com> writes:
>>
>>>> Why does multithreading have to be disabled in this test?
>>>
>>> If multi-threading is enabled, it will write out the IEOT extension
>>> which changes the SHA and causes the test to fail.
>>
>> I think it is a design mistake to let the writing processes's
>> capability decide what is written in the file to be read later by a
>> different process, which possibly may have different capability.  If
>> you are not writing with multiple threads, it should not matter if
>> that writer process is capable of and configured to spawn 8 threads
>> if the process were reading the file---as it is not reading the file
>> it is writing right now.
>>
>> I can understand if the design is to write IEOT only if the
>> resulting index is expected to become large enough (above an
>> arbitrary threshold like 100k entries) to matter.  I also can
>> understand if IEOT is omitted when the repository configuration says
>> that no process is allowed to read the index with multi-threaded
>> codepath in that repository.
>>
> 
> There are two different paths which determine how many blocks are written to the IEOT.  The first is the default path.  On this path, the number of blocks is determined by the number of cache entries divided by the THREAD_COST.  If there are sufficient entries to make it faster to use threading, then it will automatically use enough blocks to optimize the performance of reading the entries across multiple threads.
> 
> I currently cap the maximum number of blocks to be the number of cores that would be available to process them on that same machine purely as an optimization.  The majority of the time, the index will be read from the same machine that it was written on so this works well.  Before I added that logic, you would usually end up with more blocks than available threads which meant some threads had more to do than the other threads and resulted in worse performance.  For example, 4 blocks across 3 threads results in the 1st thread having twice as much work to do as the other threads.
> 
> If the index is copied to a machine with a different number of cores, it will still all work - it just may not be optimal for that machine.  This is self correcting because as soon as the index is written out, it will be optimized for that machine.
> 
> If the "automatically try to make it perform optimally" logic doesn't work for some reason, we have path #2.
> 
> The second path is when the user specifies a specific number of blocks via the GIT_TEST_INDEX_THREADS=<n> environment variable or the index.threads=<n> config setting.  If they ask for n blocks, they will get n blocks.  This is the "I know what I'm doing and want to control the behavior" path.
> 
> I just added one additional test (see patch below) to avoid a divide by zero bug and simplify things a bit.  With this change, if there are fewer than two blocks, the IEOT extension is not written out as it isn't needed.  The load would be single threaded anyway so there is no reason to write out a IEOT extensions that won't be used.
> 
> 
> 
> diff --git a/read-cache.c b/read-cache.c
> index f5d766088d..a1006fa824 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -2751,18 +2751,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfil
> e,
>                  */
>                 if (!nr) {
>                         ieot_blocks = istate->cache_nr / THREAD_COST;
> -                       if (ieot_blocks < 1)
> -                               ieot_blocks = 1;
>                         cpus = online_cpus();
>                         if (ieot_blocks > cpus - 1)
>                                 ieot_blocks = cpus - 1;

So, am I reading this correctly - you need cpus > 2 before an
IEOT extension block is written out?

OK.

ATB,
Ramsay Jones

>                 } else {
>                         ieot_blocks = nr;
>                 }
> -               ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
> -                       + (ieot_blocks * sizeof(struct index_entry_offset)));
> -               ieot->nr = 0;
> -               ieot_work = DIV_ROUND_UP(entries, ieot_blocks);
> +
> +               /*
> +                * no reason to write out the IEOT extension if we don't
> +                * have enough blocks to utilize multi-threading
> +                */
> +               if (ieot_blocks > 1) {
> +                       ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
> +                               + (ieot_blocks * sizeof(struct index_entry_offset)));
> +                       ieot->nr = 0;
> +                       ieot_work = DIV_ROUND_UP(entries, ieot_blocks);
> +               }
>         }
>  #endif
> 
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 4/7] config: add new index.threads config setting
  2018-09-28 20:30             ` Ramsay Jones
@ 2018-09-28 22:15               ` Junio C Hamano
  2018-10-01 13:17                 ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-09-28 22:15 UTC (permalink / raw)
  To: Ramsay Jones
  Cc: Ben Peart, SZEDER Gábor, git, pclouds, Ben Peart, Ben Peart

Ramsay Jones <ramsay@ramsayjones.plus.com> writes:

>>                 if (!nr) {
>>                         ieot_blocks = istate->cache_nr / THREAD_COST;
>> -                       if (ieot_blocks < 1)
>> -                               ieot_blocks = 1;
>>                         cpus = online_cpus();
>>                         if (ieot_blocks > cpus - 1)
>>                                 ieot_blocks = cpus - 1;
>
> So, am I reading this correctly - you need cpus > 2 before an
> IEOT extension block is written out?
>
> OK.

Why should we be even calling online_cpus() in this codepath to
write the index in a single thread to begin with?

The number of cpus that readers would use to read this index file
has nothing to do with the number of cpus available to this
particular writer process.  


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-09-26 19:54   ` [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-09-28  0:19     ` SZEDER Gábor
@ 2018-09-29  0:51     ` SZEDER Gábor
  2018-09-29  5:45     ` Duy Nguyen
  2 siblings, 0 replies; 153+ messages in thread
From: SZEDER Gábor @ 2018-09-29  0:51 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart, Ben Peart

On Wed, Sep 26, 2018 at 03:54:38PM -0400, Ben Peart wrote:
> diff --git a/read-cache.c b/read-cache.c
> index 6ba99e2c96..80255d3088 100644
> --- a/read-cache.c
> +++ b/read-cache.c

> +static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
> +{

<....>

> +	the_hash_algo->final_fn(hash, &c);
> +	if (hashcmp(hash, (const unsigned char *)index))
> +		return 0;

Please use !hasheq() instead of hashcmp().


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-09-26 19:54   ` [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-09-28  0:19     ` SZEDER Gábor
  2018-09-29  0:51     ` SZEDER Gábor
@ 2018-09-29  5:45     ` Duy Nguyen
  2018-09-29 18:24       ` Junio C Hamano
  2 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-09-29  5:45 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, Ben Peart, Ben Peart

On Wed, Sep 26, 2018 at 03:54:38PM -0400, Ben Peart wrote:
> +
> +#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */

If you make these variables instead of macros, you can use
the_hash_algo, which makes this code sha256-friendlier and probably
can explain less, e.g. ...

> +
> +static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
> +{
> +	/*
> +	 * The end of index entries (EOIE) extension is guaranteed to be last
> +	 * so that it can be found by scanning backwards from the EOF.
> +	 *
> +	 * "EOIE"
> +	 * <4-byte length>
> +	 * <4-byte offset>
> +	 * <20-byte hash>
> +	 */

	uint32_t EOIE_SIZE = 4 + the_hash_algo->rawsz;
	uint32_t EOIE_SIZE_WITH_HEADER = 4 + 4 + EOIE_SIZE;

> +	const char *index, *eoie;
> +	uint32_t extsize;
> +	size_t offset, src_offset;
> +	unsigned char hash[GIT_MAX_RAWSZ];
> +	git_hash_ctx c;
--
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-09-29  5:45     ` Duy Nguyen
@ 2018-09-29 18:24       ` Junio C Hamano
  0 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-09-29 18:24 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, git, Ben Peart, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> On Wed, Sep 26, 2018 at 03:54:38PM -0400, Ben Peart wrote:
>> +
>> +#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
>> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
>
> If you make these variables instead of macros, you can use
> the_hash_algo, which makes this code sha256-friendlier and probably
> can explain less, e.g. ...
>
>> +
>> +static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
>> +{
>> +	/*
>> +	 * The end of index entries (EOIE) extension is guaranteed to be last
>> +	 * so that it can be found by scanning backwards from the EOF.
>> +	 *
>> +	 * "EOIE"
>> +	 * <4-byte length>
>> +	 * <4-byte offset>
>> +	 * <20-byte hash>

20? ;-)

>> +	 */
>
> 	uint32_t EOIE_SIZE = 4 + the_hash_algo->rawsz;
> 	uint32_t EOIE_SIZE_WITH_HEADER = 4 + 4 + EOIE_SIZE;
>
>> +	const char *index, *eoie;
>> +	uint32_t extsize;
>> +	size_t offset, src_offset;
>> +	unsigned char hash[GIT_MAX_RAWSZ];
>> +	git_hash_ctx c;
> --
> Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 4/7] config: add new index.threads config setting
  2018-09-28 22:15               ` Junio C Hamano
@ 2018-10-01 13:17                 ` Ben Peart
  2018-10-01 15:06                   ` SZEDER Gábor
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:17 UTC (permalink / raw)
  To: Junio C Hamano, Ramsay Jones
  Cc: SZEDER Gábor, git, pclouds, Ben Peart, Ben Peart



On 9/28/2018 6:15 PM, Junio C Hamano wrote:
> Ramsay Jones <ramsay@ramsayjones.plus.com> writes:
> 
>>>                  if (!nr) {
>>>                          ieot_blocks = istate->cache_nr / THREAD_COST;
>>> -                       if (ieot_blocks < 1)
>>> -                               ieot_blocks = 1;
>>>                          cpus = online_cpus();
>>>                          if (ieot_blocks > cpus - 1)
>>>                                  ieot_blocks = cpus - 1;
>>
>> So, am I reading this correctly - you need cpus > 2 before an
>> IEOT extension block is written out?
>>
>> OK.
> 
> Why should we be even calling online_cpus() in this codepath to
> write the index in a single thread to begin with?
> 
> The number of cpus that readers would use to read this index file
> has nothing to do with the number of cpus available to this
> particular writer process.
> 

As I mentioned in my other reply, this is optimizing for the most common 
case where the index is read from the same machine that wrote it and the 
user is taking the default settings (ie index.threads=true).

Aligning the number of blocks to the number of threads that will be 
processing them avoids situations where one thread may have up to double 
the work to do as the other threads (for example, if there were 3 blocks 
to be processed by 2 threads).

^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v7 0/7] speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (6 preceding siblings ...)
  2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
@ 2018-10-01 13:45 ` " Ben Peart
  2018-10-01 13:45   ` [PATCH v7 1/7] read-cache.c: optimize reading index format v4 Ben Peart
                     ` (6 more replies)
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
  8 siblings, 7 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:45 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

Thanks for all the feedback.

The biggest change since the last version is how this patch series interacts
with the split-index feature.  With a split index, most of the cache entries
are stored in the shared index so would benefit from multi-threaded parsing.
To enable that, the EOIE and IEOT extensions are now written into the shared
index (rather than being stripped out like the other extensions).

Because of this, I can now update the tests in t1700-split-index.sh to have
updated SHA values that include the EOIE extension instead of disabling the
extension.

Using p0002-read-cache.sh to generate some performance numbers shows how
each of the various patches contribute to the overall performance win on a
particularly large repo.

Repo w/3M files      Baseline  Optimize V4   Extensions      Entries
--------------------------------------------------------------------------
0002.1: read_cache   693.29    655.65 -5.4%  470.71 -32.1%   399.62 -42.4%

Note how this cuts nearly 300ms off the index load time!

Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/c1125a5d9a
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v7 && git checkout c1125a5d9a


### Patches

Ben Peart (6):
  read-cache: clean up casting and byte decoding
  eoie: add End of Index Entry (EOIE) extension
  config: add new index.threads config setting
  read-cache: load cache extensions on a worker thread
  ieot: add Index Entry Offset Table (IEOT) extension
  read-cache: load cache entries on worker threads

Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 Documentation/config.txt                 |   7 +
 Documentation/technical/index-format.txt |  41 ++
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 749 +++++++++++++++++++----
 t/README                                 |   5 +
 t/t1700-split-index.sh                   |  13 +-
 7 files changed, 715 insertions(+), 119 deletions(-)


base-commit: fe8321ec057f9231c26c29b364721568e58040f7
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v7 1/7] read-cache.c: optimize reading index format v4
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
@ 2018-10-01 13:45   ` Ben Peart
  2018-10-01 13:45   ` [PATCH v7 2/7] read-cache: clean up casting and byte decoding Ben Peart
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:45 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds

From: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 128 ++++++++++++++++++++++++---------------------------
 1 file changed, 60 insertions(+), 68 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 8d04d78a58..583a4fb1f8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,63 +1713,24 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
-/*
- * Adjacent cache entries tend to share the leading paths, so it makes
- * sense to only store the differences in later entries.  In the v4
- * on-disk format of the index, each on-disk cache entry stores the
- * number of bytes to be stripped from the end of the previous name,
- * and the bytes to append to the result, to come up with its name.
- */
-static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
-{
-	const unsigned char *ep, *cp = (const unsigned char *)cp_;
-	size_t len = decode_varint(&cp);
-
-	if (name->len < len)
-		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
-	strbuf_add(name, cp, ep - cp);
-	return (const char *)ep + 1 - cp_;
-}
-
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct index_state *istate,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len;
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	int expand_name_field = istate->version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1789,21 +1750,54 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (expand_name_field)
+			len += copy_len;
+	}
+
+	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1898,7 +1892,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	const struct cache_entry *previous_ce = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,11 +1930,9 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
@@ -1952,12 +1944,12 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v7 2/7] read-cache: clean up casting and byte decoding
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
  2018-10-01 13:45   ` [PATCH v7 1/7] read-cache.c: optimize reading index format v4 Ben Peart
@ 2018-10-01 13:45   ` Ben Peart
  2018-10-01 15:10     ` Duy Nguyen
  2018-10-01 13:45   ` [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (4 subsequent siblings)
  6 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:45 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

This patch does a clean up pass to minimize the casting required to work
with the memory mapped index (mmap).

It also makes the decoding of network byte order more consistent by using
get_be32() where possible.

Signed-off-by: Ben Peart <peartben@gmail.com>
---
 read-cache.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 583a4fb1f8..6ba99e2c96 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1650,7 +1650,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1674,7 +1674,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1889,8 +1889,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd, i;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	const struct cache_entry *previous_ce = NULL;
 
@@ -1918,7 +1918,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -1943,7 +1943,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
 		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
@@ -1961,21 +1961,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
+					 mmap + src_offset,
+					 mmap + src_offset + 8,
 					 extsize) < 0)
 			goto unmap;
 		src_offset += 8;
 		src_offset += extsize;
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
  2018-10-01 13:45   ` [PATCH v7 1/7] read-cache.c: optimize reading index format v4 Ben Peart
  2018-10-01 13:45   ` [PATCH v7 2/7] read-cache: clean up casting and byte decoding Ben Peart
@ 2018-10-01 13:45   ` Ben Peart
  2018-10-01 15:17     ` SZEDER Gábor
  2018-10-01 15:30     ` Duy Nguyen
  2018-10-01 13:45   ` [PATCH v7 4/7] config: add new index.threads config setting Ben Peart
                     ` (3 subsequent siblings)
  6 siblings, 2 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:45 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <peartben@gmail.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 152 +++++++++++++++++++++--
 t/t1700-split-index.sh                   |   8 +-
 3 files changed, 171 insertions(+), 12 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the beginning of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 6ba99e2c96..af2605a168 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1883,6 +1887,9 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2190,11 +2197,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2437,7 +2448,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2446,6 +2457,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	off_t offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2479,6 +2491,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
 		return -1;
 
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
 	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
 
 	for (i = 0; i < entries; i++) {
@@ -2512,11 +2525,14 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
+
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2527,7 +2543,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2537,7 +2553,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2548,7 +2564,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2559,7 +2575,24 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.  Write it out regardless of the strip_extensions parameter as we need it
+	 * when loading the shared index.
+	 */
+	if (offset) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2975,3 +3008,106 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *index, *eoie;
+	uint32_t extsize;
+	size_t offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
+	/* validate the extension signature */
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if (mmap + offset < mmap + sizeof(struct cache_header))
+		return 0;
+	if (mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *	 "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (!hasheq(hash, (const unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index be22398a85..8e17f8e7a0 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -15,11 +15,11 @@ test_expect_success 'enable split index' '
 	indexversion=$(test-tool index-version <.git/index) &&
 	if test "$indexversion" = "4"
 	then
-		own=432ef4b63f32193984f339431fd50ca796493569
-		base=508851a7f0dfa8691e9f69c7f055865389012491
+		own=3527df833c6c100d3d1d921a9a782d62a8be4b58
+		base=746f7ab2ed44fb839efdfbffcf399d0b113fb4cb
 	else
-		own=8299b0bcd1ac364e5f1d7768efb62fa2da79a339
-		base=39d890139ee5356c7ef572216cebcd27aa41f9df
+		own=5e9b60117ece18da410ddecc8b8d43766a0e4204
+		base=4370042739b31cd17a5c5cd6043a77c9a00df113
 	fi &&
 	cat >expect <<-EOF &&
 	own $own
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v7 4/7] config: add new index.threads config setting
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
                     ` (2 preceding siblings ...)
  2018-10-01 13:45   ` [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-10-01 13:45   ` Ben Peart
  2018-10-01 13:45   ` [PATCH v7 5/7] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:45 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

Add support for a new index.threads config setting which will be used to
control the threading code in do_read_index().  A value of 0 will tell the
index code to automatically determine the correct number of threads to use.
A value of 1 will make the code single threaded.  A value greater than 1
will set the maximum number of threads to use.

For testing purposes, this setting can be overwritten by setting the
GIT_TEST_INDEX_THREADS=<n> environment variable to a value greater than 0.

Signed-off-by: Ben Peart <peartben@gmail.com>
---
 Documentation/config.txt |  7 +++++++
 config.c                 | 18 ++++++++++++++++++
 config.h                 |  1 +
 t/README                 |  5 +++++
 t/t1700-split-index.sh   |  5 +++++
 5 files changed, 36 insertions(+)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index ad0f4510c3..8fd973b76b 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2413,6 +2413,13 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPUs and set the number of threads accordingly. Specifying 1 or
+	'false' will disable multithreading. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 3461993f0a..2ee29f6f86 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+int git_config_get_index_threads(void)
+{
+	int is_bool, val = 0;
+
+	val = git_env_ulong("GIT_TEST_INDEX_THREADS", 0);
+	if (val)
+		return val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/t/README b/t/README
index 3ea6c85460..8f5c0620ea 100644
--- a/t/README
+++ b/t/README
@@ -327,6 +327,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to
 be written after every 'git commit' command, and overrides the
 'core.commitGraph' setting to true.
 
+GIT_TEST_INDEX_THREADS=<n> enables exercising the multi-threaded loading
+of the index for the whole test suite by bypassing the default number of
+cache entries and thread minimums. Setting this to 1 will make the
+index loading single threaded.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 8e17f8e7a0..ef9349bd70 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -6,7 +6,12 @@ test_description='split index mode tests'
 
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
+
+# Testing a hard coded SHA against an index with an extension
+# that can vary from run to run is problematic so we disable
+# those extensions.
 sane_unset GIT_FSMONITOR_TEST
+sane_unset GIT_TEST_INDEX_THREADS
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v7 5/7] read-cache: load cache extensions on a worker thread
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
                     ` (3 preceding siblings ...)
  2018-10-01 13:45   ` [PATCH v7 4/7] config: add new index.threads config setting Ben Peart
@ 2018-10-01 13:45   ` Ben Peart
  2018-10-01 15:50     ` Duy Nguyen
  2018-10-01 13:45   ` [PATCH v7 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
  2018-10-01 13:45   ` [PATCH v7 7/7] read-cache: load cache entries on worker threads Ben Peart
  6 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:45 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

	Test w/100,000 files reduced the time by 0.53%
	Test w/1,000,000 files reduced the time by 27.78%

Signed-off-by: Ben Peart <peartben@gmail.com>
---
 read-cache.c | 97 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 81 insertions(+), 16 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index af2605a168..77083ab8bb 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,7 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#include "thread-utils.h"
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1890,6 +1891,46 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	const char *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+};
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+			p->mmap + src_offset,
+			p->mmap + src_offset + 8,
+			extsize) < 0) {
+			munmap((void *)p->mmap, p->mmap_size);
+			die(_("index file corrupt"));
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1900,6 +1941,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	const char *mmap;
 	size_t mmap_size;
 	const struct cache_entry *previous_ce = NULL;
+	struct load_index_extensions p;
+	size_t extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,6 +1982,30 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads > 1) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			int err;
+
+			p.src_offset = extension_offset;
+			err = pthread_create(&p.pthread, NULL, load_index_extensions, &p);
+			if (err)
+				die(_("unable to create load_index_extensions thread: %s"), strerror(err));
+
+			nr_threads--;
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
@@ -1960,22 +2030,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		extsize = get_be32(mmap + src_offset + 4);
-		if (read_index_extension(istate,
-					 mmap + src_offset,
-					 mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset) {
+		int ret = pthread_join(p.pthread, NULL);
+		if (ret)
+			die(_("unable to join load_index_extensions thread: %s"), strerror(ret));
+	}
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v7 6/7] ieot: add Index Entry Offset Table (IEOT) extension
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
                     ` (4 preceding siblings ...)
  2018-10-01 13:45   ` [PATCH v7 5/7] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-10-01 13:45   ` Ben Peart
  2018-10-01 16:27     ` Duy Nguyen
  2018-10-01 13:45   ` [PATCH v7 7/7] read-cache: load cache entries on worker threads Ben Peart
  6 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:45 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

This patch enables addressing the CPU cost of loading the index by adding
additional data to the index that will allow us to efficiently multi-
thread the loading and conversion of cache entries.

It accomplishes this by adding an (optional) index extension that is a
table of offsets to blocks of cache entries in the index file.  To make
this work for V4 indexes, when writing the cache entries, it periodically
"resets" the prefix-compression by encoding the current entry as if the
path name for the previous entry is completely different and saves the
offset of that entry in the IEOT.  Basically, with V4 indexes, it
generates offsets into blocks of prefix-compressed entries.

Signed-off-by: Ben Peart <peartben@gmail.com>
---
 Documentation/technical/index-format.txt |  18 +++
 read-cache.c                             | 173 +++++++++++++++++++++++
 2 files changed, 191 insertions(+)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index 6bc2d90f7f..7c4d67aa6a 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -337,3 +337,21 @@ The remaining data of each directory block is grouped by type:
 
 	SHA-1("TREE" + <binary representation of N> +
 		"REUC" + <binary representation of M>)
+
+== Index Entry Offset Table
+
+  The Index Entry Offset Table (IEOT) is used to help address the CPU
+  cost of loading the index by enabling multi-threading the process of
+  converting cache entries from the on-disk format to the in-memory format.
+  The signature for this extension is { 'I', 'E', 'O', 'T' }.
+
+  The extension consists of:
+
+  - 32-bit version (currently 1)
+
+  - A number of index offset entries each consisting of:
+
+    - 32-bit offset from the beginning of the file to the first cache entry
+	in this block of entries.
+
+    - 32-bit count of cache entries in this block
diff --git a/read-cache.c b/read-cache.c
index 77083ab8bb..9557376e78 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -45,6 +45,7 @@
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
 #define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
+#define CACHE_EXT_INDEXENTRYOFFSETTABLE 0x49454F54 /* "IEOT" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1696,6 +1697,7 @@ static int read_index_extension(struct index_state *istate,
 		read_fsmonitor_extension(istate, data, sz);
 		break;
 	case CACHE_EXT_ENDOFINDEXENTRIES:
+	case CACHE_EXT_INDEXENTRYOFFSETTABLE:
 		/* already handled in do_read_index() */
 		break;
 	default:
@@ -1888,6 +1890,23 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+struct index_entry_offset
+{
+	/* starting byte offset into index file, count of index entries in this block */
+	int offset, nr;
+};
+
+struct index_entry_offset_table
+{
+	int nr;
+	struct index_entry_offset entries[0];
+};
+
+#ifndef NO_PTHREADS
+static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset);
+static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot);
+#endif
+
 static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
 
@@ -1931,6 +1950,15 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+
+#define THREAD_COST		(10000)
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2523,6 +2551,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
 	off_t offset;
+	int ieot_work = 1;
+	struct index_entry_offset_table *ieot = NULL;
+	int nr;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2556,7 +2587,38 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
 		return -1;
 
+#ifndef NO_PTHREADS
+	if ((nr = git_config_get_index_threads()) != 1) {
+		int ieot_blocks, cpus;
+
+		/*
+		 * ensure default number of ieot blocks maps evenly to the
+		 * default number of threads that will process them
+		 */
+		if (!nr) {
+			ieot_blocks = istate->cache_nr / THREAD_COST;
+			cpus = online_cpus();
+			if (ieot_blocks > cpus - 1)
+				ieot_blocks = cpus - 1;
+		} else {
+			ieot_blocks = nr;
+		}
+
+		/*
+		 * no reason to write out the IEOT extension if we don't
+		 * have enough blocks to utilize multi-threading
+		 */
+		if (ieot_blocks > 1) {
+			ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
+				+ (ieot_blocks * sizeof(struct index_entry_offset)));
+			ieot->nr = 0;
+			ieot_work = DIV_ROUND_UP(entries, ieot_blocks);
+		}
+	}
+#endif
+
 	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	nr = 0;
 	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
 
 	for (i = 0; i < entries; i++) {
@@ -2578,11 +2640,31 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 
 			drop_cache_tree = 1;
 		}
+		if (ieot && i && (i % ieot_work == 0)) {
+			ieot->entries[ieot->nr].nr = nr;
+			ieot->entries[ieot->nr].offset = offset;
+			ieot->nr++;
+			/*
+			 * If we have a V4 index, set the first byte to an invalid
+			 * character to ensure there is nothing common with the previous
+			 * entry
+			 */
+			if (previous_name)
+				previous_name->buf[0] = 0;
+			nr = 0;
+			offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+		}
 		if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
 			err = -1;
 
 		if (err)
 			break;
+		nr++;
+	}
+	if (ieot && nr) {
+		ieot->entries[ieot->nr].nr = nr;
+		ieot->entries[ieot->nr].offset = offset;
+		ieot->nr++;
 	}
 	strbuf_release(&previous_name_buf);
 
@@ -2593,6 +2675,26 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
 	the_hash_algo->init_fn(&eoie_c);
 
+	/*
+	 * Let's write out CACHE_EXT_INDEXENTRYOFFSETTABLE first so that we
+	 * can minimize the number of extensions we have to scan through to
+	 * find it during load.  Write it out regardless of the
+	 * strip_extensions parameter as we need it when loading the shared
+	 * index.
+	 */
+#ifndef NO_PTHREADS
+	if (ieot) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_ieot_extension(&sb, ieot);
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+#endif
+
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
@@ -3176,3 +3278,74 @@ static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context,
 	the_hash_algo->final_fn(hash, eoie_context);
 	strbuf_add(sb, hash, the_hash_algo->rawsz);
 }
+
+#ifndef NO_PTHREADS
+#define IEOT_VERSION	(1)
+
+static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset)
+{
+       const char *index = NULL;
+       uint32_t extsize, ext_version;
+       struct index_entry_offset_table *ieot;
+       int i, nr;
+
+       /* find the IEOT extension */
+       if (!offset)
+	       return NULL;
+       while (offset <= mmap_size - the_hash_algo->rawsz - 8) {
+	       extsize = get_be32(mmap + offset + 4);
+	       if (CACHE_EXT((mmap + offset)) == CACHE_EXT_INDEXENTRYOFFSETTABLE) {
+		       index = mmap + offset + 4 + 4;
+		       break;
+	       }
+	       offset += 8;
+	       offset += extsize;
+       }
+       if (!index)
+	       return NULL;
+
+       /* validate the version is IEOT_VERSION */
+       ext_version = get_be32(index);
+       if (ext_version != IEOT_VERSION)
+	       return NULL;
+       index += sizeof(uint32_t);
+
+       /* extension size - version bytes / bytes per entry */
+       nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));
+       if (!nr)
+	       return NULL;
+       ieot = xmalloc(sizeof(struct index_entry_offset_table)
+	       + (nr * sizeof(struct index_entry_offset)));
+       ieot->nr = nr;
+       for (i = 0; i < nr; i++) {
+	       ieot->entries[i].offset = get_be32(index);
+	       index += sizeof(uint32_t);
+	       ieot->entries[i].nr = get_be32(index);
+	       index += sizeof(uint32_t);
+       }
+
+       return ieot;
+}
+
+static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot)
+{
+       uint32_t buffer;
+       int i;
+
+       /* version */
+       put_be32(&buffer, IEOT_VERSION);
+       strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+       /* ieot */
+       for (i = 0; i < ieot->nr; i++) {
+
+	       /* offset */
+	       put_be32(&buffer, ieot->entries[i].offset);
+	       strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	       /* count */
+	       put_be32(&buffer, ieot->entries[i].nr);
+	       strbuf_add(sb, &buffer, sizeof(uint32_t));
+       }
+}
+#endif
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v7 7/7] read-cache: load cache entries on worker threads
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
                     ` (5 preceding siblings ...)
  2018-10-01 13:45   ` [PATCH v7 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
@ 2018-10-01 13:45   ` Ben Peart
  2018-10-01 17:09     ` Duy Nguyen
  6 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-10-01 13:45 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

This patch helps address the CPU cost of loading the index by utilizing
the Index Entry Offset Table (IEOT) to divide loading and conversion of
the cache entries across multiple threads in parallel.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files reduced the time by 32.24%
Test w/1,000,000 files reduced the time by -4.77%

Note that on the 1,000,000 files case, multi-threading the cache entry parsing
does not yield a performance win.  This is because the cost to parse the
index extensions in this repo far outweighs the cost of loading the cache
entries.

The high cost of parsing the index extensions is driven by the cache tree
and the untracked cache extensions. As this is currently the longest pole,
any reduction in this time will reduce the overall index load times so is
worth further investigation in another patch series.

Signed-off-by: Ben Peart <peartben@gmail.com>
---
 read-cache.c | 224 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 189 insertions(+), 35 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 9557376e78..14402a0738 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1720,7 +1720,8 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *create_from_disk(struct index_state *istate,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
 					    const struct cache_entry *previous_ce)
@@ -1737,7 +1738,7 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 	 * number of bytes to be stripped from the end of the previous name,
 	 * and the bytes to append to the result, to come up with its name.
 	 */
-	int expand_name_field = istate->version == 4;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1761,16 +1762,17 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 		const unsigned char *cp = (const unsigned char *)name;
 		size_t strip_len, previous_len;
 
-		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		/* If we're at the beginning of a block, ignore the previous name */
 		strip_len = decode_varint(&cp);
-		if (previous_len < strip_len) {
-			if (previous_ce)
+		if (previous_ce) {
+			previous_len = previous_ce->ce_namelen;
+			if (previous_len < strip_len)
 				die(_("malformed name field in the index, near path '%s'"),
-				    previous_ce->name);
-			else
-				die(_("malformed name field in the index in the first path"));
+					previous_ce->name);
+			copy_len = previous_len - strip_len;
+		} else {
+			copy_len = 0;
 		}
-		copy_len = previous_len - strip_len;
 		name = (const char *)cp;
 	}
 
@@ -1780,7 +1782,7 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 			len += copy_len;
 	}
 
-	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
 
 	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
 	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
@@ -1950,6 +1952,52 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
+			unsigned long start_offset, const struct cache_entry *previous_ce)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+		previous_ce = ce;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, NULL);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
 /*
  * Mostly randomly chosen maximum thread counts: we
  * cap the parallelism to online_cpus() threads, and we want
@@ -1959,20 +2007,125 @@ static void *load_index_extensions(void *_data)
 
 #define THREAD_COST		(10000)
 
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset;
+	const char *mmap;
+	struct index_entry_offset_table *ieot;
+	int ieot_offset;        /* starting index into the ieot array */
+	int ieot_work;          /* count of ieot entries to process */
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+	int i;
+
+	/* iterate across all ieot blocks assigned to this thread */
+	for (i = p->ieot_offset; i < p->ieot_offset + p->ieot_work; i++) {
+		p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->ieot->entries[i].nr, p->mmap, p->ieot->entries[i].offset, NULL);
+		p->offset += p->ieot->entries[i].nr;
+	}
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
+			unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)
+{
+	int i, offset, ieot_work, ieot_offset, err;
+	struct load_cache_entries_thread_data *data;
+	unsigned long consumed = 0;
+	int nr;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/* ensure we have no more threads than we have blocks to process */
+	if (nr_threads > ieot->nr)
+		nr_threads = ieot->nr;
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	offset = ieot_offset = 0;
+	ieot_work = DIV_ROUND_UP(ieot->nr, nr_threads);
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = &data[i];
+		int j;
+
+		if (ieot_offset + ieot_work > ieot->nr)
+			ieot_work = ieot->nr - ieot_offset;
+
+		p->istate = istate;
+		p->offset = offset;
+		p->mmap = mmap;
+		p->ieot = ieot;
+		p->ieot_offset = ieot_offset;
+		p->ieot_work = ieot_work;
+
+		/* create a mem_pool for each thread */
+		nr = 0;
+		for (j = p->ieot_offset; j < p->ieot_offset + p->ieot_work; j++)
+			nr += p->ieot->entries[j].nr;
+		if (istate->version == 4) {
+			mem_pool_init(&p->ce_mem_pool,
+				estimate_cache_size_from_compressed(nr));
+		}
+		else {
+			mem_pool_init(&p->ce_mem_pool,
+				estimate_cache_size(mmap_size, nr));
+		}
+
+		err = pthread_create(&p->pthread, NULL, load_cache_entries_thread, p);
+		if (err)
+			die(_("unable to create load_cache_entries thread: %s"), strerror(err));
+
+		/* increment by the number of cache entries in the ieot block being processed */
+		for (j = 0; j < ieot_work; j++)
+			offset += ieot->entries[ieot_offset + j].nr;
+		ieot_offset += ieot_work;
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = &data[i];
+
+		err = pthread_join(p->pthread, NULL);
+		if (err)
+			die(_("unable to join load_cache_entries thread: %s"), strerror(err));
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		consumed += p->consumed;
+	}
+
+	free(data);
+
+	return consumed;
+}
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	const struct cache_header *hdr;
 	const char *mmap;
 	size_t mmap_size;
-	const struct cache_entry *previous_ce = NULL;
 	struct load_index_extensions p;
 	size_t extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int nr_threads, cpus;
+	struct index_entry_offset_table *ieot = NULL;
 #endif
 
 	if (istate->initialized)
@@ -2014,10 +2167,18 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+
+	/* TODO: does creating more threads than cores help? */
+	if (!nr_threads) {
+		nr_threads = istate->cache_nr / THREAD_COST;
+		cpus = online_cpus();
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
 
 	if (nr_threads > 1) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2032,29 +2193,22 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			nr_threads--;
 		}
 	}
-#endif
-
-	if (istate->version == 4) {
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
 
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
+	/*
+	 * Locate and read the index entry offset table so that we can use it
+	 * to multi-thread the reading of the cache entries.
+	 */
+	if (extension_offset && nr_threads > 1)
+		ieot = read_ieot_extension(mmap, mmap_size, extension_offset);
 
-		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
-		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
-		set_index_entry(istate, i, ce);
+	if (ieot)
+		src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, src_offset, nr_threads, ieot);
+	else
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#endif
 
-		src_offset += consumed;
-		previous_ce = ce;
-	}
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v6 4/7] config: add new index.threads config setting
  2018-10-01 13:17                 ` Ben Peart
@ 2018-10-01 15:06                   ` SZEDER Gábor
  0 siblings, 0 replies; 153+ messages in thread
From: SZEDER Gábor @ 2018-10-01 15:06 UTC (permalink / raw)
  To: Ben Peart
  Cc: Junio C Hamano, Ramsay Jones, git, pclouds, Ben Peart, Ben Peart

On Mon, Oct 01, 2018 at 09:17:53AM -0400, Ben Peart wrote:
> 
> 
> On 9/28/2018 6:15 PM, Junio C Hamano wrote:
> >Ramsay Jones <ramsay@ramsayjones.plus.com> writes:
> >
> >>>                 if (!nr) {
> >>>                         ieot_blocks = istate->cache_nr / THREAD_COST;
> >>>-                       if (ieot_blocks < 1)
> >>>-                               ieot_blocks = 1;
> >>>                         cpus = online_cpus();
> >>>                         if (ieot_blocks > cpus - 1)
> >>>                                 ieot_blocks = cpus - 1;
> >>
> >>So, am I reading this correctly - you need cpus > 2 before an
> >>IEOT extension block is written out?
> >>
> >>OK.
> >
> >Why should we be even calling online_cpus() in this codepath to
> >write the index in a single thread to begin with?
> >
> >The number of cpus that readers would use to read this index file
> >has nothing to do with the number of cpus available to this
> >particular writer process.
> >
> 
> As I mentioned in my other reply, this is optimizing for the most common
> case where the index is read from the same machine that wrote it and the
> user is taking the default settings (ie index.threads=true).

I think this is a reasonable assumption to make, but it should be
mentioned in the relevant commit message.  Alas, as far as I can tell,
not a single commit message has been updated in v7.

> Aligning the number of blocks to the number of threads that will be
> processing them avoids situations where one thread may have up to double the
> work to do as the other threads (for example, if there were 3 blocks to be
> processed by 2 threads).

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 2/7] read-cache: clean up casting and byte decoding
  2018-10-01 13:45   ` [PATCH v7 2/7] read-cache: clean up casting and byte decoding Ben Peart
@ 2018-10-01 15:10     ` Duy Nguyen
  0 siblings, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-10-01 15:10 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
>
> From: Ben Peart <benpeart@microsoft.com>
>
> This patch does a clean up pass to minimize the casting required to work
> with the memory mapped index (mmap).
>
> It also makes the decoding of network byte order more consistent by using
> get_be32() where possible.
>
> Signed-off-by: Ben Peart <peartben@gmail.com>
> ---
>  read-cache.c | 23 +++++++++++------------
>  1 file changed, 11 insertions(+), 12 deletions(-)
>
> diff --git a/read-cache.c b/read-cache.c
> index 583a4fb1f8..6ba99e2c96 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1650,7 +1650,7 @@ int verify_index_checksum;
>  /* Allow fsck to force verification of the cache entry order. */
>  int verify_ce_order;
>
> -static int verify_hdr(struct cache_header *hdr, unsigned long size)
> +static int verify_hdr(const struct cache_header *hdr, unsigned long size)

OK more constness. Good.

>  {
>         git_hash_ctx c;
>         unsigned char hash[GIT_MAX_RAWSZ];
> @@ -1674,7 +1674,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
>  }
>
>  static int read_index_extension(struct index_state *istate,
> -                               const char *ext, void *data, unsigned long sz)
> +                               const char *ext, const char *data, unsigned long sz)

But it's not clear why you need to change the data type from void * to
char * here. I guess all the consumer functions take 'const char *'
anyway, so it's best to use 'const char *'?

Not worth a reroll (to give a reason why you do this in the commit
message), unless there are other changes.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-10-01 13:45   ` [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-10-01 15:17     ` SZEDER Gábor
  2018-10-02 14:34       ` Ben Peart
  2018-10-01 15:30     ` Duy Nguyen
  1 sibling, 1 reply; 153+ messages in thread
From: SZEDER Gábor @ 2018-10-01 15:17 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart

On Mon, Oct 01, 2018 at 09:45:52AM -0400, Ben Peart wrote:
> From: Ben Peart <benpeart@microsoft.com>
> 
> The End of Index Entry (EOIE) is used to locate the end of the variable
> length index entries and the beginning of the extensions. Code can take
> advantage of this to quickly locate the index extensions without having
> to parse through all of the index entries.
> 
> Because it must be able to be loaded before the variable length cache
> entries and other index extensions, this extension must be written last.
> The signature for this extension is { 'E', 'O', 'I', 'E' }.
> 
> The extension consists of:
> 
> - 32-bit offset to the end of the index entries
> 
> - 160-bit SHA-1 over the extension types and their sizes (but not
> their contents).  E.g. if we have "TREE" extension that is N-bytes
> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> then the hash would be:
> 
> SHA-1("TREE" + <binary representation of N> +
> 	"REUC" + <binary representation of M>)
> 
> Signed-off-by: Ben Peart <peartben@gmail.com>

I think the commit message should explicitly mention that this
extension

  - will always be written and why,
  - but is optional, so other Git implementations not supporting it will
    have no troubles reading the index,
  - and that it is written even to the shared index and why, and that
    because of this the index checksums in t1700 had to be updated.


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-10-01 13:45   ` [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-10-01 15:17     ` SZEDER Gábor
@ 2018-10-01 15:30     ` Duy Nguyen
  2018-10-02 15:13       ` Ben Peart
  1 sibling, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-10-01 15:30 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
> @@ -2479,6 +2491,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>         if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
>                 return -1;
>
> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;

Note, lseek() could in theory return -1 on error. Looking at the error
code list in the man page it's pretty unlikely though, unless

> +static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
> +{
> +       /*
> +        * The end of index entries (EOIE) extension is guaranteed to be last
> +        * so that it can be found by scanning backwards from the EOF.
> +        *
> +        * "EOIE"
> +        * <4-byte length>
> +        * <4-byte offset>
> +        * <20-byte hash>
> +        */
> +       const char *index, *eoie;
> +       uint32_t extsize;
> +       size_t offset, src_offset;
> +       unsigned char hash[GIT_MAX_RAWSZ];
> +       git_hash_ctx c;
> +
> +       /* ensure we have an index big enough to contain an EOIE extension */
> +       if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)

Using sizeof() for on-disk structures could be dangerous because you
don't know how much padding there could be (I'm not sure if it's
actually specified in the C language spec). I've checked, on at least
x86 and amd64, sizeof(struct cache_header) is 12 bytes, but I don't
know if there are any crazy architectures out there that set higher
padding.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 5/7] read-cache: load cache extensions on a worker thread
  2018-10-01 13:45   ` [PATCH v7 5/7] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-10-01 15:50     ` Duy Nguyen
  2018-10-02 15:00       ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-10-01 15:50 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
> @@ -1890,6 +1891,46 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>  static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
>  static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
>
> +struct load_index_extensions
> +{
> +#ifndef NO_PTHREADS
> +       pthread_t pthread;
> +#endif
> +       struct index_state *istate;
> +       const char *mmap;
> +       size_t mmap_size;
> +       unsigned long src_offset;
> +};
> +
> +static void *load_index_extensions(void *_data)
> +{
> +       struct load_index_extensions *p = _data;
> +       unsigned long src_offset = p->src_offset;
> +
> +       while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> +               /* After an array of active_nr index entries,
> +                * there can be arbitrary number of extended
> +                * sections, each of which is prefixed with
> +                * extension name (4-byte) and section length
> +                * in 4-byte network byte order.
> +                */
> +               uint32_t extsize;
> +               memcpy(&extsize, p->mmap + src_offset + 4, 4);
> +               extsize = ntohl(extsize);

This could be get_be32() so that the next person will not need to do
another cleanup patch.

> +               if (read_index_extension(p->istate,
> +                       p->mmap + src_offset,
> +                       p->mmap + src_offset + 8,
> +                       extsize) < 0) {

This alignment is misleading because the conditions are aligned with
the code block below. If you can't align it with the '(', then just
add another tab.

> +                       munmap((void *)p->mmap, p->mmap_size);

This made me pause for a bit since we should not need to cast back to
void *. It turns out you need this because mmap pointer is const. But
you don't even need to munmap here. We're dying, the OS will clean
everything up.

> +                       die(_("index file corrupt"));
> +               }
> +               src_offset += 8;
> +               src_offset += extsize;
> +       }
> +
> +       return NULL;
> +}
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 6/7] ieot: add Index Entry Offset Table (IEOT) extension
  2018-10-01 13:45   ` [PATCH v7 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
@ 2018-10-01 16:27     ` Duy Nguyen
  2018-10-02 16:34       ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-10-01 16:27 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
> @@ -1888,6 +1890,23 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>         return ondisk_size + entries * per_entry;
>  }
>
> +struct index_entry_offset
> +{
> +       /* starting byte offset into index file, count of index entries in this block */
> +       int offset, nr;

uint32_t?

> +};
> +
> +struct index_entry_offset_table
> +{
> +       int nr;
> +       struct index_entry_offset entries[0];

Use FLEX_ARRAY. Some compilers are not happy with an array of zero
items if I remember correctly.

> @@ -2523,6 +2551,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>         struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>         int drop_cache_tree = istate->drop_cache_tree;
>         off_t offset;
> +       int ieot_work = 1;
> +       struct index_entry_offset_table *ieot = NULL;
> +       int nr;

There is a bunch of stuff going on in this function, maybe rename
this to nr_threads or nr_blocks to be less generic.

>
>         for (i = removed = extended = 0; i < entries; i++) {
>                 if (cache[i]->ce_flags & CE_REMOVE)
> @@ -2556,7 +2587,38 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>         if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
>                 return -1;
>
> +#ifndef NO_PTHREADS
> +       if ((nr = git_config_get_index_threads()) != 1) {

Maybe keep this assignment out of "if".

> +               int ieot_blocks, cpus;
> +
> +               /*
> +                * ensure default number of ieot blocks maps evenly to the
> +                * default number of threads that will process them
> +                */
> +               if (!nr) {
> +                       ieot_blocks = istate->cache_nr / THREAD_COST;
> +                       cpus = online_cpus();
> +                       if (ieot_blocks > cpus - 1)
> +                               ieot_blocks = cpus - 1;

The " - 1" here is for the extension thread, yes? Probably worth a comment.

> +               } else {
> +                       ieot_blocks = nr;
> +               }
> +
> +               /*
> +                * no reason to write out the IEOT extension if we don't
> +                * have enough blocks to utilize multi-threading
> +                */
> +               if (ieot_blocks > 1) {
> +                       ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
> +                               + (ieot_blocks * sizeof(struct index_entry_offset)));

Use FLEX_ALLOC_MEM() after you declare ..._table with FLEX_ARRAY.

This ieot needs to be freed also and should be before any "return -1"
in this function.

> +                       ieot->nr = 0;
> +                       ieot_work = DIV_ROUND_UP(entries, ieot_blocks);

Perhaps a better name for ioet_work? This looks like the number of
cache entries per block.

> +               }
> +       }
> +#endif
> +
>         offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> +       nr = 0;

Eh.. repurpose nr to count cache entries now? It's kinda hard to follow.

>         previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
>
>         for (i = 0; i < entries; i++) {
> @@ -2578,11 +2640,31 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>
>                         drop_cache_tree = 1;
>                 }
> +               if (ieot && i && (i % ieot_work == 0)) {
> +                       ieot->entries[ieot->nr].nr = nr;
> +                       ieot->entries[ieot->nr].offset = offset;
> +                       ieot->nr++;
> +                       /*
> +                        * If we have a V4 index, set the first byte to an invalid
> +                        * character to ensure there is nothing common with the previous
> +                        * entry
> +                        */
> +                       if (previous_name)
> +                               previous_name->buf[0] = 0;
> +                       nr = 0;
> +                       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;

This only works correctly if the ce_write_entry() from the last
iteration has flushed everything to out to newfd. Maybe it does, but
it's error prone to rely on that in my opinion. Maybe we need an
explicit ce_write_flush() here to make sure.

> +               }
>                 if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
>                         err = -1;
>
>                 if (err)
>                         break;
> +               nr++;
> +       }
> +       if (ieot && nr) {
> +               ieot->entries[ieot->nr].nr = nr;
> +               ieot->entries[ieot->nr].offset = offset;
> +               ieot->nr++;
>         }
>         strbuf_release(&previous_name_buf);
>
> @@ -2593,6 +2675,26 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>         offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
>         the_hash_algo->init_fn(&eoie_c);
>
> +       /*
> +        * Lets write out CACHE_EXT_INDEXENTRYOFFSETTABLE first so that we
> +        * can minimze the number of extensions we have to scan through to

s/minimze/minimize/

> +        * find it during load.  Write it out regardless of the
> +        * strip_extensions parameter as we need it when loading the shared
> +        * index.
> +        */
> +#ifndef NO_PTHREADS
> +       if (ieot) {
> +               struct strbuf sb = STRBUF_INIT;
> +
> +               write_ieot_extension(&sb, ieot);
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0
> +                       || ce_write(&c, newfd, sb.buf, sb.len) < 0;
> +               strbuf_release(&sb);
> +               if (err)
> +                       return -1;
> +       }
> +#endif
> +
>         if (!strip_extensions && istate->split_index) {
>                 struct strbuf sb = STRBUF_INIT;
>
> @@ -3176,3 +3278,74 @@ static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context,
>         the_hash_algo->final_fn(hash, eoie_context);
>         strbuf_add(sb, hash, the_hash_algo->rawsz);
>  }
> +
> +#ifndef NO_PTHREADS
> +#define IEOT_VERSION   (1)
> +
> +static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset)
> +{
> +       const char *index = NULL;
> +       uint32_t extsize, ext_version;
> +       struct index_entry_offset_table *ieot;
> +       int i, nr;
> +
> +       /* find the IEOT extension */
> +       if (!offset)
> +              return NULL;
> +       while (offset <= mmap_size - the_hash_algo->rawsz - 8) {
> +              extsize = get_be32(mmap + offset + 4);
> +              if (CACHE_EXT((mmap + offset)) == CACHE_EXT_INDEXENTRYOFFSETTABLE) {
> +                      index = mmap + offset + 4 + 4;
> +                      break;
> +              }
> +              offset += 8;
> +              offset += extsize;
> +       }

Maybe refactor this loop. I think I've seen this in at least two
places now. Probably three?

> +       if (!index)
> +              return NULL;
> +
> +       /* validate the version is IEOT_VERSION */
> +       ext_version = get_be32(index);
> +       if (ext_version != IEOT_VERSION)
> +              return NULL;

Report the error (e.g. "unsupported version" or something)

> +       index += sizeof(uint32_t);
> +
> +       /* extension size - version bytes / bytes per entry */
> +       nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));

Do we need to check if "(extsize - version) % sizeof(entry) == 0"?

> +       if (!nr)
> +              return NULL;
> +       ieot = xmalloc(sizeof(struct index_entry_offset_table)
> +              + (nr * sizeof(struct index_entry_offset)));
> +       ieot->nr = nr;
> +       for (i = 0; i < nr; i++) {
> +              ieot->entries[i].offset = get_be32(index);
> +              index += sizeof(uint32_t);
> +              ieot->entries[i].nr = get_be32(index);
> +              index += sizeof(uint32_t);
> +       }
> +
> +       return ieot;
> +}
> +
> +static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot)
> +{
> +       uint32_t buffer;
> +       int i;
> +
> +       /* version */
> +       put_be32(&buffer, IEOT_VERSION);
> +       strbuf_add(sb, &buffer, sizeof(uint32_t));
> +
> +       /* ieot */
> +       for (i = 0; i < ieot->nr; i++) {
> +
> +              /* offset */
> +              put_be32(&buffer, ieot->entries[i].offset);
> +              strbuf_add(sb, &buffer, sizeof(uint32_t));
> +
> +              /* count */
> +              put_be32(&buffer, ieot->entries[i].nr);
> +              strbuf_add(sb, &buffer, sizeof(uint32_t));
> +       }
> +}
> +#endif
> --
> 2.18.0.windows.1
>


-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 7/7] read-cache: load cache entries on worker threads
  2018-10-01 13:45   ` [PATCH v7 7/7] read-cache: load cache entries on worker threads Ben Peart
@ 2018-10-01 17:09     ` Duy Nguyen
  2018-10-02 19:09       ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-10-01 17:09 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
> +/*
> + * A helper function that will load the specified range of cache entries
> + * from the memory mapped file and add them to the given index.
> + */
> +static unsigned long load_cache_entry_block(struct index_state *istate,
> +                       struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,

Please use unsigned long for offset (here and in the thread_data
struct). We should use off_t instead, but that's out of scope. At
least keep offset type consistent in here.

> +                       unsigned long start_offset, const struct cache_entry *previous_ce)

I don't think you want to pass previous_ce in. You always pass NULL
anyway. And if this function is about loading a block (i.e. at block
boundary) then initial previous_ce _must_ be NULL or things break
horribly.

> @@ -1959,20 +2007,125 @@ static void *load_index_extensions(void *_data)
>
>  #define THREAD_COST            (10000)
>
> +struct load_cache_entries_thread_data
> +{
> +       pthread_t pthread;
> +       struct index_state *istate;
> +       struct mem_pool *ce_mem_pool;
> +       int offset;
> +       const char *mmap;
> +       struct index_entry_offset_table *ieot;
> +       int ieot_offset;        /* starting index into the ieot array */

If it's an index, maybe just name it ieot_index and we can get rid of
the comment.

> +       int ieot_work;          /* count of ieot entries to process */

Maybe instead of saving the whole "ieot" table here. Add

     struct index_entry_offset *blocks;

which points to the starting block for this thread and rename that
mysterious (to me) ieot_work to nr_blocks. The thread will have access
from blocks[0] to blocks[nr_blocks - 1]

> +       unsigned long consumed; /* return # of bytes in index file processed */
> +};
> +
> +/*
> + * A thread proc to run the load_cache_entries() computation
> + * across multiple background threads.
> + */
> +static void *load_cache_entries_thread(void *_data)
> +{
> +       struct load_cache_entries_thread_data *p = _data;
> +       int i;
> +
> +       /* iterate across all ieot blocks assigned to this thread */
> +       for (i = p->ieot_offset; i < p->ieot_offset + p->ieot_work; i++) {
> +               p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->ieot->entries[i].nr, p->mmap, p->ieot->entries[i].offset, NULL);

Please wrap this long line.

> +               p->offset += p->ieot->entries[i].nr;
> +       }
> +       return NULL;
> +}
> +
> +static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
> +                       unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)
> +{
> +       int i, offset, ieot_work, ieot_offset, err;
> +       struct load_cache_entries_thread_data *data;
> +       unsigned long consumed = 0;
> +       int nr;
> +
> +       /* a little sanity checking */
> +       if (istate->name_hash_initialized)
> +               BUG("the name hash isn't thread safe");
> +
> +       mem_pool_init(&istate->ce_mem_pool, 0);
> +       data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));

we normally use sizeof(*data) instead of sizeof(struct ...)

> +
> +       /* ensure we have no more threads than we have blocks to process */
> +       if (nr_threads > ieot->nr)
> +               nr_threads = ieot->nr;
> +       data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));

eh.. reallocate the same "data"?

> +
> +       offset = ieot_offset = 0;
> +       ieot_work = DIV_ROUND_UP(ieot->nr, nr_threads);
> +       for (i = 0; i < nr_threads; i++) {
> +               struct load_cache_entries_thread_data *p = &data[i];
> +               int j;
> +
> +               if (ieot_offset + ieot_work > ieot->nr)
> +                       ieot_work = ieot->nr - ieot_offset;
> +
> +               p->istate = istate;
> +               p->offset = offset;
> +               p->mmap = mmap;
> +               p->ieot = ieot;
> +               p->ieot_offset = ieot_offset;
> +               p->ieot_work = ieot_work;
> +
> +               /* create a mem_pool for each thread */
> +               nr = 0;

Since nr is only used in this for loop. Declare it in this scope
instead of declaring it for the whole function.

> +               for (j = p->ieot_offset; j < p->ieot_offset + p->ieot_work; j++)
> +                       nr += p->ieot->entries[j].nr;
> +               if (istate->version == 4) {
> +                       mem_pool_init(&p->ce_mem_pool,
> +                               estimate_cache_size_from_compressed(nr));
> +               }
> +               else {
> +                       mem_pool_init(&p->ce_mem_pool,
> +                               estimate_cache_size(mmap_size, nr));
> +               }

Maybe keep this mem_pool_init code inside load_cache_entries_thread(),
similar to how you do it for load_all_cache_entries(). It's mostly
to keep this loop shorter and easier to see (and understand); of course
parallelizing this mem_pool_init() is just noise.

> +
> +               err = pthread_create(&p->pthread, NULL, load_cache_entries_thread, p);
> +               if (err)
> +                       die(_("unable to create load_cache_entries thread: %s"), strerror(err));
> +
> +               /* increment by the number of cache entries in the ieot block being processed */
> +               for (j = 0; j < ieot_work; j++)
> +                       offset += ieot->entries[ieot_offset + j].nr;

I wonder if it makes things simpler if you store cache_entry _index_
in the entries[] array instead of storing the number of entries. You can
easily calculate nr then by doing entries[i].index -
entries[i-1].index. And you can count multiple blocks the same way,
without looping like this.

> +               ieot_offset += ieot_work;
> +       }
> +
> +       for (i = 0; i < nr_threads; i++) {
> +               struct load_cache_entries_thread_data *p = &data[i];
> +
> +               err = pthread_join(p->pthread, NULL);
> +               if (err)
> +                       die(_("unable to join load_cache_entries thread: %s"), strerror(err));
> +               mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
> +               consumed += p->consumed;
> +       }
> +
> +       free(data);
> +
> +       return consumed;
> +}
> +#endif
> +
>  /* remember to discard_cache() before reading a different cache! */
>  int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  {
> -       int fd, i;
> +       int fd;
>         struct stat st;
>         unsigned long src_offset;
>         const struct cache_header *hdr;
>         const char *mmap;
>         size_t mmap_size;
> -       const struct cache_entry *previous_ce = NULL;
>         struct load_index_extensions p;
>         size_t extension_offset = 0;
>  #ifndef NO_PTHREADS
> -       int nr_threads;
> +       int nr_threads, cpus;
> +       struct index_entry_offset_table *ieot = NULL;
>  #endif
>
>         if (istate->initialized)
> @@ -2014,10 +2167,18 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>         p.mmap = mmap;
>         p.mmap_size = mmap_size;
>
> +       src_offset = sizeof(*hdr);

OK we've been doing this since forever, sizeof(struct cache_header)
probably does not have extra padding on any supported platform.

> @@ -2032,29 +2193,22 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>                         nr_threads--;
>                 }
>         }
> -#endif
> -
> -       if (istate->version == 4) {
> -               mem_pool_init(&istate->ce_mem_pool,
> -                             estimate_cache_size_from_compressed(istate->cache_nr));
> -       } else {
> -               mem_pool_init(&istate->ce_mem_pool,
> -                             estimate_cache_size(mmap_size, istate->cache_nr));
> -       }
>
> -       src_offset = sizeof(*hdr);
> -       for (i = 0; i < istate->cache_nr; i++) {
> -               struct ondisk_cache_entry *disk_ce;
> -               struct cache_entry *ce;
> -               unsigned long consumed;
> +       /*
> +        * Locate and read the index entry offset table so that we can use it
> +        * to multi-thread the reading of the cache entries.
> +        */
> +       if (extension_offset && nr_threads > 1)
> +               ieot = read_ieot_extension(mmap, mmap_size, extension_offset);

You need to free ieot at some point.

>
> -               disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
> -               ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
> -               set_index_entry(istate, i, ce);
> +       if (ieot)
> +               src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, src_offset, nr_threads, ieot);
> +       else
> +               src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
> +#else
> +       src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
> +#endif
>
> -               src_offset += consumed;
> -               previous_ce = ce;
> -       }
>         istate->timestamp.sec = st.st_mtime;
>         istate->timestamp.nsec = ST_MTIME_NSEC(st);
>
> --
> 2.18.0.windows.1
>
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-10-01 15:17     ` SZEDER Gábor
@ 2018-10-02 14:34       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-02 14:34 UTC (permalink / raw)
  To: SZEDER Gábor; +Cc: git, gitster, pclouds, Ben Peart



On 10/1/2018 11:17 AM, SZEDER Gábor wrote:
> On Mon, Oct 01, 2018 at 09:45:52AM -0400, Ben Peart wrote:
>> From: Ben Peart <benpeart@microsoft.com>
>>
>> The End of Index Entry (EOIE) is used to locate the end of the variable
>> length index entries and the beginning of the extensions. Code can take
>> advantage of this to quickly locate the index extensions without having
>> to parse through all of the index entries.
>>
>> Because it must be able to be loaded before the variable length cache
>> entries and other index extensions, this extension must be written last.
>> The signature for this extension is { 'E', 'O', 'I', 'E' }.
>>
>> The extension consists of:
>>
>> - 32-bit offset to the end of the index entries
>>
>> - 160-bit SHA-1 over the extension types and their sizes (but not
>> their contents).  E.g. if we have "TREE" extension that is N-bytes
>> long, "REUC" extension that is M-bytes long, followed by "EOIE",
>> then the hash would be:
>>
>> SHA-1("TREE" + <binary representation of N> +
>> 	"REUC" + <binary representation of M>)
>>
>> Signed-off-by: Ben Peart <peartben@gmail.com>
> 
> I think the commit message should explicitly mention that this
> extension
> 
>    - will always be written and why,
>    - but is optional, so other Git implementations not supporting it will
>      have no troubles reading the index,
>    - and that it is written even to the shared index and why, and that
>      because of this the index checksums in t1700 had to be updated.
> 

Sure, I'll add that additional information to the commit message on the 
next spin.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 5/7] read-cache: load cache extensions on a worker thread
  2018-10-01 15:50     ` Duy Nguyen
@ 2018-10-02 15:00       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-02 15:00 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 10/1/2018 11:50 AM, Duy Nguyen wrote:
> On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
>> @@ -1890,6 +1891,46 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>>   static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
>>   static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
>>
>> +struct load_index_extensions
>> +{
>> +#ifndef NO_PTHREADS
>> +       pthread_t pthread;
>> +#endif
>> +       struct index_state *istate;
>> +       const char *mmap;
>> +       size_t mmap_size;
>> +       unsigned long src_offset;
>> +};
>> +
>> +static void *load_index_extensions(void *_data)
>> +{
>> +       struct load_index_extensions *p = _data;
>> +       unsigned long src_offset = p->src_offset;
>> +
>> +       while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
>> +               /* After an array of active_nr index entries,
>> +                * there can be arbitrary number of extended
>> +                * sections, each of which is prefixed with
>> +                * extension name (4-byte) and section length
>> +                * in 4-byte network byte order.
>> +                */
>> +               uint32_t extsize;
>> +               memcpy(&extsize, p->mmap + src_offset + 4, 4);
>> +               extsize = ntohl(extsize);
> 
> This could be get_be32() so that the next person will not need to do
> another cleanup patch.
> 

Good point, it was existing code so I focused on doing the minimal 
change possible but I can clean it up since I'm touching it already.

>> +               if (read_index_extension(p->istate,
>> +                       p->mmap + src_offset,
>> +                       p->mmap + src_offset + 8,
>> +                       extsize) < 0) {
> 
> This alignment is misleading because the conditions are aligned with
> the code block below. If you can't align it with the '(', then just
> add another tab.
> 

Ditto. I'll make it:

		uint32_t extsize = get_be32(p->mmap + src_offset + 4);
		if (read_index_extension(p->istate,
					 p->mmap + src_offset,
					 p->mmap + src_offset + 8,
					 extsize) < 0) {
			munmap((void *)p->mmap, p->mmap_size);
			die(_("index file corrupt"));
		}


>> +                       munmap((void *)p->mmap, p->mmap_size);
> 
> This made me pause for a bit since we should not need to cast back to
> void *. It turns out you need this because mmap pointer is const. But
> you don't even need to munmap here. We're dying, the OS will clean
> everything up.
> 

I had the same thought about "we're about to die so why bother calling 
munmap() here" but I decided rather than change it, I'd follow the 
existing pattern just in case there was some platform/bug that required 
it.  It apparently doesn't cause harm as it's been that way a long time.

>> +                       die(_("index file corrupt"));
>> +               }
>> +               src_offset += 8;
>> +               src_offset += extsize;
>> +       }
>> +
>> +       return NULL;
>> +}

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-10-01 15:30     ` Duy Nguyen
@ 2018-10-02 15:13       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-02 15:13 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 10/1/2018 11:30 AM, Duy Nguyen wrote:
> On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
>> @@ -2479,6 +2491,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>          if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
>>                  return -1;
>>
>> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> 
> Note, lseek() could in theory return -1 on error. Looking at the error
> code list in the man page it's pretty unlikely though, unless
> 

Good catch. I'll add the logic to check for an error.

>> +static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
>> +{
>> +       /*
>> +        * The end of index entries (EOIE) extension is guaranteed to be last
>> +        * so that it can be found by scanning backwards from the EOF.
>> +        *
>> +        * "EOIE"
>> +        * <4-byte length>
>> +        * <4-byte offset>
>> +        * <20-byte hash>
>> +        */
>> +       const char *index, *eoie;
>> +       uint32_t extsize;
>> +       size_t offset, src_offset;
>> +       unsigned char hash[GIT_MAX_RAWSZ];
>> +       git_hash_ctx c;
>> +
>> +       /* ensure we have an index big enough to contain an EOIE extension */
>> +       if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
> 
> Using sizeof() for on-disk structures could be dangerous because you
> don't know how much padding there could be (I'm not sure if it's
> actually specified in the C language spec). I've checked, on at least
> x86 and amd64, sizeof(struct cache_header) is 12 bytes, but I don't
> know if there are any crazy architectures out there that set higher
> padding.
> 

This must be safe as the same code has been in do_read_index() and 
verify_index_from() for a long time.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 6/7] ieot: add Index Entry Offset Table (IEOT) extension
  2018-10-01 16:27     ` Duy Nguyen
@ 2018-10-02 16:34       ` Ben Peart
  2018-10-02 17:02         ` Duy Nguyen
  0 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-10-02 16:34 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 10/1/2018 12:27 PM, Duy Nguyen wrote:
> On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
>> @@ -1888,6 +1890,23 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>>          return ondisk_size + entries * per_entry;
>>   }
>>
>> +struct index_entry_offset
>> +{
>> +       /* starting byte offset into index file, count of index entries in this block */
>> +       int offset, nr;
> 
> uint32_t?
> 
>> +};
>> +
>> +struct index_entry_offset_table
>> +{
>> +       int nr;
>> +       struct index_entry_offset entries[0];
> 
> Use FLEX_ARRAY. Some compilers are not happy with an array of zero
> items if I remember correctly.
> 

Thanks for the warning, I'll update that.

>> @@ -2523,6 +2551,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>          struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>>          int drop_cache_tree = istate->drop_cache_tree;
>>          off_t offset;
>> +       int ieot_work = 1;
>> +       struct index_entry_offset_table *ieot = NULL;
>> +       int nr;
> 
> There are a bunch of stuff going on in this function, maybe rename
> this to nr_threads or nr_blocks to be less generic.
> 

I can add a nr_threads variable to make this more obvious.

>>
>>          for (i = removed = extended = 0; i < entries; i++) {
>>                  if (cache[i]->ce_flags & CE_REMOVE)
>> @@ -2556,7 +2587,38 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>          if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
>>                  return -1;
>>
>> +#ifndef NO_PTHREADS
>> +       if ((nr = git_config_get_index_threads()) != 1) {
> 
> Maybe keep this assignment out of "if".
> 
>> +               int ieot_blocks, cpus;
>> +
>> +               /*
>> +                * ensure default number of ieot blocks maps evenly to the
>> +                * default number of threads that will process them
>> +                */
>> +               if (!nr) {
>> +                       ieot_blocks = istate->cache_nr / THREAD_COST;
>> +                       cpus = online_cpus();
>> +                       if (ieot_blocks > cpus - 1)
>> +                               ieot_blocks = cpus - 1;
> 
> The " - 1" here is for extension thread, yes? Probably worth a comment.
> 
>> +               } else {
>> +                       ieot_blocks = nr;
>> +               }
>> +
>> +               /*
>> +                * no reason to write out the IEOT extension if we don't
>> +                * have enough blocks to utilize multi-threading
>> +                */
>> +               if (ieot_blocks > 1) {
>> +                       ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
>> +                               + (ieot_blocks * sizeof(struct index_entry_offset)));
> 
> Use FLEX_ALLOC_MEM() after you declare ..._table with FLEX_ARRAY.
> 

FLEX_ALLOC_MEM() is focused on variable length "char" data.  All uses of 
FLEX_ARRAY with non char data did the allocation themselves to avoid the 
unnecessary memcpy() that comes with FLEX_ALLOC_MEM.

> This ieot needs to be freed also and should be before any "return -1"
> in this function.
> 

Good catch. Will do.

>> +                       ieot->nr = 0;
>> +                       ieot_work = DIV_ROUND_UP(entries, ieot_blocks);
> 
> Perhaps a better name for ioet_work? This looks like the number of
> cache entries per block.
> 
>> +               }
>> +       }
>> +#endif
>> +
>>          offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
>> +       nr = 0;
> 
> Eh.. repurpose nr to count cache entries now? It's kinda hard to follow.
> 
>>          previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
>>
>>          for (i = 0; i < entries; i++) {
>> @@ -2578,11 +2640,31 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>
>>                          drop_cache_tree = 1;
>>                  }
>> +               if (ieot && i && (i % ieot_work == 0)) {
>> +                       ieot->entries[ieot->nr].nr = nr;
>> +                       ieot->entries[ieot->nr].offset = offset;
>> +                       ieot->nr++;
>> +                       /*
>> +                        * If we have a V4 index, set the first byte to an invalid
>> +                        * character to ensure there is nothing common with the previous
>> +                        * entry
>> +                        */
>> +                       if (previous_name)
>> +                               previous_name->buf[0] = 0;
>> +                       nr = 0;
>> +                       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> 
> This only works correctly if the ce_write_entry() from the last
> iteration has flushed everything to out to newfd. Maybe it does, but
> it's error prone to rely on that in my opinion. Maybe we need an
> explicit ce_write_flush() here to make sure.
> 

This logic already takes any unflushed data into account - the offset is 
what has been flushed to disk (lseek) plus the amount still in the 
buffer (write_buffer_len) waiting to be flushed.  I don't see any need 
to force an additional flush and adding one could have a negative impact 
on performance.

>> +               }
>>                  if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
>>                          err = -1;
>>
>>                  if (err)
>>                          break;
>> +               nr++;
>> +       }
>> +       if (ieot && nr) {
>> +               ieot->entries[ieot->nr].nr = nr;
>> +               ieot->entries[ieot->nr].offset = offset;
>> +               ieot->nr++;
>>          }
>>          strbuf_release(&previous_name_buf);
>>
>> @@ -2593,6 +2675,26 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>          offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
>>          the_hash_algo->init_fn(&eoie_c);
>>
>> +       /*
>> +        * Lets write out CACHE_EXT_INDEXENTRYOFFSETTABLE first so that we
>> +        * can minimze the number of extensions we have to scan through to
> 
> s/minimze/minimize/
> 
>> +        * find it during load.  Write it out regardless of the
>> +        * strip_extensions parameter as we need it when loading the shared
>> +        * index.
>> +        */
>> +#ifndef NO_PTHREADS
>> +       if (ieot) {
>> +               struct strbuf sb = STRBUF_INIT;
>> +
>> +               write_ieot_extension(&sb, ieot);
>> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0
>> +                       || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>> +               strbuf_release(&sb);
>> +               if (err)
>> +                       return -1;
>> +       }
>> +#endif
>> +
>>          if (!strip_extensions && istate->split_index) {
>>                  struct strbuf sb = STRBUF_INIT;
>>
>> @@ -3176,3 +3278,74 @@ static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context,
>>          the_hash_algo->final_fn(hash, eoie_context);
>>          strbuf_add(sb, hash, the_hash_algo->rawsz);
>>   }
>> +
>> +#ifndef NO_PTHREADS
>> +#define IEOT_VERSION   (1)
>> +
>> +static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset)
>> +{
>> +       const char *index = NULL;
>> +       uint32_t extsize, ext_version;
>> +       struct index_entry_offset_table *ieot;
>> +       int i, nr;
>> +
>> +       /* find the IEOT extension */
>> +       if (!offset)
>> +              return NULL;
>> +       while (offset <= mmap_size - the_hash_algo->rawsz - 8) {
>> +              extsize = get_be32(mmap + offset + 4);
>> +              if (CACHE_EXT((mmap + offset)) == CACHE_EXT_INDEXENTRYOFFSETTABLE) {
>> +                      index = mmap + offset + 4 + 4;
>> +                      break;
>> +              }
>> +              offset += 8;
>> +              offset += extsize;
>> +       }
> 
> Maybe refactor this loop. I think I've seen this in at least two
> places now. Probably three?
> 
>> +       if (!index)
>> +              return NULL;
>> +
>> +       /* validate the version is IEOT_VERSION */
>> +       ext_version = get_be32(index);
>> +       if (ext_version != IEOT_VERSION)
>> +              return NULL;
> 
> Report the error (e.g. "unsupported version" or something)
> 

Sure.  I'll add reporting here and in the error check below.

>> +       index += sizeof(uint32_t);
>> +
>> +       /* extension size - version bytes / bytes per entry */
>> +       nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));
> 
> Do we need to check if "(extsize - version) % sizeof(entry) == 0"?
> 
>> +       if (!nr)
>> +              return NULL;
>> +       ieot = xmalloc(sizeof(struct index_entry_offset_table)
>> +              + (nr * sizeof(struct index_entry_offset)));
>> +       ieot->nr = nr;
>> +       for (i = 0; i < nr; i++) {
>> +              ieot->entries[i].offset = get_be32(index);
>> +              index += sizeof(uint32_t);
>> +              ieot->entries[i].nr = get_be32(index);
>> +              index += sizeof(uint32_t);
>> +       }
>> +
>> +       return ieot;
>> +}
>> +
>> +static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot)
>> +{
>> +       uint32_t buffer;
>> +       int i;
>> +
>> +       /* version */
>> +       put_be32(&buffer, IEOT_VERSION);
>> +       strbuf_add(sb, &buffer, sizeof(uint32_t));
>> +
>> +       /* ieot */
>> +       for (i = 0; i < ieot->nr; i++) {
>> +
>> +              /* offset */
>> +              put_be32(&buffer, ieot->entries[i].offset);
>> +              strbuf_add(sb, &buffer, sizeof(uint32_t));
>> +
>> +              /* count */
>> +              put_be32(&buffer, ieot->entries[i].nr);
>> +              strbuf_add(sb, &buffer, sizeof(uint32_t));
>> +       }
>> +}
>> +#endif
>> --
>> 2.18.0.windows.1
>>
> 
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 6/7] ieot: add Index Entry Offset Table (IEOT) extension
  2018-10-02 16:34       ` Ben Peart
@ 2018-10-02 17:02         ` Duy Nguyen
  0 siblings, 0 replies; 153+ messages in thread
From: Duy Nguyen @ 2018-10-02 17:02 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Tue, Oct 2, 2018 at 6:34 PM Ben Peart <peartben@gmail.com> wrote:
> >> +                       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> >
> > This only works correctly if the ce_write_entry() from the last
> > iteration has flushed everything to out to newfd. Maybe it does, but
> > it's error prone to rely on that in my opinion. Maybe we need an
> > explicit ce_write_flush() here to make sure.
> >
>
> This logic already takes any unflushed data into account - the offset is
> what has been flushed to disk (lseek) plus the amount still in the
> buffer (write_buffer_len) waiting to be flushed.  I don't see any need
> to force an additional flush and adding one could have a negative impact
> on performance.

Eck! How did I miss that write_buffer_len :P
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v7 7/7] read-cache: load cache entries on worker threads
  2018-10-01 17:09     ` Duy Nguyen
@ 2018-10-02 19:09       ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-02 19:09 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 10/1/2018 1:09 PM, Duy Nguyen wrote:
> On Mon, Oct 1, 2018 at 3:46 PM Ben Peart <peartben@gmail.com> wrote:
>> +/*
>> + * A helper function that will load the specified range of cache entries
>> + * from the memory mapped file and add them to the given index.
>> + */
>> +static unsigned long load_cache_entry_block(struct index_state *istate,
>> +                       struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
> 
> Please use unsigned long for offset (here and in the thread_data
> struct). We should use off_t instead, but that's out of scope. At
> least keep offset type consistent in here.
> 

Unfortunately, this code is littered with different types for size and 
offset.  "int" is the most common but there are also off_t, size_t and 
some unsigned long as well.  Currently all of them are at least 32 bits 
so until we need to have an index larger than 32 bits, we should be OK. 
I agree, fixing them all is outside the scope of this patch.

>> +                       unsigned long start_offset, const struct cache_entry *previous_ce)
> 
> I don't think you want to pass previous_ce in. You always pass NULL
> anyway. And if this function is about loading a block (i.e. at block
> boundary) then initial previous_ce _must_ be NULL or things break
> horribly.
> 

The function as written can load any arbitrary subset of cache entries 
as long as previous_ce is set correctly.  I currently only use it on 
block boundaries but I don't see any good reason to limit its 
capabilities by moving what code passes the NULL in one function deeper.

>> @@ -1959,20 +2007,125 @@ static void *load_index_extensions(void *_data)
>>
>>   #define THREAD_COST            (10000)
>>
>> +struct load_cache_entries_thread_data
>> +{
>> +       pthread_t pthread;
>> +       struct index_state *istate;
>> +       struct mem_pool *ce_mem_pool;
>> +       int offset;
>> +       const char *mmap;
>> +       struct index_entry_offset_table *ieot;
>> +       int ieot_offset;        /* starting index into the ieot array */
> 
> If it's an index, maybe just name it ieot_index and we can get rid of
> the comment.
> 
>> +       int ieot_work;          /* count of ieot entries to process */
> 
> Maybe instead of saving the whole "ieot" table here. Add
> 
>       struct index_entry_offset *blocks;
> 
> which points to the starting block for this thread and rename that
> mysterious (to me) ieot_work to nr_blocks. The thread will have access
> from blocks[0] to blocks[nr_blocks - 1]
> 

Meh. Either way you have to figure out there is a block of entries and 
each thread is going to process some subset of those entries.  You can 
do the base + offset math here or down in the calling function but it 
has to happen (and be understood) either way.

I'll rename ieot_offset to ieot_start and ieot_work to ieot_blocks which 
should hopefully help make it more obvious what they do.

>> +       unsigned long consumed; /* return # of bytes in index file processed */
>> +};
>> +
>> +/*
>> + * A thread proc to run the load_cache_entries() computation
>> + * across multiple background threads.
>> + */
>> +static void *load_cache_entries_thread(void *_data)
>> +{
>> +       struct load_cache_entries_thread_data *p = _data;
>> +       int i;
>> +
>> +       /* iterate across all ieot blocks assigned to this thread */
>> +       for (i = p->ieot_offset; i < p->ieot_offset + p->ieot_work; i++) {
>> +               p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->ieot->entries[i].nr, p->mmap, p->ieot->entries[i].offset, NULL);
> 
> Please wrap this long line.
> 
>> +               p->offset += p->ieot->entries[i].nr;
>> +       }
>> +       return NULL;
>> +}
>> +
>> +static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
>> +                       unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)
>> +{
>> +       int i, offset, ieot_work, ieot_offset, err;
>> +       struct load_cache_entries_thread_data *data;
>> +       unsigned long consumed = 0;
>> +       int nr;
>> +
>> +       /* a little sanity checking */
>> +       if (istate->name_hash_initialized)
>> +               BUG("the name hash isn't thread safe");
>> +
>> +       mem_pool_init(&istate->ce_mem_pool, 0);
>> +       data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
> 
> we normally use sizeof(*data) instead of sizeof(struct ...)
> 
>> +
>> +       /* ensure we have no more threads than we have blocks to process */
>> +       if (nr_threads > ieot->nr)
>> +               nr_threads = ieot->nr;
>> +       data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
> 
> eh.. reallocate the same "data"?
> 

Thanks, good catch - I hate leaky code.

>> +
>> +       offset = ieot_offset = 0;
>> +       ieot_work = DIV_ROUND_UP(ieot->nr, nr_threads);
>> +       for (i = 0; i < nr_threads; i++) {
>> +               struct load_cache_entries_thread_data *p = &data[i];
>> +               int j;
>> +
>> +               if (ieot_offset + ieot_work > ieot->nr)
>> +                       ieot_work = ieot->nr - ieot_offset;
>> +
>> +               p->istate = istate;
>> +               p->offset = offset;
>> +               p->mmap = mmap;
>> +               p->ieot = ieot;
>> +               p->ieot_offset = ieot_offset;
>> +               p->ieot_work = ieot_work;
>> +
>> +               /* create a mem_pool for each thread */
>> +               nr = 0;
> 
> Since nr is only used in this for loop. Declare it in this scope
> instead of declaring it for the whole function.
> 
>> +               for (j = p->ieot_offset; j < p->ieot_offset + p->ieot_work; j++)
>> +                       nr += p->ieot->entries[j].nr;
>> +               if (istate->version == 4) {
>> +                       mem_pool_init(&p->ce_mem_pool,
>> +                               estimate_cache_size_from_compressed(nr));
>> +               }
>> +               else {
>> +                       mem_pool_init(&p->ce_mem_pool,
>> +                               estimate_cache_size(mmap_size, nr));
>> +               }
> 
> Maybe keep this mem_pool_init code inside load_cache_entries_thread(),
> similar to how you do it for load_all_cache_entries(). It's mostly
> to keep this loop shorter to see (and understand), of course
> parallelizing this mem_pool_init() is just noise.
> 

I understand the desire to get that part of the thread initialization 
out of the main line of this function (it's a bit messy between the 
entry counting and version differences) but I prefer to have all the 
thread initialization completed before creating the thread.  That allows 
for simpler error handling and helps minimize the state you have to pass 
into the thread (mmap_size in this case).

>> +
>> +               err = pthread_create(&p->pthread, NULL, load_cache_entries_thread, p);
>> +               if (err)
>> +                       die(_("unable to create load_cache_entries thread: %s"), strerror(err));
>> +
>> +               /* increment by the number of cache entries in the ieot block being processed */
>> +               for (j = 0; j < ieot_work; j++)
>> +                       offset += ieot->entries[ieot_offset + j].nr;
> 
> I wonder if it makes things simpler if you store cache_entry _index_
> in the entries[] array instead of storing the number of entries. You can
> easily calculate nr then by doing entries[i].index -
> entries[i-1].index. And you can count multiple blocks the same way,
> without looping like this.
> 
>> +               ieot_offset += ieot_work;
>> +       }
>> +
>> +       for (i = 0; i < nr_threads; i++) {
>> +               struct load_cache_entries_thread_data *p = &data[i];
>> +
>> +               err = pthread_join(p->pthread, NULL);
>> +               if (err)
>> +                       die(_("unable to join load_cache_entries thread: %s"), strerror(err));
>> +               mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
>> +               consumed += p->consumed;
>> +       }
>> +
>> +       free(data);
>> +
>> +       return consumed;
>> +}
>> +#endif
>> +
>>   /* remember to discard_cache() before reading a different cache! */
>>   int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>   {
>> -       int fd, i;
>> +       int fd;
>>          struct stat st;
>>          unsigned long src_offset;
>>          const struct cache_header *hdr;
>>          const char *mmap;
>>          size_t mmap_size;
>> -       const struct cache_entry *previous_ce = NULL;
>>          struct load_index_extensions p;
>>          size_t extension_offset = 0;
>>   #ifndef NO_PTHREADS
>> -       int nr_threads;
>> +       int nr_threads, cpus;
>> +       struct index_entry_offset_table *ieot = NULL;
>>   #endif
>>
>>          if (istate->initialized)
>> @@ -2014,10 +2167,18 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>          p.mmap = mmap;
>>          p.mmap_size = mmap_size;
>>
>> +       src_offset = sizeof(*hdr);
> 
> OK we've been doing this since forever, sizeof(struct cache_header)
> probably does not have extra padding on any supported platform.
> 
>> @@ -2032,29 +2193,22 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>                          nr_threads--;
>>                  }
>>          }
>> -#endif
>> -
>> -       if (istate->version == 4) {
>> -               mem_pool_init(&istate->ce_mem_pool,
>> -                             estimate_cache_size_from_compressed(istate->cache_nr));
>> -       } else {
>> -               mem_pool_init(&istate->ce_mem_pool,
>> -                             estimate_cache_size(mmap_size, istate->cache_nr));
>> -       }
>>
>> -       src_offset = sizeof(*hdr);
>> -       for (i = 0; i < istate->cache_nr; i++) {
>> -               struct ondisk_cache_entry *disk_ce;
>> -               struct cache_entry *ce;
>> -               unsigned long consumed;
>> +       /*
>> +        * Locate and read the index entry offset table so that we can use it
>> +        * to multi-thread the reading of the cache entries.
>> +        */
>> +       if (extension_offset && nr_threads > 1)
>> +               ieot = read_ieot_extension(mmap, mmap_size, extension_offset);
> 
> You need to free ieot at some point.
> 

Good catch - I hate leaky code.

>>
>> -               disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
>> -               ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
>> -               set_index_entry(istate, i, ce);
>> +       if (ieot)
>> +               src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, src_offset, nr_threads, ieot);
>> +       else
>> +               src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
>> +#else
>> +       src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
>> +#endif
>>
>> -               src_offset += consumed;
>> -               previous_ce = ce;
>> -       }
>>          istate->timestamp.sec = st.st_mtime;
>>          istate->timestamp.nsec = ST_MTIME_NSEC(st);
>>
>> --
>> 2.18.0.windows.1
>>

^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v8 0/7] speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (7 preceding siblings ...)
  2018-10-01 13:45 ` [PATCH v7 " Ben Peart
@ 2018-10-10 15:59 ` Ben Peart
  2018-10-10 15:59   ` [PATCH v8 1/7] read-cache.c: optimize reading index format v4 Ben Peart
                     ` (9 more replies)
  8 siblings, 10 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-10 15:59 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

Fixed issues identified in review, the most impactful probably being plugging
some leaks and improving error handling.  Also added better error messages
and some code cleanup to code I'd touched.

The biggest change in the interdiff is the impact of renaming ieot_offset to
ieot_start and ieot_work to ieot_blocks in hopes of making it easier to read
and understand the code.

Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/6caa0bac46
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v8 && git checkout 6caa0bac46


### Interdiff (v7..v8):

diff --git a/read-cache.c b/read-cache.c
index 14402a0738..7acc2c86f4 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1901,7 +1901,7 @@ struct index_entry_offset
 struct index_entry_offset_table
 {
 	int nr;
-	struct index_entry_offset entries[0];
+	struct index_entry_offset entries[FLEX_ARRAY];
 };
 
 #ifndef NO_PTHREADS
@@ -1935,9 +1935,7 @@ static void *load_index_extensions(void *_data)
 		 * extension name (4-byte) and section length
 		 * in 4-byte network byte order.
 		 */
-		uint32_t extsize;
-		memcpy(&extsize, p->mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		uint32_t extsize = get_be32(p->mmap + src_offset + 4);
 		if (read_index_extension(p->istate,
 					 p->mmap + src_offset,
 					 p->mmap + src_offset + 8,
@@ -2015,8 +2013,8 @@ struct load_cache_entries_thread_data
 	int offset;
 	const char *mmap;
 	struct index_entry_offset_table *ieot;
-	int ieot_offset;        /* starting index into the ieot array */
-	int ieot_work;          /* count of ieot entries to process */
+	int ieot_start;		/* starting index into the ieot array */
+	int ieot_blocks;	/* count of ieot entries to process */
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
@@ -2030,8 +2028,9 @@ static void *load_cache_entries_thread(void *_data)
 	int i;
 
 	/* iterate across all ieot blocks assigned to this thread */
-	for (i = p->ieot_offset; i < p->ieot_offset + p->ieot_work; i++) {
-		p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->ieot->entries[i].nr, p->mmap, p->ieot->entries[i].offset, NULL);
+	for (i = p->ieot_start; i < p->ieot_start + p->ieot_blocks; i++) {
+		p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+			p->offset, p->ieot->entries[i].nr, p->mmap, p->ieot->entries[i].offset, NULL);
 		p->offset += p->ieot->entries[i].nr;
 	}
 	return NULL;
@@ -2040,48 +2039,45 @@ static void *load_cache_entries_thread(void *_data)
 static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
 			unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)
 {
-	int i, offset, ieot_work, ieot_offset, err;
+	int i, offset, ieot_blocks, ieot_start, err;
 	struct load_cache_entries_thread_data *data;
 	unsigned long consumed = 0;
-	int nr;
 
 	/* a little sanity checking */
 	if (istate->name_hash_initialized)
 		BUG("the name hash isn't thread safe");
 
 	mem_pool_init(&istate->ce_mem_pool, 0);
-	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
 
 	/* ensure we have no more threads than we have blocks to process */
 	if (nr_threads > ieot->nr)
 		nr_threads = ieot->nr;
-	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+	data = xcalloc(nr_threads, sizeof(*data));
 
-	offset = ieot_offset = 0;
-	ieot_work = DIV_ROUND_UP(ieot->nr, nr_threads);
+	offset = ieot_start = 0;
+	ieot_blocks = DIV_ROUND_UP(ieot->nr, nr_threads);
 	for (i = 0; i < nr_threads; i++) {
 		struct load_cache_entries_thread_data *p = &data[i];
-		int j;
+		int nr, j;
 
-		if (ieot_offset + ieot_work > ieot->nr)
-			ieot_work = ieot->nr - ieot_offset;
+		if (ieot_start + ieot_blocks > ieot->nr)
+			ieot_blocks = ieot->nr - ieot_start;
 
 		p->istate = istate;
 		p->offset = offset;
 		p->mmap = mmap;
 		p->ieot = ieot;
-		p->ieot_offset = ieot_offset;
-		p->ieot_work = ieot_work;
+		p->ieot_start = ieot_start;
+		p->ieot_blocks = ieot_blocks;
 
 		/* create a mem_pool for each thread */
 		nr = 0;
-		for (j = p->ieot_offset; j < p->ieot_offset + p->ieot_work; j++)
+		for (j = p->ieot_start; j < p->ieot_start + p->ieot_blocks; j++)
 			nr += p->ieot->entries[j].nr;
 		if (istate->version == 4) {
 			mem_pool_init(&p->ce_mem_pool,
 				estimate_cache_size_from_compressed(nr));
-		}
-		else {
+		} else {
 			mem_pool_init(&p->ce_mem_pool,
 				estimate_cache_size(mmap_size, nr));
 		}
@@ -2091,9 +2087,9 @@ static unsigned long load_cache_entries_threaded(struct index_state *istate, con
 			die(_("unable to create load_cache_entries thread: %s"), strerror(err));
 
 		/* increment by the number of cache entries in the ieot block being processed */
-		for (j = 0; j < ieot_work; j++)
-			offset += ieot->entries[ieot_offset + j].nr;
-		ieot_offset += ieot_work;
+		for (j = 0; j < ieot_blocks; j++)
+			offset += ieot->entries[ieot_start + j].nr;
+		ieot_start += ieot_blocks;
 	}
 
 	for (i = 0; i < nr_threads; i++) {
@@ -2201,10 +2197,12 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	if (extension_offset && nr_threads > 1)
 		ieot = read_ieot_extension(mmap, mmap_size, extension_offset);
 
-	if (ieot)
+	if (ieot) {
 		src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, src_offset, nr_threads, ieot);
-	else
+		free(ieot);
+	} else {
 		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+	}
 #else
 	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 #endif
@@ -2705,9 +2703,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
 	off_t offset;
-	int ieot_work = 1;
+	int ieot_entries = 1;
 	struct index_entry_offset_table *ieot = NULL;
-	int nr;
+	int nr, nr_threads;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2742,20 +2740,24 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return -1;
 
 #ifndef NO_PTHREADS
-	if ((nr = git_config_get_index_threads()) != 1) {
+	nr_threads = git_config_get_index_threads();
+	if (nr_threads != 1) {
 		int ieot_blocks, cpus;
 
 		/*
 		 * ensure default number of ieot blocks maps evenly to the
-		 * default number of threads that will process them
+		 * default number of threads that will process them leaving
+		 * room for the thread to load the index extensions.
 		 */
-		if (!nr) {
+		if (!nr_threads) {
 			ieot_blocks = istate->cache_nr / THREAD_COST;
 			cpus = online_cpus();
 			if (ieot_blocks > cpus - 1)
 				ieot_blocks = cpus - 1;
 		} else {
-			ieot_blocks = nr;
+			ieot_blocks = nr_threads;
+			if (ieot_blocks > istate->cache_nr)
+				ieot_blocks = istate->cache_nr;
 		}
 
 		/*
@@ -2765,13 +2767,17 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		if (ieot_blocks > 1) {
 			ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
 				+ (ieot_blocks * sizeof(struct index_entry_offset)));
-			ieot->nr = 0;
-			ieot_work = DIV_ROUND_UP(entries, ieot_blocks);
+			ieot_entries = DIV_ROUND_UP(entries, ieot_blocks);
 		}
 	}
 #endif
 
-	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	offset = lseek(newfd, 0, SEEK_CUR);
+	if (offset < 0) {
+		free(ieot);
+		return -1;
+	}
+	offset += write_buffer_len;
 	nr = 0;
 	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
 
@@ -2794,7 +2800,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 
 			drop_cache_tree = 1;
 		}
-		if (ieot && i && (i % ieot_work == 0)) {
+		if (ieot && i && (i % ieot_entries == 0)) {
 			ieot->entries[ieot->nr].nr = nr;
 			ieot->entries[ieot->nr].offset = offset;
 			ieot->nr++;
@@ -2806,7 +2812,12 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 			if (previous_name)
 				previous_name->buf[0] = 0;
 			nr = 0;
-			offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+			offset = lseek(newfd, 0, SEEK_CUR);
+			if (offset < 0) {
+				free(ieot);
+				return -1;
+			}
+			offset += write_buffer_len;
 		}
 		if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
 			err = -1;
@@ -2822,16 +2833,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	}
 	strbuf_release(&previous_name_buf);
 
-	if (err)
+	if (err) {
+		free(ieot);
 		return err;
+	}
 
 	/* Write extension data here */
-	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	offset = lseek(newfd, 0, SEEK_CUR);
+	if (offset < 0) {
+		free(ieot);
+		return -1;
+	}
+	offset += write_buffer_len;
 	the_hash_algo->init_fn(&eoie_c);
 
 	/*
 	 * Lets write out CACHE_EXT_INDEXENTRYOFFSETTABLE first so that we
-	 * can minimze the number of extensions we have to scan through to
+	 * can minimize the number of extensions we have to scan through to
 	 * find it during load.  Write it out regardless of the
 	 * strip_extensions parameter as we need it when loading the shared
 	 * index.
@@ -2844,6 +2862,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
+		free(ieot);
 		if (err)
 			return -1;
 	}
@@ -3460,14 +3479,18 @@ static struct index_entry_offset_table *read_ieot_extension(const char *mmap, si
 
        /* validate the version is IEOT_VERSION */
        ext_version = get_be32(index);
-       if (ext_version != IEOT_VERSION)
+       if (ext_version != IEOT_VERSION) {
+	       error("invalid IEOT version %d", ext_version);
 	       return NULL;
+       }
        index += sizeof(uint32_t);
 
        /* extension size - version bytes / bytes per entry */
        nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));
-       if (!nr)
+       if (!nr) {
+	       error("invalid number of IEOT entries %d", nr);
 	       return NULL;
+       }
        ieot = xmalloc(sizeof(struct index_entry_offset_table)
 	       + (nr * sizeof(struct index_entry_offset)));
        ieot->nr = nr;


### Patches

Ben Peart (6):
  read-cache: clean up casting and byte decoding
  eoie: add End of Index Entry (EOIE) extension
  config: add new index.threads config setting
  read-cache: load cache extensions on a worker thread
  ieot: add Index Entry Offset Table (IEOT) extension
  read-cache: load cache entries on worker threads

Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 Documentation/config.txt                 |   7 +
 Documentation/technical/index-format.txt |  41 ++
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 774 +++++++++++++++++++----
 t/README                                 |   5 +
 t/t1700-split-index.sh                   |  13 +-
 7 files changed, 739 insertions(+), 120 deletions(-)


base-commit: fe8321ec057f9231c26c29b364721568e58040f7
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v8 1/7] read-cache.c: optimize reading index format v4
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
@ 2018-10-10 15:59   ` Ben Peart
  2018-10-10 15:59   ` [PATCH v8 2/7] read-cache: clean up casting and byte decoding Ben Peart
                     ` (8 subsequent siblings)
  9 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-10 15:59 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds

From: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 128 ++++++++++++++++++++++++---------------------------
 1 file changed, 60 insertions(+), 68 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 8d04d78a58..583a4fb1f8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,63 +1713,24 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
-/*
- * Adjacent cache entries tend to share the leading paths, so it makes
- * sense to only store the differences in later entries.  In the v4
- * on-disk format of the index, each on-disk cache entry stores the
- * number of bytes to be stripped from the end of the previous name,
- * and the bytes to append to the result, to come up with its name.
- */
-static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
-{
-	const unsigned char *ep, *cp = (const unsigned char *)cp_;
-	size_t len = decode_varint(&cp);
-
-	if (name->len < len)
-		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
-	strbuf_add(name, cp, ep - cp);
-	return (const char *)ep + 1 - cp_;
-}
-
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct index_state *istate,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len;
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	int expand_name_field = istate->version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1789,21 +1750,54 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (expand_name_field)
+			len += copy_len;
+	}
+
+	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1898,7 +1892,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	const struct cache_entry *previous_ce = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,11 +1930,9 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
@@ -1952,12 +1944,12 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v8 2/7] read-cache: clean up casting and byte decoding
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
  2018-10-10 15:59   ` [PATCH v8 1/7] read-cache.c: optimize reading index format v4 Ben Peart
@ 2018-10-10 15:59   ` Ben Peart
  2018-10-10 15:59   ` [PATCH v8 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (7 subsequent siblings)
  9 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-10 15:59 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

This patch does a clean up pass to minimize the casting required to work
with the memory mapped index (mmap).

It also makes the decoding of network byte order more consistent by using
get_be32() where possible.

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 read-cache.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 583a4fb1f8..6ba99e2c96 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1650,7 +1650,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1674,7 +1674,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1889,8 +1889,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd, i;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	const struct cache_entry *previous_ce = NULL;
 
@@ -1918,7 +1918,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -1943,7 +1943,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
 		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
@@ -1961,21 +1961,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
+					 mmap + src_offset,
+					 mmap + src_offset + 8,
 					 extsize) < 0)
 			goto unmap;
 		src_offset += 8;
 		src_offset += extsize;
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v8 3/7] eoie: add End of Index Entry (EOIE) extension
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
  2018-10-10 15:59   ` [PATCH v8 1/7] read-cache.c: optimize reading index format v4 Ben Peart
  2018-10-10 15:59   ` [PATCH v8 2/7] read-cache: clean up casting and byte decoding Ben Peart
@ 2018-10-10 15:59   ` Ben Peart
  2018-10-10 15:59   ` [PATCH v8 4/7] config: add new index.threads config setting Ben Peart
                     ` (6 subsequent siblings)
  9 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-10 15:59 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

The EOIE extension is always written out to the index file including to
the shared index when using the split index feature. Because it is always
written out, the SHA checksums in t/t1700-split-index.sh were updated
to reflect its inclusion.

It is written as an optional extension to ensure compatibility with other
git implementations that do not yet support it.  It is always written out
to ensure it is available as often as possible to speed up index operations.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
    "REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 158 +++++++++++++++++++++--
 t/t1700-split-index.sh                   |   8 +-
 3 files changed, 177 insertions(+), 12 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the begining of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 6ba99e2c96..4781515252 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1883,6 +1887,9 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2190,11 +2197,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2437,7 +2448,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2446,6 +2457,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	off_t offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2479,6 +2491,10 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
 		return -1;
 
+	offset = lseek(newfd, 0, SEEK_CUR);
+	if (offset < 0)
+		return -1;
+	offset += write_buffer_len;
 	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
 
 	for (i = 0; i < entries; i++) {
@@ -2512,11 +2528,17 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR);
+	if (offset < 0)
+		return -1;
+	offset += write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
+
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2527,7 +2549,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2537,7 +2559,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2548,7 +2570,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2559,7 +2581,24 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.  Write it out regardless of the strip_extensions parameter as we need it
+	 * when loading the shared index.
+	 */
+	if (offset) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2975,3 +3014,106 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *index, *eoie;
+	uint32_t extsize;
+	size_t offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
+	/* validate the extension signature */
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if (mmap + offset < mmap + sizeof(struct cache_header))
+		return 0;
+	if (mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *	 "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (!hasheq(hash, (const unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index be22398a85..8e17f8e7a0 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -15,11 +15,11 @@ test_expect_success 'enable split index' '
 	indexversion=$(test-tool index-version <.git/index) &&
 	if test "$indexversion" = "4"
 	then
-		own=432ef4b63f32193984f339431fd50ca796493569
-		base=508851a7f0dfa8691e9f69c7f055865389012491
+		own=3527df833c6c100d3d1d921a9a782d62a8be4b58
+		base=746f7ab2ed44fb839efdfbffcf399d0b113fb4cb
 	else
-		own=8299b0bcd1ac364e5f1d7768efb62fa2da79a339
-		base=39d890139ee5356c7ef572216cebcd27aa41f9df
+		own=5e9b60117ece18da410ddecc8b8d43766a0e4204
+		base=4370042739b31cd17a5c5cd6043a77c9a00df113
 	fi &&
 	cat >expect <<-EOF &&
 	own $own
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v8 4/7] config: add new index.threads config setting
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
                     ` (2 preceding siblings ...)
  2018-10-10 15:59   ` [PATCH v8 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-10-10 15:59   ` Ben Peart
  2018-10-10 15:59   ` [PATCH v8 5/7] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (5 subsequent siblings)
  9 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-10 15:59 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

Add support for a new index.threads config setting which will be used to
control the threading code in do_read_index().  A value of 0 will tell the
index code to automatically determine the correct number of threads to use.
A value of 1 will make the code single threaded.  A value greater than 1
will set the maximum number of threads to use.

For testing purposes, this setting can be overwritten by setting the
GIT_TEST_INDEX_THREADS=<n> environment variable to a value greater than 0.

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 Documentation/config.txt |  7 +++++++
 config.c                 | 18 ++++++++++++++++++
 config.h                 |  1 +
 t/README                 |  5 +++++
 t/t1700-split-index.sh   |  5 +++++
 5 files changed, 36 insertions(+)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index ad0f4510c3..8fd973b76b 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2413,6 +2413,13 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPUs and set the number of threads accordingly. Specifying 1 or
+	'false' will disable multithreading. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 3461993f0a..2ee29f6f86 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+int git_config_get_index_threads(void)
+{
+	int is_bool, val = 0;
+
+	val = git_env_ulong("GIT_TEST_INDEX_THREADS", 0);
+	if (val)
+		return val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/t/README b/t/README
index 3ea6c85460..8f5c0620ea 100644
--- a/t/README
+++ b/t/README
@@ -327,6 +327,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to
 be written after every 'git commit' command, and overrides the
 'core.commitGraph' setting to true.
 
+GIT_TEST_INDEX_THREADS=<n> enables exercising the multi-threaded loading
+of the index for the whole test suite by bypassing the default number of
+cache entries and thread minimums. Setting this to 1 will make the
+index loading single threaded.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 8e17f8e7a0..ef9349bd70 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -6,7 +6,12 @@ test_description='split index mode tests'
 
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
+
+# Testing a hard coded SHA against an index with an extension
+# that can vary from run to run is problematic so we disable
+# those extensions.
 sane_unset GIT_FSMONITOR_TEST
+sane_unset GIT_TEST_INDEX_THREADS
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v8 5/7] read-cache: load cache extensions on a worker thread
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
                     ` (3 preceding siblings ...)
  2018-10-10 15:59   ` [PATCH v8 4/7] config: add new index.threads config setting Ben Peart
@ 2018-10-10 15:59   ` Ben Peart
  2018-10-10 15:59   ` [PATCH v8 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
                     ` (4 subsequent siblings)
  9 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-10 15:59 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

	Test w/100,000 files reduced the time by 0.53%
	Test w/1,000,000 files reduced the time by 27.78%

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 read-cache.c | 95 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 79 insertions(+), 16 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 4781515252..2214b3153d 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,7 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#include "thread-utils.h"
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1890,6 +1891,44 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	const char *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+};
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize = get_be32(p->mmap + src_offset + 4);
+		if (read_index_extension(p->istate,
+					 p->mmap + src_offset,
+					 p->mmap + src_offset + 8,
+					 extsize) < 0) {
+			munmap((void *)p->mmap, p->mmap_size);
+			die(_("index file corrupt"));
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1900,6 +1939,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	const char *mmap;
 	size_t mmap_size;
 	const struct cache_entry *previous_ce = NULL;
+	struct load_index_extensions p;
+	size_t extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,6 +1980,30 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads > 1) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			int err;
+
+			p.src_offset = extension_offset;
+			err = pthread_create(&p.pthread, NULL, load_index_extensions, &p);
+			if (err)
+				die(_("unable to create load_index_extensions thread: %s"), strerror(err));
+
+			nr_threads--;
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
@@ -1960,22 +2028,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		extsize = get_be32(mmap + src_offset + 4);
-		if (read_index_extension(istate,
-					 mmap + src_offset,
-					 mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset) {
+		int ret = pthread_join(p.pthread, NULL);
+		if (ret)
+			die(_("unable to join load_index_extensions thread: %s"), strerror(ret));
+	}
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v8 6/7] ieot: add Index Entry Offset Table (IEOT) extension
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
                     ` (4 preceding siblings ...)
  2018-10-10 15:59   ` [PATCH v8 5/7] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-10-10 15:59   ` Ben Peart
  2018-10-10 15:59   ` [PATCH v8 7/7] read-cache: load cache entries on worker threads Ben Peart
                     ` (3 subsequent siblings)
  9 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-10 15:59 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

This patch enables addressing the CPU cost of loading the index by adding
additional data to the index that will allow us to efficiently multi-
thread the loading and conversion of cache entries.

It accomplishes this by adding an (optional) index extension that is a
table of offsets to blocks of cache entries in the index file.  To make
this work for V4 indexes, when writing the cache entries, it periodically
"resets" the prefix-compression by encoding the current entry as if the
path name for the previous entry is completely different and saves the
offset of that entry in the IEOT.  Basically, with V4 indexes, it
generates offsets into blocks of prefix-compressed entries.

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 Documentation/technical/index-format.txt |  18 +++
 read-cache.c                             | 196 ++++++++++++++++++++++-
 2 files changed, 211 insertions(+), 3 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index 6bc2d90f7f..7c4d67aa6a 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -337,3 +337,21 @@ The remaining data of each directory block is grouped by type:
 
 	SHA-1("TREE" + <binary representation of N> +
 		"REUC" + <binary representation of M>)
+
+== Index Entry Offset Table
+
+  The Index Entry Offset Table (IEOT) is used to help address the CPU
+  cost of loading the index by enabling multi-threading the process of
+  converting cache entries from the on-disk format to the in-memory format.
+  The signature for this extension is { 'I', 'E', 'O', 'T' }.
+
+  The extension consists of:
+
+  - 32-bit version (currently 1)
+
+  - A number of index offset entries each consisting of:
+
+    - 32-bit offset from the beginning of the file to the first cache entry
+	in this block of entries.
+
+    - 32-bit count of cache entries in this block
diff --git a/read-cache.c b/read-cache.c
index 2214b3153d..3ace29d58f 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -45,6 +45,7 @@
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
 #define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
+#define CACHE_EXT_INDEXENTRYOFFSETTABLE 0x49454F54 /* "IEOT" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1696,6 +1697,7 @@ static int read_index_extension(struct index_state *istate,
 		read_fsmonitor_extension(istate, data, sz);
 		break;
 	case CACHE_EXT_ENDOFINDEXENTRIES:
+	case CACHE_EXT_INDEXENTRYOFFSETTABLE:
 		/* already handled in do_read_index() */
 		break;
 	default:
@@ -1888,6 +1890,23 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+struct index_entry_offset
+{
+	/* starting byte offset into index file, count of index entries in this block */
+	int offset, nr;
+};
+
+struct index_entry_offset_table
+{
+	int nr;
+	struct index_entry_offset entries[FLEX_ARRAY];
+};
+
+#ifndef NO_PTHREADS
+static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset);
+static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot);
+#endif
+
 static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
 
@@ -1929,6 +1948,15 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+
+#define THREAD_COST		(10000)
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2521,6 +2549,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
 	off_t offset;
+	int ieot_blocks = 1;
+	struct index_entry_offset_table *ieot = NULL;
+	int nr, nr_threads;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2554,10 +2585,44 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
 		return -1;
 
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (nr_threads != 1) {
+		int ieot_blocks, cpus;
+
+		/*
+		 * ensure default number of ieot blocks maps evenly to the
+		 * default number of threads that will process them leaving
+		 * room for the thread to load the index extensions.
+		 */
+		if (!nr_threads) {
+			ieot_blocks = istate->cache_nr / THREAD_COST;
+			cpus = online_cpus();
+			if (ieot_blocks > cpus - 1)
+				ieot_blocks = cpus - 1;
+		} else {
+			ieot_blocks = nr_threads;
+		}
+
+		/*
+		 * no reason to write out the IEOT extension if we don't
+		 * have enough blocks to utilize multi-threading
+		 */
+		if (ieot_blocks > 1) {
+			ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
+				+ (ieot_blocks * sizeof(struct index_entry_offset)));
+			ieot_blocks = DIV_ROUND_UP(entries, ieot_blocks);
+		}
+	}
+#endif
+
 	offset = lseek(newfd, 0, SEEK_CUR);
-	if (offset < 0)
+	if (offset < 0) {
+		free(ieot);
 		return -1;
+	}
 	offset += write_buffer_len;
+	nr = 0;
 	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
 
 	for (i = 0; i < entries; i++) {
@@ -2579,24 +2644,74 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 
 			drop_cache_tree = 1;
 		}
+		if (ieot && i && (i % ieot_blocks == 0)) {
+			ieot->entries[ieot->nr].nr = nr;
+			ieot->entries[ieot->nr].offset = offset;
+			ieot->nr++;
+			/*
+			 * If we have a V4 index, set the first byte to an invalid
+			 * character to ensure there is nothing common with the previous
+			 * entry
+			 */
+			if (previous_name)
+				previous_name->buf[0] = 0;
+			nr = 0;
+			offset = lseek(newfd, 0, SEEK_CUR);
+			if (offset < 0) {
+				free(ieot);
+				return -1;
+			}
+			offset += write_buffer_len;
+		}
 		if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
 			err = -1;
 
 		if (err)
 			break;
+		nr++;
+	}
+	if (ieot && nr) {
+		ieot->entries[ieot->nr].nr = nr;
+		ieot->entries[ieot->nr].offset = offset;
+		ieot->nr++;
 	}
 	strbuf_release(&previous_name_buf);
 
-	if (err)
+	if (err) {
+		free(ieot);
 		return err;
+	}
 
 	/* Write extension data here */
 	offset = lseek(newfd, 0, SEEK_CUR);
-	if (offset < 0)
+	if (offset < 0) {
+		free(ieot);
 		return -1;
+	}
 	offset += write_buffer_len;
 	the_hash_algo->init_fn(&eoie_c);
 
+	/*
+	 * Lets write out CACHE_EXT_INDEXENTRYOFFSETTABLE first so that we
+	 * can minimize the number of extensions we have to scan through to
+	 * find it during load.  Write it out regardless of the
+	 * strip_extensions parameter as we need it when loading the shared
+	 * index.
+	 */
+#ifndef NO_PTHREADS
+	if (ieot) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_ieot_extension(&sb, ieot);
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		free(ieot);
+		if (err)
+			return -1;
+	}
+#endif
+
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
@@ -3180,3 +3295,78 @@ static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context,
 	the_hash_algo->final_fn(hash, eoie_context);
 	strbuf_add(sb, hash, the_hash_algo->rawsz);
 }
+
+#ifndef NO_PTHREADS
+#define IEOT_VERSION	(1)
+
+static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset)
+{
+       const char *index = NULL;
+       uint32_t extsize, ext_version;
+       struct index_entry_offset_table *ieot;
+       int i, nr;
+
+       /* find the IEOT extension */
+       if (!offset)
+	       return NULL;
+       while (offset <= mmap_size - the_hash_algo->rawsz - 8) {
+	       extsize = get_be32(mmap + offset + 4);
+	       if (CACHE_EXT((mmap + offset)) == CACHE_EXT_INDEXENTRYOFFSETTABLE) {
+		       index = mmap + offset + 4 + 4;
+		       break;
+	       }
+	       offset += 8;
+	       offset += extsize;
+       }
+       if (!index)
+	       return NULL;
+
+       /* validate the version is IEOT_VERSION */
+       ext_version = get_be32(index);
+       if (ext_version != IEOT_VERSION) {
+	       error("invalid IEOT version %d", ext_version);
+	       return NULL;
+       }
+       index += sizeof(uint32_t);
+
+       /* extension size - version bytes / bytes per entry */
+       nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));
+       if (!nr) {
+	       error("invalid number of IEOT entries %d", nr);
+	       return NULL;
+       }
+       ieot = xmalloc(sizeof(struct index_entry_offset_table)
+	       + (nr * sizeof(struct index_entry_offset)));
+       ieot->nr = nr;
+       for (i = 0; i < nr; i++) {
+	       ieot->entries[i].offset = get_be32(index);
+	       index += sizeof(uint32_t);
+	       ieot->entries[i].nr = get_be32(index);
+	       index += sizeof(uint32_t);
+       }
+
+       return ieot;
+}
+
+static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot)
+{
+       uint32_t buffer;
+       int i;
+
+       /* version */
+       put_be32(&buffer, IEOT_VERSION);
+       strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+       /* ieot */
+       for (i = 0; i < ieot->nr; i++) {
+
+	       /* offset */
+	       put_be32(&buffer, ieot->entries[i].offset);
+	       strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	       /* count */
+	       put_be32(&buffer, ieot->entries[i].nr);
+	       strbuf_add(sb, &buffer, sizeof(uint32_t));
+       }
+}
+#endif
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH v8 7/7] read-cache: load cache entries on worker threads
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
                     ` (5 preceding siblings ...)
  2018-10-10 15:59   ` [PATCH v8 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
@ 2018-10-10 15:59   ` Ben Peart
  2018-10-19 16:11     ` Jeff King
  2018-10-12  3:18   ` [PATCH v8 0/7] speed up index load through parallelization Junio C Hamano
                     ` (2 subsequent siblings)
  9 siblings, 1 reply; 153+ messages in thread
From: Ben Peart @ 2018-10-10 15:59 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Ben Peart <benpeart@microsoft.com>

This patch helps address the CPU cost of loading the index by utilizing
the Index Entry Offset Table (IEOT) to divide loading and conversion of
the cache entries across multiple threads in parallel.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files reduced the time by 32.24%
Test w/1,000,000 files reduced the time by -4.77%

Note that on the 1,000,000 files case, multi-threading the cache entry parsing
does not yield a performance win.  This is because the cost to parse the
index extensions in this repo far outweighs the cost of loading the cache
entries.

The high cost of parsing the index extensions is driven by the cache tree
and the untracked cache extensions. As this is currently the longest pole,
any reduction in this time will reduce the overall index load times so is
worth further investigation in another patch series.

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 read-cache.c | 230 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 193 insertions(+), 37 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 3ace29d58f..7acc2c86f4 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1720,7 +1720,8 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *create_from_disk(struct index_state *istate,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
 					    const struct cache_entry *previous_ce)
@@ -1737,7 +1738,7 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 	 * number of bytes to be stripped from the end of the previous name,
 	 * and the bytes to append to the result, to come up with its name.
 	 */
-	int expand_name_field = istate->version == 4;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1761,16 +1762,17 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 		const unsigned char *cp = (const unsigned char *)name;
 		size_t strip_len, previous_len;
 
-		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		/* If we're at the beginning of a block, ignore the previous name */
 		strip_len = decode_varint(&cp);
-		if (previous_len < strip_len) {
-			if (previous_ce)
+		if (previous_ce) {
+			previous_len = previous_ce->ce_namelen;
+			if (previous_len < strip_len)
 				die(_("malformed name field in the index, near path '%s'"),
-				    previous_ce->name);
-			else
-				die(_("malformed name field in the index in the first path"));
+					previous_ce->name);
+			copy_len = previous_len - strip_len;
+		} else {
+			copy_len = 0;
 		}
-		copy_len = previous_len - strip_len;
 		name = (const char *)cp;
 	}
 
@@ -1780,7 +1782,7 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 			len += copy_len;
 	}
 
-	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
 
 	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
 	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
@@ -1948,6 +1950,52 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
+			unsigned long start_offset, const struct cache_entry *previous_ce)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+		previous_ce = ce;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, NULL);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
 /*
  * Mostly randomly chosen maximum thread counts: we
  * cap the parallelism to online_cpus() threads, and we want
@@ -1957,20 +2005,123 @@ static void *load_index_extensions(void *_data)
 
 #define THREAD_COST		(10000)
 
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset;
+	const char *mmap;
+	struct index_entry_offset_table *ieot;
+	int ieot_start;		/* starting index into the ieot array */
+	int ieot_blocks;	/* count of ieot entries to process */
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+	int i;
+
+	/* iterate across all ieot blocks assigned to this thread */
+	for (i = p->ieot_start; i < p->ieot_start + p->ieot_blocks; i++) {
+		p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+			p->offset, p->ieot->entries[i].nr, p->mmap, p->ieot->entries[i].offset, NULL);
+		p->offset += p->ieot->entries[i].nr;
+	}
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
+			unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)
+{
+	int i, offset, ieot_blocks, ieot_start, err;
+	struct load_cache_entries_thread_data *data;
+	unsigned long consumed = 0;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+
+	/* ensure we have no more threads than we have blocks to process */
+	if (nr_threads > ieot->nr)
+		nr_threads = ieot->nr;
+	data = xcalloc(nr_threads, sizeof(*data));
+
+	offset = ieot_start = 0;
+	ieot_blocks = DIV_ROUND_UP(ieot->nr, nr_threads);
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = &data[i];
+		int nr, j;
+
+		if (ieot_start + ieot_blocks > ieot->nr)
+			ieot_blocks = ieot->nr - ieot_start;
+
+		p->istate = istate;
+		p->offset = offset;
+		p->mmap = mmap;
+		p->ieot = ieot;
+		p->ieot_start = ieot_start;
+		p->ieot_blocks = ieot_blocks;
+
+		/* create a mem_pool for each thread */
+		nr = 0;
+		for (j = p->ieot_start; j < p->ieot_start + p->ieot_blocks; j++)
+			nr += p->ieot->entries[j].nr;
+		if (istate->version == 4) {
+			mem_pool_init(&p->ce_mem_pool,
+				estimate_cache_size_from_compressed(nr));
+		} else {
+			mem_pool_init(&p->ce_mem_pool,
+				estimate_cache_size(mmap_size, nr));
+		}
+
+		err = pthread_create(&p->pthread, NULL, load_cache_entries_thread, p);
+		if (err)
+			die(_("unable to create load_cache_entries thread: %s"), strerror(err));
+
+		/* increment by the number of cache entries in the ieot block being processed */
+		for (j = 0; j < ieot_blocks; j++)
+			offset += ieot->entries[ieot_start + j].nr;
+		ieot_start += ieot_blocks;
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = &data[i];
+
+		err = pthread_join(p->pthread, NULL);
+		if (err)
+			die(_("unable to join load_cache_entries thread: %s"), strerror(err));
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		consumed += p->consumed;
+	}
+
+	free(data);
+
+	return consumed;
+}
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	const struct cache_header *hdr;
 	const char *mmap;
 	size_t mmap_size;
-	const struct cache_entry *previous_ce = NULL;
 	struct load_index_extensions p;
 	size_t extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int nr_threads, cpus;
+	struct index_entry_offset_table *ieot = NULL;
 #endif
 
 	if (istate->initialized)
@@ -2012,10 +2163,18 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+
+	/* TODO: does creating more threads than cores help? */
+	if (!nr_threads) {
+		nr_threads = istate->cache_nr / THREAD_COST;
+		cpus = online_cpus();
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
 
 	if (nr_threads > 1) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2030,29 +2189,24 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			nr_threads--;
 		}
 	}
-#endif
 
-	if (istate->version == 4) {
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
+	/*
+	 * Locate and read the index entry offset table so that we can use it
+	 * to multi-thread the reading of the cache entries.
+	 */
+	if (extension_offset && nr_threads > 1)
+		ieot = read_ieot_extension(mmap, mmap_size, extension_offset);
+
+	if (ieot) {
+		src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, src_offset, nr_threads, ieot);
+		free(ieot);
 	} else {
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 	}
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#endif
 
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
-		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-		previous_ce = ce;
-	}
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
@@ -2549,7 +2703,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
 	off_t offset;
-	int ieot_blocks = 1;
+	int ieot_entries = 1;
 	struct index_entry_offset_table *ieot = NULL;
 	int nr, nr_threads;
 
@@ -2602,6 +2756,8 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 				ieot_blocks = cpus - 1;
 		} else {
 			ieot_blocks = nr_threads;
+			if (ieot_blocks > istate->cache_nr)
+				ieot_blocks = istate->cache_nr;
 		}
 
 		/*
@@ -2611,7 +2767,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		if (ieot_blocks > 1) {
 			ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
 				+ (ieot_blocks * sizeof(struct index_entry_offset)));
-			ieot_blocks = DIV_ROUND_UP(entries, ieot_blocks);
+			ieot_entries = DIV_ROUND_UP(entries, ieot_blocks);
 		}
 	}
 #endif
@@ -2644,7 +2800,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 
 			drop_cache_tree = 1;
 		}
-		if (ieot && i && (i % ieot_blocks == 0)) {
+		if (ieot && i && (i % ieot_entries == 0)) {
 			ieot->entries[ieot->nr].nr = nr;
 			ieot->entries[ieot->nr].offset = offset;
 			ieot->nr++;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v8 0/7] speed up index load through parallelization
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
                     ` (6 preceding siblings ...)
  2018-10-10 15:59   ` [PATCH v8 7/7] read-cache: load cache entries on worker threads Ben Peart
@ 2018-10-12  3:18   ` Junio C Hamano
  2018-10-14 12:28   ` Duy Nguyen
  2018-11-13  0:38   ` [PATCH 0/3] Avoid confusing messages from new index extensions (Re: [PATCH v8 0/7] speed up index load through parallelization) Jonathan Nieder
  9 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-10-12  3:18 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, pclouds, Ben Peart

Ben Peart <peartben@gmail.com> writes:

> From: Ben Peart <benpeart@microsoft.com>
>
> Fixed issues identified in review the most impactful probably being plugging
> some leaks and improved error handling.  Also added better error messages
> and some code cleanup to code I'd touched.
>
> The biggest change in the interdiff is the impact of renaming ieot_offset to
> ieot_start and ieot_work to ieot_blocks in hopes of making it easier to read
> and understand the code.

Thanks, I think this one is ready to be in 'next' and any further
tweaks can be done incrementally.


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v8 0/7] speed up index load through parallelization
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
                     ` (7 preceding siblings ...)
  2018-10-12  3:18   ` [PATCH v8 0/7] speed up index load through parallelization Junio C Hamano
@ 2018-10-14 12:28   ` Duy Nguyen
  2018-10-15 17:33     ` Ben Peart
  2018-11-13  0:38   ` [PATCH 0/3] Avoid confusing messages from new index extensions (Re: [PATCH v8 0/7] speed up index load through parallelization) Jonathan Nieder
  9 siblings, 1 reply; 153+ messages in thread
From: Duy Nguyen @ 2018-10-14 12:28 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Oct 10, 2018 at 5:59 PM Ben Peart <peartben@gmail.com> wrote:
> @@ -3460,14 +3479,18 @@ static struct index_entry_offset_table *read_ieot_extension(const char *mmap, si
>
>         /* validate the version is IEOT_VERSION */
>         ext_version = get_be32(index);
> -       if (ext_version != IEOT_VERSION)
> +       if (ext_version != IEOT_VERSION) {
> +              error("invalid IEOT version %d", ext_version);

Please wrap this string in _() so that it can be translated.

>                return NULL;
> +       }
>         index += sizeof(uint32_t);
>
>         /* extension size - version bytes / bytes per entry */
>         nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));
> -       if (!nr)
> +       if (!nr) {
> +              error("invalid number of IEOT entries %d", nr);

Ditto. And reporting extsize may be more useful than nr, which we know
is zero, but we don't know why it's calculated zero unless we know
extsize.
-- 
Duy

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v8 0/7] speed up index load through parallelization
  2018-10-14 12:28   ` Duy Nguyen
@ 2018-10-15 17:33     ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-15 17:33 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

fixup! IEOT error messages

Enable localizing new error messages and improve the error message for
invalid IEOT extension sizes.

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
  read-cache.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 7acc2c86f4..f9fa6a7979 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -3480,7 +3480,7 @@ static struct index_entry_offset_table 
*read_ieot_extension(const char *mmap, si
         /* validate the version is IEOT_VERSION */
         ext_version = get_be32(index);
         if (ext_version != IEOT_VERSION) {
-	       error("invalid IEOT version %d", ext_version);
+	       error(_("invalid IEOT version %d"), ext_version);
  	       return NULL;
         }
         index += sizeof(uint32_t);
@@ -3488,7 +3488,7 @@ static struct index_entry_offset_table 
*read_ieot_extension(const char *mmap, si
         /* extension size - version bytes / bytes per entry */
         nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + 
sizeof(uint32_t));
         if (!nr) {
-	       error("invalid number of IEOT entries %d", nr);
+	       error(_("invalid IEOT extension size %d"), extsize);
  	       return NULL;
         }
         ieot = xmalloc(sizeof(struct index_entry_offset_table)
-- 
2.18.0.windows.1



On 10/14/2018 8:28 AM, Duy Nguyen wrote:
> On Wed, Oct 10, 2018 at 5:59 PM Ben Peart <peartben@gmail.com> wrote:
>> @@ -3460,14 +3479,18 @@ static struct index_entry_offset_table *read_ieot_extension(const char *mmap, si
>>
>>          /* validate the version is IEOT_VERSION */
>>          ext_version = get_be32(index);
>> -       if (ext_version != IEOT_VERSION)
>> +       if (ext_version != IEOT_VERSION) {
>> +              error("invalid IEOT version %d", ext_version);
> 
> Please wrap this string in _() so that it can be translated.
> 
>>                 return NULL;
>> +       }
>>          index += sizeof(uint32_t);
>>
>>          /* extension size - version bytes / bytes per entry */
>>          nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));
>> -       if (!nr)
>> +       if (!nr) {
>> +              error("invalid number of IEOT entries %d", nr);
> 
> Ditto. And reporting extsize may be more useful than nr, which we know
> is zero, but we don't know why it's calculated zero unless we know
> extsize.
> 

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v8 7/7] read-cache: load cache entries on worker threads
  2018-10-10 15:59   ` [PATCH v8 7/7] read-cache: load cache entries on worker threads Ben Peart
@ 2018-10-19 16:11     ` Jeff King
  2018-10-22  2:14       ` Junio C Hamano
  0 siblings, 1 reply; 153+ messages in thread
From: Jeff King @ 2018-10-19 16:11 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart

On Wed, Oct 10, 2018 at 11:59:38AM -0400, Ben Peart wrote:

> +static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
> +			unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)

The src_offset parameter isn't used in this function.

In early versions of the series, it was used to feed the p->start_offset
field of each load_cache_entries_thread_data. But after the switch to
ieot, we don't, and instead feed p->ieot_start. But we always begin that
at 0.

Is that right (and we can drop the parameter), or should this logic:

> +	offset = ieot_start = 0;
> +	ieot_blocks = DIV_ROUND_UP(ieot->nr, nr_threads);
> +	for (i = 0; i < nr_threads; i++) {
> [...]

be starting at src_offset instead of 0?

-Peff

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v8 7/7] read-cache: load cache entries on worker threads
  2018-10-19 16:11     ` Jeff King
@ 2018-10-22  2:14       ` Junio C Hamano
  2018-10-22 14:40         ` Ben Peart
  0 siblings, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-10-22  2:14 UTC (permalink / raw)
  To: Jeff King; +Cc: Ben Peart, git, pclouds, Ben Peart

Jeff King <peff@peff.net> writes:

> On Wed, Oct 10, 2018 at 11:59:38AM -0400, Ben Peart wrote:
>
>> +static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
>> +			unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)
>
> The src_offset parameter isn't used in this function.
>
> In early versions of the series, it was used to feed the p->start_offset
> field of each load_cache_entries_thread_data. But after the switch to
> ieot, we don't, and instead feed p->ieot_start. But we always begin that
> at 0.
>
> Is that right (and we can drop the parameter), or should this logic:
>
>> +	offset = ieot_start = 0;
>> +	ieot_blocks = DIV_ROUND_UP(ieot->nr, nr_threads);
>> +	for (i = 0; i < nr_threads; i++) {
>> [...]
>
> be starting at src_offset instead of 0?

I think "offset" has nothing to do with the offset into the mmapped
region of memory.  It is an integer index into a (virtual) array
that is a concatenation of ieot->entries[].entries[], and it is
correct to count from zero.  The value taken from that array using
the index is used to compute the offset into the mmapped region.

Unlike load_all_cache_entries() called from the other side of the
same if() statement in the same caller, this does not depend on the
fact that the first index entry in the mmapped region appears
immediately after the index-file header.  It goes from the offsets
into the file that are recorded in the entry offset table that is an
index extension, so the sizeof(*hdr) that initializes src_offset is
not used by the codepath.

The number of bytes consumed, i.e. its return value from the
function, is not really used, either, as the caller does not use
src_offset for anything other than updating it with "+=" and passing
it to this function (which does not use it) when it calls this
function (i.e. when ieot extension exists--and by definition when
that extension exists extension_offset is not 0, so we do not make
the final load_index_extensions() call in the caller that uses
src_offset).

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH v8 7/7] read-cache: load cache entries on worker threads
  2018-10-22  2:14       ` Junio C Hamano
@ 2018-10-22 14:40         ` Ben Peart
  0 siblings, 0 replies; 153+ messages in thread
From: Ben Peart @ 2018-10-22 14:40 UTC (permalink / raw)
  To: Junio C Hamano, Jeff King; +Cc: git, pclouds, Ben Peart



On 10/21/2018 10:14 PM, Junio C Hamano wrote:
> Jeff King <peff@peff.net> writes:
> 
>> On Wed, Oct 10, 2018 at 11:59:38AM -0400, Ben Peart wrote:
>>
>>> +static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
>>> +			unsigned long src_offset, int nr_threads, struct index_entry_offset_table *ieot)
>>
>> The src_offset parameter isn't used in this function.
>>
>> In early versions of the series, it was used to feed the p->start_offset
>> field of each load_cache_entries_thread_data. But after the switch to
>> ieot, we don't, and instead feed p->ieot_start. But we always begin that
>> at 0.
>>
>> Is that right (and we can drop the parameter), or should this logic:
>>
>>> +	offset = ieot_start = 0;
>>> +	ieot_blocks = DIV_ROUND_UP(ieot->nr, nr_threads);
>>> +	for (i = 0; i < nr_threads; i++) {
>>> [...]
>>
>> be starting at src_offset instead of 0?
> 
> I think "offset" has nothing to do with the offset into the mmapped
> region of memory.  It is an integer index into a (virtual) array
> that is a concatenation of ieot->entries[].entries[], and it is
> correct to count from zero.  The value taken from that array using
> the index is used to compute the offset into the mmapped region.
> 
> Unlike load_all_cache_entries() called from the other side of the
> same if() statement in the same caller, this does not depend on the
> fact that the first index entry in the mmapped region appears
> immediately after the index-file header.  It goes from the offsets
> into the file that are recorded in the entry offset table that is an
> index extension, so the sizeof(*hdr) that initializes src_offset is
> not used by the codepath.
> 
> The number of bytes consumed, i.e. its return value from the
> function, is not really used, either, as the caller does not use
> src_offset for anything other than updating it with "+=" and passing
> it to this function (which does not use it) when it calls this
> function (i.e. when ieot extension exists--and by definition when
> that extension exists extension_offset is not 0, so we do not make
> the final load_index_extensions() call in the caller that uses
> src_offset).
> 

Thanks for discovering/analyzing this.  You're right, I missed removing 
this when we switched from a single offset to an array of offsets via 
the IEOT.  I'll send a patch to fix both issues shortly.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH 0/3] Avoid confusing messages from new index extensions (Re: [PATCH v8 0/7] speed up index load through parallelization)
  2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
                     ` (8 preceding siblings ...)
  2018-10-14 12:28   ` Duy Nguyen
@ 2018-11-13  0:38   ` Jonathan Nieder
  2018-11-13  0:39     ` [PATCH 1/3] eoie: default to not writing EOIE section Jonathan Nieder
                       ` (2 more replies)
  9 siblings, 3 replies; 153+ messages in thread
From: Jonathan Nieder @ 2018-11-13  0:38 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart

Hi,

Ben Peart wrote:

> Ben Peart (6):
>   read-cache: clean up casting and byte decoding
>   eoie: add End of Index Entry (EOIE) extension
>   config: add new index.threads config setting
>   read-cache: load cache extensions on a worker thread
>   ieot: add Index Entry Offset Table (IEOT) extension
>   read-cache: load cache entries on worker threads

I love this, but when deploying it I ran into a problem.

How about these patches?

Thanks,
Jonathan Nieder (3):
  eoie: default to not writing EOIE section
  ieot: default to not writing IEOT section
  index: do not warn about unrecognized extensions

 Documentation/config.txt | 14 ++++++++++++++
 read-cache.c             | 24 +++++++++++++++++++++---
 t/t1700-split-index.sh   | 11 +++++++----
 3 files changed, 42 insertions(+), 7 deletions(-)

^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH 1/3] eoie: default to not writing EOIE section
  2018-11-13  0:38   ` [PATCH 0/3] Avoid confusing messages from new index extensions (Re: [PATCH v8 0/7] speed up index load through parallelization) Jonathan Nieder
@ 2018-11-13  0:39     ` Jonathan Nieder
  2018-11-13  1:05       ` Junio C Hamano
  2018-11-13  0:39     ` [PATCH 2/3] ieot: default to not writing IEOT section Jonathan Nieder
  2018-11-13  0:40     ` [PATCH 3/3] index: do not warn about unrecognized extensions Jonathan Nieder
  2 siblings, 1 reply; 153+ messages in thread
From: Jonathan Nieder @ 2018-11-13  0:39 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart

Since 3b1d9e04 (eoie: add End of Index Entry (EOIE) extension,
2018-10-10) Git defaults to writing the new EOIE section when writing
out an index file.  Usually that is a good thing because it improves
threaded performance, but when a Git repository is shared with older
versions of Git, it produces a confusing warning:

  $ git status
  ignoring EOIE extension
  HEAD detached at 371ed0defa
  nothing to commit, working tree clean

Let's introduce the new index extension more gently.  First we'll roll
out the new version of Git that understands it, and then once
sufficiently many users are using such a version, we can flip the
default to writing it by default.

Introduce a '[index] recordEndOfIndexEntries' configuration variable
to allow interested users to benefit from this index extension early.

Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 Documentation/config.txt |  7 +++++++
 read-cache.c             | 11 ++++++++++-
 t/t1700-split-index.sh   | 11 +++++++----
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 41a9ff2b6a..d702379db4 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2188,6 +2188,13 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.recordEndOfIndexEntries::
+	Specifies whether the index file should include an "End Of Index
+	Entry" section. This reduces index load time on multiprocessor
+	machines but produces a message "ignoring EOIE extension" when
+	reading the index using Git versions before 2.20. Defaults to
+	'false'.
+
 index.threads::
 	Specifies the number of threads to spawn when loading the index.
 	This is meant to reduce index load time on multiprocessor machines.
diff --git a/read-cache.c b/read-cache.c
index f3a848d61c..4bfe93c4c2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -2698,6 +2698,15 @@ void update_index_if_able(struct index_state *istate, struct lock_file *lockfile
 		rollback_lock_file(lockfile);
 }
 
+static int record_eoie(void)
+{
+	int val;
+
+	if (!git_config_get_bool("index.recordendofindexentries", &val))
+		return val;
+	return 0;
+}
+
 /*
  * On success, `tempfile` is closed. If it is the temporary file
  * of a `struct lock_file`, we will therefore effectively perform
@@ -2945,7 +2954,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	 * read.  Write it out regardless of the strip_extensions parameter as we need it
 	 * when loading the shared index.
 	 */
-	if (offset) {
+	if (offset && record_eoie()) {
 		struct strbuf sb = STRBUF_INIT;
 
 		write_eoie_extension(&sb, &eoie_c, offset);
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 2ac47aa0e4..0cbac64e28 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -25,14 +25,17 @@ test_expect_success 'enable split index' '
 	git update-index --split-index &&
 	test-tool dump-split-index .git/index >actual &&
 	indexversion=$(test-tool index-version <.git/index) &&
+
+	# NEEDSWORK: Stop hard-coding checksums.
 	if test "$indexversion" = "4"
 	then
-		own=3527df833c6c100d3d1d921a9a782d62a8be4b58
-		base=746f7ab2ed44fb839efdfbffcf399d0b113fb4cb
+		own=432ef4b63f32193984f339431fd50ca796493569
+		base=508851a7f0dfa8691e9f69c7f055865389012491
 	else
-		own=5e9b60117ece18da410ddecc8b8d43766a0e4204
-		base=4370042739b31cd17a5c5cd6043a77c9a00df113
+		own=8299b0bcd1ac364e5f1d7768efb62fa2da79a339
+		base=39d890139ee5356c7ef572216cebcd27aa41f9df
 	fi &&
+
 	cat >expect <<-EOF &&
 	own $own
 	base $base
-- 
2.19.1.930.g4563a0d9d0


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH 2/3] ieot: default to not writing IEOT section
  2018-11-13  0:38   ` [PATCH 0/3] Avoid confusing messages from new index extensions (Re: [PATCH v8 0/7] speed up index load through parallelization) Jonathan Nieder
  2018-11-13  0:39     ` [PATCH 1/3] eoie: default to not writing EOIE section Jonathan Nieder
@ 2018-11-13  0:39     ` Jonathan Nieder
  2018-11-13  0:58       ` Jonathan Tan
  2018-11-13  1:09       ` Junio C Hamano
  2018-11-13  0:40     ` [PATCH 3/3] index: do not warn about unrecognized extensions Jonathan Nieder
  2 siblings, 2 replies; 153+ messages in thread
From: Jonathan Nieder @ 2018-11-13  0:39 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart

As with EOIE, popular versions of Git do not support the new IEOT
extension yet.  When accessing a Git repository written by a more
modern version of Git, they correctly ignore the unrecognized section,
but in the process they loudly warn

	ignoring IEOT extension

resulting in confusion for users.  Introduce the index extension more
gently by not writing it yet in this first version with support for
it.  Soon, once sufficiently many users are running a modern version
of Git, we can flip the default so users benefit from this index
extension by default.

Introduce a '[index] recordOffsetTable' configuration variable to
control whether the new index extension is written.

Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 Documentation/config.txt |  7 +++++++
 read-cache.c             | 11 ++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index d702379db4..cc66fb7de3 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2195,6 +2195,13 @@ index.recordEndOfIndexEntries::
 	reading the index using Git versions before 2.20. Defaults to
 	'false'.
 
+index.recordOffsetTable::
+	Specifies whether the index file should include an "Index Entry
+	Offset Table" section. This reduces index load time on
+	multiprocessor machines but produces a message "ignoring IEOT
+	extension" when reading the index using Git versions before 2.20.
+	Defaults to 'false'.
+
 index.threads::
 	Specifies the number of threads to spawn when loading the index.
 	This is meant to reduce index load time on multiprocessor machines.
diff --git a/read-cache.c b/read-cache.c
index 4bfe93c4c2..290bd54708 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -2707,6 +2707,15 @@ static int record_eoie(void)
 	return 0;
 }
 
+static int record_ieot(void)
+{
+	int val;
+
+	if (!git_config_get_bool("index.recordoffsettable", &val))
+		return val;
+	return 0;
+}
+
 /*
  * On success, `tempfile` is closed. If it is the temporary file
  * of a `struct lock_file`, we will therefore effectively perform
@@ -2767,7 +2776,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (nr_threads != 1) {
+	if (nr_threads != 1 && record_ieot()) {
 		int ieot_blocks, cpus;
 
 		/*
-- 
2.19.1.930.g4563a0d9d0


^ permalink raw reply	[flat|nested] 153+ messages in thread

* [PATCH 3/3] index: do not warn about unrecognized extensions
  2018-11-13  0:38   ` [PATCH 0/3] Avoid confusing messages from new index extensions (Re: [PATCH v8 0/7] speed up index load through parallelization) Jonathan Nieder
  2018-11-13  0:39     ` [PATCH 1/3] eoie: default to not writing EOIE section Jonathan Nieder
  2018-11-13  0:39     ` [PATCH 2/3] ieot: default to not writing IEOT section Jonathan Nieder
@ 2018-11-13  0:40     ` Jonathan Nieder
  2018-11-13  1:10       ` Junio C Hamano
  2 siblings, 1 reply; 153+ messages in thread
From: Jonathan Nieder @ 2018-11-13  0:40 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart

Documentation/technical/index-format explains:

     4-byte extension signature. If the first byte is 'A'..'Z' the
     extension is optional and can be ignored.

This allows gracefully introducing a new index extension without
having to rely on all readers having support for it.  Mandatory
extensions start with a lowercase letter and optional ones start with
a capital.  Thus the versions of Git acting on a shared local
repository do not have to upgrade in lockstep.

We almost obey that convention, but there is a problem: when
encountering an unrecognized optional extension, we write

	ignoring FNCY extension

to stderr, which alarms users.  This means that in practice we have
had to introduce index extensions in two steps: first add read
support, and then a while later, start writing by default.  This
delays when users can benefit from improvements to the index format.

We cannot change the past, but for index extensions of the future,
there is a straightforward improvement: silence that message except
when tracing.  This way, the message is still available when
debugging, but in everyday use it does not show up so (once most Git
users have this patch) we can turn on new optional extensions right
away without alarming people.

Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
Thanks for reading.  Thoughts?

Sincerely,
Jonathan

 read-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/read-cache.c b/read-cache.c
index 290bd54708..65530a68c2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1720,7 +1720,7 @@ static int read_index_extension(struct index_state *istate,
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
 				     ext);
-		fprintf(stderr, "ignoring %.4s extension\n", ext);
+		trace_printf("ignoring %.4s extension\n", ext);
 		break;
 	}
 	return 0;
-- 
2.19.1.930.g4563a0d9d0


^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH 2/3] ieot: default to not writing IEOT section
  2018-11-13  0:39     ` [PATCH 2/3] ieot: default to not writing IEOT section Jonathan Nieder
@ 2018-11-13  0:58       ` Jonathan Tan
  2018-11-13  1:09       ` Junio C Hamano
  1 sibling, 0 replies; 153+ messages in thread
From: Jonathan Tan @ 2018-11-13  0:58 UTC (permalink / raw)
  To: jrnieder; +Cc: peartben, git, gitster, pclouds, benpeart, Jonathan Tan

> +index.recordOffsetTable::
> +	Specifies whether the index file should include an "Index Entry
> +	Offset Table" section. This reduces index load time on
> +	multiprocessor machines but produces a message "ignoring IEOT
> +	extension" when reading the index using Git versions before 2.20.
> +	Defaults to 'false'.

Probably worth adding a test that exercises this new config option -
somehow create an index with index.recordOffsetTable=1, check that the
index contains the appropriate string (a few ways to do this, but I'm
not sure which are portable), and then run a Git command that reads the
index to make sure it is valid; then do the same except
index.recordOffsetTable=0.

The code itself looks good to me.

Same comment for patch 1.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH 1/3] eoie: default to not writing EOIE section
  2018-11-13  0:39     ` [PATCH 1/3] eoie: default to not writing EOIE section Jonathan Nieder
@ 2018-11-13  1:05       ` Junio C Hamano
  0 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-11-13  1:05 UTC (permalink / raw)
  To: Jonathan Nieder; +Cc: Ben Peart, git, pclouds, Ben Peart

Jonathan Nieder <jrnieder@gmail.com> writes:

> Since 3b1d9e04 (eoie: add End of Index Entry (EOIE) extension,
> 2018-10-10) Git defaults to writing the new EOIE section when writing
> out an index file.  Usually that is a good thing because it improves
> threaded performance, but when a Git repository is shared with older
> versions of Git, it produces a confusing warning:
>
>   $ git status
>   ignoring EOIE extension
>   HEAD detached at 371ed0defa
>   nothing to commit, working tree clean
>
> Let's introduce the new index extension more gently.  First we'll roll
> out the new version of Git that understands it, and then once
> sufficiently many users are using such a version, we can flip the
> default to writing it by default.
>
> Introduce a '[index] recordEndOfIndexEntries' configuration variable
> to allow interested users to benefit from this index extension early.

Thanks.  I am in principle OK with this approach.  In fact, I
suspect that the default may want to be dynamically determined, and
we give this knob to let the users further force their preference.
When no extension that benefits from multi-threading is written, the
default can stay "no" in future versions of Git, for example.

> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 41a9ff2b6a..d702379db4 100644

The timing is a bit unfortunate for any topic to touch this file,
and contrib/cocci would not help us in this case X-<.

> diff --git a/read-cache.c b/read-cache.c
> index f3a848d61c..4bfe93c4c2 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -2698,6 +2698,15 @@ void update_index_if_able(struct index_state *istate, struct lock_file *lockfile
>  		rollback_lock_file(lockfile);
>  }
>  
> +static int record_eoie(void)
> +{
> +	int val;
> +
> +	if (!git_config_get_bool("index.recordendofindexentries", &val))
> +		return val;
> +	return 0;
> +}

Unconditionally defaulting to no in this round is perfectly fine.
Let's make a mental note that this is the place to decide dynamic
default in the future when we want to.  It would probably have to
ask around various "extension writing" helpers if they want to have
a say in the outcome (e.g. if there are very many cache entries in
the istate, the entry offset table may want to be written and
otherwise not).

> @@ -2945,7 +2954,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>  	 * read.  Write it out regardless of the strip_extensions parameter as we need it
>  	 * when loading the shared index.
>  	 */
> -	if (offset) {
> +	if (offset && record_eoie()) {
>  		struct strbuf sb = STRBUF_INIT;
>  
>  		write_eoie_extension(&sb, &eoie_c, offset);
> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index 2ac47aa0e4..0cbac64e28 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -25,14 +25,17 @@ test_expect_success 'enable split index' '
>  	git update-index --split-index &&
>  	test-tool dump-split-index .git/index >actual &&
>  	indexversion=$(test-tool index-version <.git/index) &&
> +
> +	# NEEDSWORK: Stop hard-coding checksums.

Also let's stop hard-coding the assumption that the new knob is off
by default.  Ideally, you'd want to test both cases, right?

Perhaps you'd call "git update-index --split-index" we see in the
precontext twice, with "-c VAR=false" and "-c VAR=true", to prepare
"actual.without-eoie" and "actual.with-eoie", or something like
that?

Thanks.

>  	if test "$indexversion" = "4"
>  	then
> -		own=3527df833c6c100d3d1d921a9a782d62a8be4b58
> -		base=746f7ab2ed44fb839efdfbffcf399d0b113fb4cb
> +		own=432ef4b63f32193984f339431fd50ca796493569
> +		base=508851a7f0dfa8691e9f69c7f055865389012491
>  	else
> -		own=5e9b60117ece18da410ddecc8b8d43766a0e4204
> -		base=4370042739b31cd17a5c5cd6043a77c9a00df113
> +		own=8299b0bcd1ac364e5f1d7768efb62fa2da79a339
> +		base=39d890139ee5356c7ef572216cebcd27aa41f9df
>  	fi &&
> +
>  	cat >expect <<-EOF &&
>  	own $own
>  	base $base

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH 2/3] ieot: default to not writing IEOT section
  2018-11-13  0:39     ` [PATCH 2/3] ieot: default to not writing IEOT section Jonathan Nieder
  2018-11-13  0:58       ` Jonathan Tan
@ 2018-11-13  1:09       ` Junio C Hamano
  2018-11-13  1:12         ` Jonathan Nieder
  1 sibling, 1 reply; 153+ messages in thread
From: Junio C Hamano @ 2018-11-13  1:09 UTC (permalink / raw)
  To: Jonathan Nieder; +Cc: Ben Peart, git, pclouds, Ben Peart

Jonathan Nieder <jrnieder@gmail.com> writes:

> As with EOIE, popular versions of Git do not support the new IEOT
> extension yet.  When accessing a Git repository written by a more
> modern version of Git, they correctly ignore the unrecognized section,
> but in the process they loudly warn
>
> 	ignoring IEOT extension
>
> resulting in confusion for users.

Then removing the baby with the bathwater.  First
think about which part of the message is confusing and then make it
less confusing.

How about

	hint: ignoring an optional IEOT extension

to make it clear that it is totally harmless?

With that, we can add advice.unknownIndexExtension=false to turn all
of them off with a single switch.



^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH 3/3] index: do not warn about unrecognized extensions
  2018-11-13  0:40     ` [PATCH 3/3] index: do not warn about unrecognized extensions Jonathan Nieder
@ 2018-11-13  1:10       ` Junio C Hamano
  0 siblings, 0 replies; 153+ messages in thread
From: Junio C Hamano @ 2018-11-13  1:10 UTC (permalink / raw)
  To: Jonathan Nieder; +Cc: Ben Peart, git, pclouds, Ben Peart

Jonathan Nieder <jrnieder@gmail.com> writes:

> We almost obey that convention, but there is a problem: when
> encountering an unrecognized optional extension, we write
>
> 	ignoring FNCY extension
>
> to stderr, which alarms users.

Then the same comment as 2/3 applies to this step.

^ permalink raw reply	[flat|nested] 153+ messages in thread

* Re: [PATCH 2/3] ieot: default to not writing IEOT section
  2018-11-13  1:09       ` Junio C Hamano
@ 2018-11-13  1:12         ` Jonathan Nieder
  0 siblings, 0 replies; 153+ messages in thread
From: Jonathan Nieder @ 2018-11-13  1:12 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben Peart, git, pclouds, Ben Peart

Junio C Hamano wrote:

> How about
>
> 	hint: ignoring an optional IEOT extension
>
> to make it clear that it is totally harmless?
>
> With that, we can add advise.unknownIndexExtension=false to turn all
> of them off with a single switch.

I like it.  Expect a patch soon (tonight or tomorrow) that does that.

We'll have to find some appropriate place in the documentation to
explain what the message is about, still.

Thanks,
Jonathan

^ permalink raw reply	[flat|nested] 153+ messages in thread

end of thread, back to index

Thread overview: 153+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
2018-08-23 17:31 ` Stefan Beller
2018-08-23 19:44   ` Ben Peart
2018-08-24 18:40   ` Duy Nguyen
2018-08-28 14:53     ` Ben Peart
2018-08-23 18:06 ` Junio C Hamano
2018-08-23 20:33   ` Ben Peart
2018-08-24 15:37     ` Duy Nguyen
2018-08-24 15:57       ` Duy Nguyen
2018-08-24 17:28         ` Ben Peart
2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
2018-08-27 19:36           ` Junio C Hamano
2018-08-28 19:25             ` Duy Nguyen
2018-08-28 23:54               ` Ben Peart
2018-08-29 17:14               ` Junio C Hamano
2018-09-04 16:08             ` Duy Nguyen
2018-09-02 13:19           ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
2018-09-02 13:19             ` [PATCH v2 1/1] read-cache.c: " Nguyễn Thái Ngọc Duy
2018-09-04 18:58               ` Junio C Hamano
2018-09-04 19:31               ` Junio C Hamano
2018-08-24 18:20       ` [PATCH v1] read-cache: speed up index load through parallelization Duy Nguyen
2018-08-24 18:40         ` Ben Peart
2018-08-24 19:00           ` Duy Nguyen
2018-08-24 19:57             ` Ben Peart
2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
2018-08-29 17:14     ` Junio C Hamano
2018-08-29 21:35       ` Ben Peart
2018-09-03 19:16     ` Duy Nguyen
2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
2018-08-29 17:12     ` Junio C Hamano
2018-08-29 21:42       ` Ben Peart
2018-08-29 22:19         ` Junio C Hamano
2018-09-03 19:21     ` Duy Nguyen
2018-09-03 19:27       ` Duy Nguyen
2018-08-29 15:25   ` [PATCH v2 3/3] read-cache: micro-optimize expand_name_field() to speed up V4 index parsing Ben Peart
2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
2018-09-06 21:03   ` [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing Ben Peart
2018-09-06 21:03   ` [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-09-07 17:55     ` Junio C Hamano
2018-09-07 20:23       ` Ben Peart
2018-09-08  6:29         ` Martin Ågren
2018-09-08 14:03           ` Ben Peart
2018-09-08 17:08             ` Martin Ågren
2018-09-06 21:03   ` [PATCH v3 3/4] read-cache: load cache extensions on a worker thread Ben Peart
2018-09-07 21:10     ` Junio C Hamano
2018-09-08 14:56       ` Ben Peart
2018-09-06 21:03   ` [PATCH v3 4/4] read-cache: speed up index load through parallelization Ben Peart
2018-09-07  4:16     ` Torsten Bögershausen
2018-09-07 13:43       ` Ben Peart
2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
2018-09-07 18:31     ` Ben Peart
2018-09-08 13:18     ` Duy Nguyen
2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
2018-09-11 23:26   ` [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-09-11 23:26   ` [PATCH v4 2/5] read-cache: load cache extensions on a worker thread Ben Peart
2018-09-11 23:26   ` [PATCH v4 3/5] read-cache: speed up index load through parallelization Ben Peart
2018-09-11 23:26   ` [PATCH v4 4/5] read-cache.c: optimize reading index format v4 Ben Peart
2018-09-11 23:26   ` [PATCH v4 5/5] read-cache: clean up casting and byte decoding Ben Peart
2018-09-12 14:34   ` [PATCH v4 0/5] read-cache: speed up index load through parallelization Ben Peart
2018-09-12 16:18 ` [PATCH v5 " Ben Peart
2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-09-13 22:44     ` Junio C Hamano
2018-09-15 10:02     ` Duy Nguyen
2018-09-17 14:54       ` Ben Peart
2018-09-17 16:05         ` Duy Nguyen
2018-09-17 17:31           ` Junio C Hamano
2018-09-17 17:38             ` Duy Nguyen
2018-09-17 19:08               ` Junio C Hamano
2018-09-12 16:18   ` [PATCH v5 2/5] read-cache: load cache extensions on a worker thread Ben Peart
2018-09-15 10:22     ` Duy Nguyen
2018-09-15 10:24       ` Duy Nguyen
2018-09-17 16:38         ` Ben Peart
2018-09-15 16:23       ` Duy Nguyen
2018-09-17 17:19         ` Junio C Hamano
2018-09-17 16:26       ` Ben Peart
2018-09-17 16:45         ` Duy Nguyen
2018-09-17 21:32       ` Junio C Hamano
2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
2018-09-15 10:31     ` Duy Nguyen
2018-09-17 17:25       ` Ben Peart
2018-09-15 11:07     ` Duy Nguyen
2018-09-15 11:09       ` Duy Nguyen
2018-09-17 18:52         ` Ben Peart
2018-09-15 11:29     ` Duy Nguyen
2018-09-12 16:18   ` [PATCH v5 4/5] read-cache.c: optimize reading index format v4 Ben Peart
2018-09-12 16:18   ` [PATCH v5 5/5] read-cache: clean up casting and byte decoding Ben Peart
2018-09-26 19:54 ` [PATCH v6 0/7] speed up index load through parallelization Ben Peart
2018-09-26 19:54   ` [PATCH v6 1/7] read-cache.c: optimize reading index format v4 Ben Peart
2018-09-26 19:54   ` [PATCH v6 2/7] read-cache: clean up casting and byte decoding Ben Peart
2018-09-26 19:54   ` [PATCH v6 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-09-28  0:19     ` SZEDER Gábor
2018-09-28 18:38       ` Ben Peart
2018-09-29  0:51     ` SZEDER Gábor
2018-09-29  5:45     ` Duy Nguyen
2018-09-29 18:24       ` Junio C Hamano
2018-09-26 19:54   ` [PATCH v6 4/7] config: add new index.threads config setting Ben Peart
2018-09-28  0:26     ` SZEDER Gábor
2018-09-28 13:39       ` Ben Peart
2018-09-28 17:07         ` Junio C Hamano
2018-09-28 19:41           ` Ben Peart
2018-09-28 20:30             ` Ramsay Jones
2018-09-28 22:15               ` Junio C Hamano
2018-10-01 13:17                 ` Ben Peart
2018-10-01 15:06                   ` SZEDER Gábor
2018-09-26 19:54   ` [PATCH v6 5/7] read-cache: load cache extensions on a worker thread Ben Peart
2018-09-26 19:54   ` [PATCH v6 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
2018-09-26 19:54   ` [PATCH v6 7/7] read-cache: load cache entries on worker threads Ben Peart
2018-09-26 22:06   ` [PATCH v6 0/7] speed up index load through parallelization Junio C Hamano
2018-09-27 17:13   ` Duy Nguyen
2018-10-01 13:45 ` [PATCH v7 " Ben Peart
2018-10-01 13:45   ` [PATCH v7 1/7] read-cache.c: optimize reading index format v4 Ben Peart
2018-10-01 13:45   ` [PATCH v7 2/7] read-cache: clean up casting and byte decoding Ben Peart
2018-10-01 15:10     ` Duy Nguyen
2018-10-01 13:45   ` [PATCH v7 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-10-01 15:17     ` SZEDER Gábor
2018-10-02 14:34       ` Ben Peart
2018-10-01 15:30     ` Duy Nguyen
2018-10-02 15:13       ` Ben Peart
2018-10-01 13:45   ` [PATCH v7 4/7] config: add new index.threads config setting Ben Peart
2018-10-01 13:45   ` [PATCH v7 5/7] read-cache: load cache extensions on a worker thread Ben Peart
2018-10-01 15:50     ` Duy Nguyen
2018-10-02 15:00       ` Ben Peart
2018-10-01 13:45   ` [PATCH v7 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
2018-10-01 16:27     ` Duy Nguyen
2018-10-02 16:34       ` Ben Peart
2018-10-02 17:02         ` Duy Nguyen
2018-10-01 13:45   ` [PATCH v7 7/7] read-cache: load cache entries on worker threads Ben Peart
2018-10-01 17:09     ` Duy Nguyen
2018-10-02 19:09       ` Ben Peart
2018-10-10 15:59 ` [PATCH v8 0/7] speed up index load through parallelization Ben Peart
2018-10-10 15:59   ` [PATCH v8 1/7] read-cache.c: optimize reading index format v4 Ben Peart
2018-10-10 15:59   ` [PATCH v8 2/7] read-cache: clean up casting and byte decoding Ben Peart
2018-10-10 15:59   ` [PATCH v8 3/7] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-10-10 15:59   ` [PATCH v8 4/7] config: add new index.threads config setting Ben Peart
2018-10-10 15:59   ` [PATCH v8 5/7] read-cache: load cache extensions on a worker thread Ben Peart
2018-10-10 15:59   ` [PATCH v8 6/7] ieot: add Index Entry Offset Table (IEOT) extension Ben Peart
2018-10-10 15:59   ` [PATCH v8 7/7] read-cache: load cache entries on worker threads Ben Peart
2018-10-19 16:11     ` Jeff King
2018-10-22  2:14       ` Junio C Hamano
2018-10-22 14:40         ` Ben Peart
2018-10-12  3:18   ` [PATCH v8 0/7] speed up index load through parallelization Junio C Hamano
2018-10-14 12:28   ` Duy Nguyen
2018-10-15 17:33     ` Ben Peart
2018-11-13  0:38   ` [PATCH 0/3] Avoid confusing messages from new index extensions (Re: [PATCH v8 0/7] speed up index load through parallelization) Jonathan Nieder
2018-11-13  0:39     ` [PATCH 1/3] eoie: default to not writing EOIE section Jonathan Nieder
2018-11-13  1:05       ` Junio C Hamano
2018-11-13  0:39     ` [PATCH 2/3] ieot: default to not writing IEOT section Jonathan Nieder
2018-11-13  0:58       ` Jonathan Tan
2018-11-13  1:09       ` Junio C Hamano
2018-11-13  1:12         ` Jonathan Nieder
2018-11-13  0:40     ` [PATCH 3/3] index: do not warn about unrecognized extensions Jonathan Nieder
2018-11-13  1:10       ` Junio C Hamano

git@vger.kernel.org mailing list mirror (one of many)

Archives are clonable:
	git clone --mirror https://public-inbox.org/git
	git clone --mirror http://ou63pmih66umazou.onion/git
	git clone --mirror http://czquwvybam4bgbro.onion/git
	git clone --mirror http://hjrcffqmbrq6wope.onion/git

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.version-control.git
	nntp://ou63pmih66umazou.onion/inbox.comp.version-control.git
	nntp://czquwvybam4bgbro.onion/inbox.comp.version-control.git
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.version-control.git
	nntp://news.gmane.org/gmane.comp.version-control.git

 note: .onion URLs require Tor: https://www.torproject.org/
       or Tor2web: https://www.tor2web.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox