git@vger.kernel.org mailing list mirror (one of many)
 help / Atom feed
* [PATCH v1] read-cache: speed up index load through parallelization
@ 2018-08-23 15:41 Ben Peart
  2018-08-23 17:31 ` Stefan Beller
                   ` (5 more replies)
  0 siblings, 6 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-23 15:41 UTC (permalink / raw)
  To: git; +Cc: gitster, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them. Once the
threads are complete and the cache entries are loaded, the rest of the
extensions can be loaded and processed normally on the primary thread.

Performance impact:

read cache .git/index times on a synthetic repo with:

100,000 entries
FALSE       TRUE        Savings     %Savings
0.014798767 0.009580433 0.005218333 35.26%

1,000,000 entries
FALSE       TRUE        Savings     %Savings
0.240896533 0.1751243   0.065772233 27.30%

read cache .git/index times on an actual repo with:

~3M entries
FALSE       TRUE        Savings     %Savings
0.59898098  0.4513169   0.14766408  24.65%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---

Notes:
    Base Ref: master
    Web-Diff: https://github.com/benpeart/git/commit/67a700419b
    Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v1 && git checkout 67a700419b

 Documentation/config.txt |   8 ++
 config.c                 |  13 +++
 config.h                 |   1 +
 read-cache.c             | 218 ++++++++++++++++++++++++++++++++++-----
 4 files changed, 216 insertions(+), 24 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1c42364988..3344685cc4 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -899,6 +899,14 @@ relatively high IO latencies.  When enabled, Git will do the
 index comparison to the filesystem data in parallel, allowing
 overlapping IO's.  Defaults to true.
 
+core.fastIndex::
+       Enable parallel index loading
++
+This can speed up operations like 'git diff' and 'git status' especially
+when the index is very large.  When enabled, Git will do the index
+loading from the on disk format to the in-memory format in parallel.
+Defaults to true.
+
 core.createObject::
 	You can set this to 'link', in which case a hardlink followed by
 	a delete of the source are used to make sure that object creation
diff --git a/config.c b/config.c
index 9a0b10d4bc..883092fdd3 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,19 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+int git_config_get_fast_index(void)
+{
+	int val;
+
+	if (!git_config_get_maybe_bool("core.fastindex", &val))
+		return val;
+
+	if (getenv("GIT_FASTINDEX_TEST"))
+		return 1;
+
+	return -1; /* default value */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..74ca4e7db5 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_fast_index(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..0fa7e1a04c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -24,6 +24,10 @@
 #include "utf8.h"
 #include "fsmonitor.h"
 
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
 /* Mask for the name length in ce_flags in the on-disk index */
 
 #define CE_NAMEMASK  (0x0fff)
@@ -1889,16 +1893,203 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+static unsigned long load_cache_entry_block(struct index_state *istate, struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap, unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool, 0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifdef NO_PTHREADS
+
+#define load_cache_entries load_all_cache_entries
+
+#else
+
+#include "thread-utils.h"
+
+/*
+* Mostly randomly chosen maximum thread counts: we
+* cap the parallelism to online_cpus() threads, and we want
+* to have at least 7500 cache entries per thread for it to
+* be worth starting a thread.
+*/
+#define THREAD_COST		(7500)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+* A thread proc to run the load_cache_entries() computation
+* across multiple background threads.
+*/
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int threads, cpus, thread_nr;
+	unsigned long consumed;
+	int i, thread;
+
+	cpus = online_cpus();
+	threads = istate->cache_nr / THREAD_COST;
+	if (threads > cpus)
+		threads = cpus;
+
+	/* enable testing with fewer than default minimum of entries */
+	if ((istate->cache_nr > 1) && (threads < 2) && getenv("GIT_FASTINDEX_TEST"))
+		threads = 2;
+
+	if (threads < 2 || !git_config_get_fast_index())
+		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	thread_nr = (istate->cache_nr + threads - 1) / threads;
+	data = xcalloc(threads, sizeof(struct load_cache_entries_thread_data));
+
+	/* loop through index entries starting a thread for every thread_nr entries */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/* we've reached the begining of a block of cache entries, kick off a thread to process them */
+		if (0 == i % thread_nr) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = min(thread_nr, istate->cache_nr - i);
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+						  estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+						  estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+			if (++thread == threads || p->nr != thread_nr)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1935,29 +2126,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
 	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
+	src_offset += load_cache_entries(istate, mmap, mmap_size, src_offset);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 

base-commit: 29d9e3e2c47dd4b5053b0a98c891878d398463e3
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
@ 2018-08-23 17:31 ` Stefan Beller
  2018-08-23 19:44   ` Ben Peart
  2018-08-24 18:40   ` Duy Nguyen
  2018-08-23 18:06 ` Junio C Hamano
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 87+ messages in thread
From: Stefan Beller @ 2018-08-23 17:31 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, Junio C Hamano

On Thu, Aug 23, 2018 at 8:45 AM Ben Peart <Ben.Peart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by creating
> multiple threads to divide the work of loading and converting the cache
> entries across all available CPU cores.
>
> It accomplishes this by having the primary thread loop across the index file
> tracking the offset and (for V4 indexes) expanding the name. It creates a
> thread to process each block of entries as it comes to them. Once the
> threads are complete and the cache entries are loaded, the rest of the
> extensions can be loaded and processed normally on the primary thread.
>
> Performance impact:
>
> read cache .git/index times on a synthetic repo with:
>
> 100,000 entries
> FALSE       TRUE        Savings     %Savings
> 0.014798767 0.009580433 0.005218333 35.26%
>
> 1,000,000 entries
> FALSE       TRUE        Savings     %Savings
> 0.240896533 0.1751243   0.065772233 27.30%
>
> read cache .git/index times on an actual repo with:
>
> ~3M entries
> FALSE       TRUE        Savings     %Savings
> 0.59898098  0.4513169   0.14766408  24.65%
>
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---
>
> Notes:
>     Base Ref: master
>     Web-Diff: https://github.com/benpeart/git/commit/67a700419b
>     Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v1 && git checkout 67a700419b
>
>  Documentation/config.txt |   8 ++
>  config.c                 |  13 +++
>  config.h                 |   1 +
>  read-cache.c             | 218 ++++++++++++++++++++++++++++++++++-----
>  4 files changed, 216 insertions(+), 24 deletions(-)
>
> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 1c42364988..3344685cc4 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -899,6 +899,14 @@ relatively high IO latencies.  When enabled, Git will do the
>  index comparison to the filesystem data in parallel, allowing
>  overlapping IO's.  Defaults to true.
>
> +core.fastIndex::
> +       Enable parallel index loading
> ++
> +This can speed up operations like 'git diff' and 'git status' especially
> +when the index is very large.  When enabled, Git will do the index
> +loading from the on disk format to the in-memory format in parallel.
> +Defaults to true.

"fast" is a non-descriptive word as we try to be fast in any operation?
Maybe core.parallelIndexReading as that just describes what it
turns on/off, without second guessing its effects?
(Are there still computers with just a single CPU, where this would not
make it faster? ;-))


> +int git_config_get_fast_index(void)
> +{
> +       int val;
> +
> +       if (!git_config_get_maybe_bool("core.fastindex", &val))
> +               return val;
> +
> +       if (getenv("GIT_FASTINDEX_TEST"))
> +               return 1;

We look at this env value just before calling this function;
can we write it to only look at the env variable once?

> +++ b/config.h
> @@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
>  extern int git_config_get_split_index(void);
>  extern int git_config_get_max_percent_split_change(void);
>  extern int git_config_get_fsmonitor(void);
> +extern int git_config_get_fast_index(void);

Oh. nd/no-extern did not cover config.h


>
> +#ifndef min
> +#define min(a,b) (((a) < (b)) ? (a) : (b))
> +#endif

We do not have a minimum function in the tree,
except for xdiff/xmacros.h:29: XDL_MIN.
I wonder what the rationale is for not having a MIN()
definition, I think we discussed that on the list a couple
times but the rationale escaped me.

If we introduce a min/max macro, can we put it somewhere
more prominent? (I would find it useful elsewhere)

> +/*
> +* Mostly randomly chosen maximum thread counts: we
> +* cap the parallelism to online_cpus() threads, and we want
> +* to have at least 7500 cache entries per thread for it to
> +* be worth starting a thread.
> +*/
> +#define THREAD_COST            (7500)

This reads very similar to preload-index.c THREAD_COST

> +       /* loop through index entries starting a thread for every thread_nr entries */
> +       consumed = thread = 0;
> +       for (i = 0; ; i++) {
> +               struct ondisk_cache_entry *ondisk;
> +               const char *name;
> +               unsigned int flags;
> +
> +               /* we've reached the begining of a block of cache entries, kick off a thread to process them */

beginning

Thanks,
Stefan

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
  2018-08-23 17:31 ` Stefan Beller
@ 2018-08-23 18:06 ` Junio C Hamano
  2018-08-23 20:33   ` Ben Peart
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 87+ messages in thread
From: Junio C Hamano @ 2018-08-23 18:06 UTC (permalink / raw)
  To: Ben Peart; +Cc: git

Ben Peart <Ben.Peart@microsoft.com> writes:

> This patch helps address the CPU cost of loading the index by creating
> multiple threads to divide the work of loading and converting the cache
> entries across all available CPU cores.

Nice.

> +int git_config_get_fast_index(void)
> +{
> +	int val;
> +
> +	if (!git_config_get_maybe_bool("core.fastindex", &val))
> +		return val;
> +
> +	if (getenv("GIT_FASTINDEX_TEST"))
> +		return 1;

It probably makes sense to use git_env_bool() to be consistent,
which allows GIT_FASTINDEX_TEST=0 to turn it off after this becomes
the default.

> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..0fa7e1a04c 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -24,6 +24,10 @@
>  #include "utf8.h"
>  #include "fsmonitor.h"
>  
> +#ifndef min
> +#define min(a,b) (((a) < (b)) ? (a) : (b))
> +#endif

Let's lose this, which is used only once, even though it could be
used elsewhere but not used (e.g. threads vs cpus near the beginning
of load_cache_entries()).

> +static unsigned long load_cache_entry_block(struct index_state *istate, struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap, unsigned long start_offset, struct strbuf *previous_name)

Wrap and possibly add comment before the function to describe what
it does and what its parameters mean?

> +{
> +	int i;
> +	unsigned long src_offset = start_offset;
> +
> +	for (i = offset; i < offset + nr; i++) {
> +		struct ondisk_cache_entry *disk_ce;
> +		struct cache_entry *ce;
> +		unsigned long consumed;
> +
> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
> +		set_index_entry(istate, i, ce);
> +
> +		src_offset += consumed;
> +	}
> +	return src_offset - start_offset;
> +}

OK.

> +static unsigned long load_all_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
> +{

(following aloud) This "all" variant is "one thread does all", iow,
unthreaded version.  Makes sense.

> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	unsigned long consumed;
> +
> +	if (istate->version == 4) {
> +		previous_name = &previous_name_buf;
> +		mem_pool_init(&istate->ce_mem_pool,
> +			      estimate_cache_size_from_compressed(istate->cache_nr));
> +	} else {
> +		previous_name = NULL;
> +		mem_pool_init(&istate->ce_mem_pool,
> +			      estimate_cache_size(mmap_size, istate->cache_nr));
> +	}

I count there are three instances of "if version 4 use the strbuf
for name-buf, otherwise..." in this patch, which made me wonder if
we can make them shared more and/or if it makes sense to attempt to
do so.

> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool, 0, istate->cache_nr, mmap, src_offset, previous_name);
> +	strbuf_release(&previous_name_buf);
> +	return consumed;
> +}
> +
> +#ifdef NO_PTHREADS
> +
> +#define load_cache_entries load_all_cache_entries
> +
> +#else
> +
> +#include "thread-utils.h"
> +
> +/*
> +* Mostly randomly chosen maximum thread counts: we
> +* cap the parallelism to online_cpus() threads, and we want
> +* to have at least 7500 cache entries per thread for it to
> +* be worth starting a thread.
> +*/
> +#define THREAD_COST		(7500)
> +
> +struct load_cache_entries_thread_data
> +{
> +	pthread_t pthread;
> +	struct index_state *istate;
> +	struct mem_pool *ce_mem_pool;
> +	int offset, nr;
> +	void *mmap;
> +	unsigned long start_offset;
> +	struct strbuf previous_name_buf;
> +	struct strbuf *previous_name;
> +	unsigned long consumed;	/* return # of bytes in index file processed */
> +};
> +
> +/*
> +* A thread proc to run the load_cache_entries() computation
> +* across multiple background threads.
> +*/
> +static void *load_cache_entries_thread(void *_data)
> +{
> +	struct load_cache_entries_thread_data *p = _data;
> +
> +	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
> +	return NULL;
> +}

(following aloud) And the threaded version chews the block of ce's
given to each thread.  Makes sense.

> +static unsigned long load_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	struct load_cache_entries_thread_data *data;
> +	int threads, cpus, thread_nr;
> +	unsigned long consumed;
> +	int i, thread;
> +
> +	cpus = online_cpus();
> +	threads = istate->cache_nr / THREAD_COST;
> +	if (threads > cpus)
> +		threads = cpus;

No other caller of online_cpus() is prepared to deal with faulty
return from the function (e.g. 0 or negative), so it is perfectly
fine for this caller to trust it would return at least 1.  OK.

Not using min() and it still is very readable ;-).

> +	/* enable testing with fewer than default minimum of entries */
> +	if ((istate->cache_nr > 1) && (threads < 2) && getenv("GIT_FASTINDEX_TEST"))
> +		threads = 2;

Another good place to use git_env_bool().

> +	if (threads < 2 || !git_config_get_fast_index())
> +		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);

config_get_fast_index() can return -1 to signal "no strong
preference either way".  A caller that negates the value without
paying special attention to negative return makes the reader wonder
if the code is buggy or actively interpreting "do not care" as "I do
not mind if you use it" (it is the latter in this case).

I actually think git_config_get_fast_index() is a helper that does a
bit too little.  Perhaps the above two if() statements can be
combined into a single call to

	threads = use_fast_index(istate);
	if (threads < 2)
		return load_all_cache_entries(...);

and let it call online_cpus(), determination of thread-count taking
THREADS_COST into account, and also reading the configuration
variable?  The configuration variable might even want to say how
many threads it wants to cap us at maximum in the future.

> +	mem_pool_init(&istate->ce_mem_pool, 0);
> +	if (istate->version == 4)
> +		previous_name = &previous_name_buf;
> +	else
> +		previous_name = NULL;
> +
> +	thread_nr = (istate->cache_nr + threads - 1) / threads;

(following aloud) threads is the number of threads that we are going
to spawn.  thread_nr is not any number about threads---it is number
of cache entries each thread will work on.  The latter is
confusingly named.

ce_per_thread perhaps?

As the division is rounded up, among "threads" threads, we know we
will cover all "cache_nr" cache entries.  The last thread may handle
fewer than "thread_nr" entries, or even just a single entry in the
worst case.

When cache_nr == 1 and FASTINDEX_TEST tells us to use threads == 2,
then thread_nr = (1 + 2 - 1) / 2 = 1.

The first one in the loop is given (offset, nr) = (0, 1) in the loop
The second one is given (offset, nr) = (1, 0) in the loop.  Two
questions come to mind:

 - Is load_cache_entries_thread() prepared to be given offset that
   is beyond the end of istate->cache[] and become a no-op?

 - Does the next loop even terminate without running beyond the end
   of istate->cache[]?

> +	data = xcalloc(threads, sizeof(struct load_cache_entries_thread_data));
> +
> +	/* loop through index entries starting a thread for every thread_nr entries */
> +	consumed = thread = 0;
> +	for (i = 0; ; i++) {

Uncapped for() loop makes readers a bit nervous.
An extra "i < istate->cache_nr" would not hurt, perhaps?

> +		struct ondisk_cache_entry *ondisk;
> +		const char *name;
> +		unsigned int flags;
> +
> +		/* we've reached the begining of a block of cache entries, kick off a thread to process them */
> +		if (0 == i % thread_nr) {
> +			struct load_cache_entries_thread_data *p = &data[thread];
> +
> +			p->istate = istate;
> +			p->offset = i;
> +			p->nr = min(thread_nr, istate->cache_nr - i);

(following aloud) p->nr is the number of entries this thread will
work on.

> +			/* create a mem_pool for each thread */
> +			if (istate->version == 4)
> +				mem_pool_init(&p->ce_mem_pool,
> +						  estimate_cache_size_from_compressed(p->nr));
> +			else
> +				mem_pool_init(&p->ce_mem_pool,
> +						  estimate_cache_size(mmap_size, p->nr));
> +
> +			p->mmap = mmap;
> +			p->start_offset = src_offset;
> +			if (previous_name) {
> +				strbuf_addbuf(&p->previous_name_buf, previous_name);
> +				p->previous_name = &p->previous_name_buf;
> +			}
> +
> +			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
> +				die("unable to create load_cache_entries_thread");
> +			if (++thread == threads || p->nr != thread_nr)
> +				break;
> +		}
> +
> +		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +
> +		/* On-disk flags are just 16 bits */
> +		flags = get_be16(&ondisk->flags);
> +
> +		if (flags & CE_EXTENDED) {
> +			struct ondisk_cache_entry_extended *ondisk2;
> +			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
> +			name = ondisk2->name;
> +		} else
> +			name = ondisk->name;
> +
> +		if (!previous_name) {
> +			size_t len;
> +
> +			/* v3 and earlier */
> +			len = flags & CE_NAMEMASK;
> +			if (len == CE_NAMEMASK)
> +				len = strlen(name);
> +			src_offset += (flags & CE_EXTENDED) ?
> +				ondisk_cache_entry_extended_size(len) :
> +				ondisk_cache_entry_size(len);
> +		} else
> +			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);

Nice to see this done without a new index extension that records
offsets, so that we can load existing index files in parallel.

> +	}
> +
> +	for (i = 0; i < threads; i++) {
> +		struct load_cache_entries_thread_data *p = data + i;
> +		if (pthread_join(p->pthread, NULL))
> +			die("unable to join load_cache_entries_thread");
> +		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
> +		strbuf_release(&p->previous_name_buf);
> +		consumed += p->consumed;
> +	}
> +
> +	free(data);
> +	strbuf_release(&previous_name_buf);
> +
> +	return consumed;
> +}
> +
> +#endif

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 17:31 ` Stefan Beller
@ 2018-08-23 19:44   ` Ben Peart
  2018-08-24 18:40   ` Duy Nguyen
  1 sibling, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-23 19:44 UTC (permalink / raw)
  To: Stefan Beller, Ben Peart; +Cc: git, Junio C Hamano



On 8/23/2018 1:31 PM, Stefan Beller wrote:
> On Thu, Aug 23, 2018 at 8:45 AM Ben Peart <Ben.Peart@microsoft.com> wrote:
>>
>> This patch helps address the CPU cost of loading the index by creating
>> multiple threads to divide the work of loading and converting the cache
>> entries across all available CPU cores.
>>
>> It accomplishes this by having the primary thread loop across the index file
>> tracking the offset and (for V4 indexes) expanding the name. It creates a
>> thread to process each block of entries as it comes to them. Once the
>> threads are complete and the cache entries are loaded, the rest of the
>> extensions can be loaded and processed normally on the primary thread.
>>
>> Performance impact:
>>
>> read cache .git/index times on a synthetic repo with:
>>
>> 100,000 entries
>> FALSE       TRUE        Savings     %Savings
>> 0.014798767 0.009580433 0.005218333 35.26%
>>
>> 1,000,000 entries
>> FALSE       TRUE        Savings     %Savings
>> 0.240896533 0.1751243   0.065772233 27.30%
>>
>> read cache .git/index times on an actual repo with:
>>
>> ~3M entries
>> FALSE       TRUE        Savings     %Savings
>> 0.59898098  0.4513169   0.14766408  24.65%
>>
>> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
>> ---
>>
>> Notes:
>>      Base Ref: master
>>      Web-Diff: https://github.com/benpeart/git/commit/67a700419b
>>      Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v1 && git checkout 67a700419b
>>
>>   Documentation/config.txt |   8 ++
>>   config.c                 |  13 +++
>>   config.h                 |   1 +
>>   read-cache.c             | 218 ++++++++++++++++++++++++++++++++++-----
>>   4 files changed, 216 insertions(+), 24 deletions(-)
>>
>> diff --git a/Documentation/config.txt b/Documentation/config.txt
>> index 1c42364988..3344685cc4 100644
>> --- a/Documentation/config.txt
>> +++ b/Documentation/config.txt
>> @@ -899,6 +899,14 @@ relatively high IO latencies.  When enabled, Git will do the
>>   index comparison to the filesystem data in parallel, allowing
>>   overlapping IO's.  Defaults to true.
>>
>> +core.fastIndex::
>> +       Enable parallel index loading
>> ++
>> +This can speed up operations like 'git diff' and 'git status' especially
>> +when the index is very large.  When enabled, Git will do the index
>> +loading from the on disk format to the in-memory format in parallel.
>> +Defaults to true.
> 
> "fast" is a non-descriptive word as we try to be fast in any operation?
> Maybe core.parallelIndexReading as that just describes what it
> turns on/off, without second guessing its effects?
> (Are there still computers with just a single CPU, where this would not
> make it faster? ;-))
> 

How about core.parallelReadIndex?  Slightly shorter and matches the 
function names better.

> 
>> +int git_config_get_fast_index(void)
>> +{
>> +       int val;
>> +
>> +       if (!git_config_get_maybe_bool("core.fastindex", &val))
>> +               return val;
>> +
>> +       if (getenv("GIT_FASTINDEX_TEST"))
>> +               return 1;
> 
> We look at this env value just before calling this function,
> can be write it to only look at the evn variable once?
> 

Sure, I didn't like the fact that it was called twice but didn't get 
around to cleaning it up.

>> +++ b/config.h
>> @@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
>>   extern int git_config_get_split_index(void);
>>   extern int git_config_get_max_percent_split_change(void);
>>   extern int git_config_get_fsmonitor(void);
>> +extern int git_config_get_fast_index(void);
> 
> Oh. nd/no-extern did not cover config.h
> 
> 
>>
>> +#ifndef min
>> +#define min(a,b) (((a) < (b)) ? (a) : (b))
>> +#endif
> 
> We do not have a minimum function in the tree,
> except for xdiff/xmacros.h:29: XDL_MIN.
> I wonder what the rationale is for not having a MIN()
> definition, I think we discussed that on the list a couple
> times but the rationale escaped me.
> 
> If we introduce a min/max macro, can we put it somewhere
> more prominent? (I would find it useful elsewhere)
>

I'll avoid that particular rabbit hole and just remove the min macro 
definition.  ;-)

>> +/*
>> +* Mostly randomly chosen maximum thread counts: we
>> +* cap the parallelism to online_cpus() threads, and we want
>> +* to have at least 7500 cache entries per thread for it to
>> +* be worth starting a thread.
>> +*/
>> +#define THREAD_COST            (7500)
> 
> This reads very similar to preload-index.c THREAD_COST
> 
>> +       /* loop through index entries starting a thread for every thread_nr entries */
>> +       consumed = thread = 0;
>> +       for (i = 0; ; i++) {
>> +               struct ondisk_cache_entry *ondisk;
>> +               const char *name;
>> +               unsigned int flags;
>> +
>> +               /* we've reached the begining of a block of cache entries, kick off a thread to process them */
> 
> beginning
> 

Thanks

> Thanks,
> Stefan
> 

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 18:06 ` Junio C Hamano
@ 2018-08-23 20:33   ` Ben Peart
  2018-08-24 15:37     ` Duy Nguyen
  0 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-08-23 20:33 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git



On 8/23/2018 2:06 PM, Junio C Hamano wrote:
> Ben Peart <Ben.Peart@microsoft.com> writes:
> 
>> This patch helps address the CPU cost of loading the index by creating
>> multiple threads to divide the work of loading and converting the cache
>> entries across all available CPU cores.
> 
> Nice.
> 
>> +int git_config_get_fast_index(void)
>> +{
>> +	int val;
>> +
>> +	if (!git_config_get_maybe_bool("core.fastindex", &val))
>> +		return val;
>> +
>> +	if (getenv("GIT_FASTINDEX_TEST"))
>> +		return 1;
> 
> It probably makes sense to use git_env_bool() to be consistent,
> which allows GIT_FASTINDEX_TEST=0 to turn it off after this becomes
> the default.
> 
>> diff --git a/read-cache.c b/read-cache.c
>> index 7b1354d759..0fa7e1a04c 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -24,6 +24,10 @@
>>   #include "utf8.h"
>>   #include "fsmonitor.h"
>>   
>> +#ifndef min
>> +#define min(a,b) (((a) < (b)) ? (a) : (b))
>> +#endif
> 
> Let's lose this, which is used only once, even though it could be
> used elsewhere but not used (e.g. threads vs cpus near the beginning
> of load_cache_entries()).
> 

I didn't have it, then added it to make it trivial to see what was 
actually happening.  I can switch back.

>> +static unsigned long load_cache_entry_block(struct index_state *istate, struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap, unsigned long start_offset, struct strbuf *previous_name)
> 
> Wrap and possibly add comment before the function to describe what
> it does and what its parameters mean?
> 
>> +{
>> +	int i;
>> +	unsigned long src_offset = start_offset;
>> +
>> +	for (i = offset; i < offset + nr; i++) {
>> +		struct ondisk_cache_entry *disk_ce;
>> +		struct cache_entry *ce;
>> +		unsigned long consumed;
>> +
>> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
>> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
>> +		set_index_entry(istate, i, ce);
>> +
>> +		src_offset += consumed;
>> +	}
>> +	return src_offset - start_offset;
>> +}
> 
> OK.
> 
>> +static unsigned long load_all_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
>> +{
> 
> (following aloud) This "all" variant is "one thread does all", iow,
> unthreaded version.  Makes sense.
> 
>> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>> +	unsigned long consumed;
>> +
>> +	if (istate->version == 4) {
>> +		previous_name = &previous_name_buf;
>> +		mem_pool_init(&istate->ce_mem_pool,
>> +			      estimate_cache_size_from_compressed(istate->cache_nr));
>> +	} else {
>> +		previous_name = NULL;
>> +		mem_pool_init(&istate->ce_mem_pool,
>> +			      estimate_cache_size(mmap_size, istate->cache_nr));
>> +	}
> 
> I count there are three instances of "if version 4 use the strbuf
> for name-buf, otherwise..." in this patch, which made me wonder if
> we can make them shared more and/or if it makes sense to attempt to
> do so.
> 

Actually, they are all different and all required.  One sets it up for 
the "do it all on one thread" path.  One sets it up for each thread. The 
last one is used by the primary thread when scanning for blocks to hand 
off to the child threads.

>> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool, 0, istate->cache_nr, mmap, src_offset, previous_name);
>> +	strbuf_release(&previous_name_buf);
>> +	return consumed;
>> +}
>> +
>> +#ifdef NO_PTHREADS
>> +
>> +#define load_cache_entries load_all_cache_entries
>> +
>> +#else
>> +
>> +#include "thread-utils.h"
>> +
>> +/*
>> +* Mostly randomly chosen maximum thread counts: we
>> +* cap the parallelism to online_cpus() threads, and we want
>> +* to have at least 7500 cache entries per thread for it to
>> +* be worth starting a thread.
>> +*/
>> +#define THREAD_COST		(7500)
>> +
>> +struct load_cache_entries_thread_data
>> +{
>> +	pthread_t pthread;
>> +	struct index_state *istate;
>> +	struct mem_pool *ce_mem_pool;
>> +	int offset, nr;
>> +	void *mmap;
>> +	unsigned long start_offset;
>> +	struct strbuf previous_name_buf;
>> +	struct strbuf *previous_name;
>> +	unsigned long consumed;	/* return # of bytes in index file processed */
>> +};
>> +
>> +/*
>> +* A thread proc to run the load_cache_entries() computation
>> +* across multiple background threads.
>> +*/
>> +static void *load_cache_entries_thread(void *_data)
>> +{
>> +	struct load_cache_entries_thread_data *p = _data;
>> +
>> +	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
>> +	return NULL;
>> +}
> 
> (following aloud) And the threaded version chews the block of ce's
> given to each thread.  Makes sense.
> 
>> +static unsigned long load_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
>> +{
>> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>> +	struct load_cache_entries_thread_data *data;
>> +	int threads, cpus, thread_nr;
>> +	unsigned long consumed;
>> +	int i, thread;
>> +
>> +	cpus = online_cpus();
>> +	threads = istate->cache_nr / THREAD_COST;
>> +	if (threads > cpus)
>> +		threads = cpus;
> 
> No other caller of online_cpus() is prepared to deal with faulty
> return from the function (e.g. 0 or negative), so it is perfectly
> fine for this caller to trust it would return at least 1.  OK.
> 
> Not using min() and it still is very readable ;-).
> 
>> +	/* enable testing with fewer than default minimum of entries */
>> +	if ((istate->cache_nr > 1) && (threads < 2) && getenv("GIT_FASTINDEX_TEST"))
>> +		threads = 2;
> 
> Another good place to use git_env_bool().
> 
>> +	if (threads < 2 || !git_config_get_fast_index())
>> +		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
> 
> config_get_fast_index() can return -1 to signal "no strong
> preference either way".  A caller that negates the value without
> paying special attention to negative return makes the reader wonder
> if the code is buggy or actively interpreting "do not care" as "I do
> not mind if you use it" (it is the latter in this case).
> 
> I actually think git_config_get_fast_index() is a helper that does a
> bit too little.  Perhaps the above two if() statements can be
> combined into a single call to
> 
> 	threads = use_fast_index(istate);
> 	if (threads < 2)
> 		return load_all_cache_entries(...);
> 
> and let it call online_cpus(), determination of thread-count taking
> THREADS_COST into account, and also reading the configuration
> variable?  The configuration variable might even want to say how
> many threads it wants to cap us at maximum in the future.
> 

I reworked this a bit.

git_config_get_parallel_read_index() still just deals with the config 
value (I had to read it this way as in some code paths, the global 
config settings in environment.c haven't been read yet).

All the logic about whether to use threads and how many to use is 
centralized here along with the environment variable to override the 
default behavior.

>> +	mem_pool_init(&istate->ce_mem_pool, 0);
>> +	if (istate->version == 4)
>> +		previous_name = &previous_name_buf;
>> +	else
>> +		previous_name = NULL;
>> +
>> +	thread_nr = (istate->cache_nr + threads - 1) / threads;
> 
> (following aloud) threads is the number of threads that we are going
> to spawn.  thread_nr is not any number about threads---it is number
> of cache entries each thread will work on.  The latter is
> confusingly named.
> 
> ce_per_thread perhaps?
> 

Sure

> As the division is rounded up, among "threads" threads, we know we
> will cover all "cache_nr" cache entries.  The last thread may handle
> fewer than "thread_nr" entries, or even just a single entry in the
> worst case.
> 

It's divided by the number of threads so will only be up to 1 less than 
the other threads.  Given the minimum # of entries per thread is 7500, 
you'd never end up with just a single entry (unless using the 
GIT_PARALLELREADINDEX_TEST override).

> When cache_nr == 1 and FASTINDEX_TEST tells us to use threads == 2,
> then thread_nr = (1 + 2 - 1) / 2 = 1.
> 
> The first one in the loop is given (offset, nr) = (0, 1) in the loop
> The second one is given (offset, nr) = (1, 0) in the loop.  Two
> questions come to mind:
> 
>   - Is load_cache_entries_thread() prepared to be given offset that
>     is beyond the end of istate->cache[] and become a no-op?
> 
>   - Does the next loop even terminate without running beyond the end
>     of istate->cache[]?
> 
>> +	data = xcalloc(threads, sizeof(struct load_cache_entries_thread_data));
>> +
>> +	/* loop through index entries starting a thread for every thread_nr entries */
>> +	consumed = thread = 0;
>> +	for (i = 0; ; i++) {
> 
> Uncapped for() loop makes readers a bit nervous.
> An extra "i < istate->cache_nr" would not hurt, perhaps?
> 

We don't need or want to run through _all_ the entries, only to the 
first entry of the last block.  I'd prefer to leave that extra test out 
as it implies that we are going to loop through them all. I'll add a 
comment to make it more obvious what is happening.

>> +		struct ondisk_cache_entry *ondisk;
>> +		const char *name;
>> +		unsigned int flags;
>> +
>> +		/* we've reached the begining of a block of cache entries, kick off a thread to process them */
>> +		if (0 == i % thread_nr) {
>> +			struct load_cache_entries_thread_data *p = &data[thread];
>> +
>> +			p->istate = istate;
>> +			p->offset = i;
>> +			p->nr = min(thread_nr, istate->cache_nr - i);
> 
> (following aloud) p->nr is the number of entries this thread will
> work on.
> 
>> +			/* create a mem_pool for each thread */
>> +			if (istate->version == 4)
>> +				mem_pool_init(&p->ce_mem_pool,
>> +						  estimate_cache_size_from_compressed(p->nr));
>> +			else
>> +				mem_pool_init(&p->ce_mem_pool,
>> +						  estimate_cache_size(mmap_size, p->nr));
>> +
>> +			p->mmap = mmap;
>> +			p->start_offset = src_offset;
>> +			if (previous_name) {
>> +				strbuf_addbuf(&p->previous_name_buf, previous_name);
>> +				p->previous_name = &p->previous_name_buf;
>> +			}
>> +
>> +			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
>> +				die("unable to create load_cache_entries_thread");
>> +			if (++thread == threads || p->nr != thread_nr)
>> +				break;
>> +		}
>> +
>> +		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
>> +
>> +		/* On-disk flags are just 16 bits */
>> +		flags = get_be16(&ondisk->flags);
>> +
>> +		if (flags & CE_EXTENDED) {
>> +			struct ondisk_cache_entry_extended *ondisk2;
>> +			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
>> +			name = ondisk2->name;
>> +		} else
>> +			name = ondisk->name;
>> +
>> +		if (!previous_name) {
>> +			size_t len;
>> +
>> +			/* v3 and earlier */
>> +			len = flags & CE_NAMEMASK;
>> +			if (len == CE_NAMEMASK)
>> +				len = strlen(name);
>> +			src_offset += (flags & CE_EXTENDED) ?
>> +				ondisk_cache_entry_extended_size(len) :
>> +				ondisk_cache_entry_size(len);
>> +		} else
>> +			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
> 
> Nice to see this done without a new index extension that records
> offsets, so that we can load existing index files in parallel.
> 

Yes, I prefer this simpler model as well.  I wasn't sure it would 
produce a significant improvement given the primary thread still has to 
run through the variable length cache entries but was pleasantly surprised.

The recent mem_pool changes really helped as well as it removed all 
thread contention in the heap that was happening before.

>> +	}
>> +
>> +	for (i = 0; i < threads; i++) {
>> +		struct load_cache_entries_thread_data *p = data + i;
>> +		if (pthread_join(p->pthread, NULL))
>> +			die("unable to join load_cache_entries_thread");
>> +		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
>> +		strbuf_release(&p->previous_name_buf);
>> +		consumed += p->consumed;
>> +	}
>> +
>> +	free(data);
>> +	strbuf_release(&previous_name_buf);
>> +
>> +	return consumed;
>> +}
>> +
>> +#endif

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 20:33   ` Ben Peart
@ 2018-08-24 15:37     ` Duy Nguyen
  2018-08-24 15:57       ` Duy Nguyen
  2018-08-24 18:20       ` [PATCH v1] read-cache: speed up index load through parallelization Duy Nguyen
  0 siblings, 2 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-08-24 15:37 UTC (permalink / raw)
  To: Ben Peart; +Cc: Junio C Hamano, Ben Peart, Git Mailing List

Since we're cutting corners to speed things up, could you try
something like this?

I notice that reading v4 is significantly slower than v2 and
apparently strlen() (at least from glibc) is much cleverer and at
least gives me a few percentage time saving.

diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..d10cccaed0 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1755,8 +1755,7 @@ static unsigned long expand_name_field(struct
strbuf *name, const char *cp_)
        if (name->len < len)
                die("malformed name field in the index");
        strbuf_remove(name, name->len - len, len);
-       for (ep = cp; *ep; ep++)
-               ; /* find the end */
+       ep = cp + strlen(cp);
        strbuf_add(name, cp, ep - cp);
        return (const char *)ep + 1 - cp_;
 }

On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
> > Nice to see this done without a new index extension that records
> > offsets, so that we can load existing index files in parallel.
> >
>
> Yes, I prefer this simpler model as well.  I wasn't sure it would
> produce a significant improvement given the primary thread still has to
> run through the variable length cache entries but was pleasantly surprised.

Out of curiosity, how much time saving could we gain by recording
offsets as an extension (I assume we need, like 4 offsets if the
system has 4 cores)? Much much more than this simpler model (which may
justify the complexity) or just "meh" compared to this?
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 15:37     ` Duy Nguyen
@ 2018-08-24 15:57       ` Duy Nguyen
  2018-08-24 17:28         ` Ben Peart
  2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
  2018-08-24 18:20       ` [PATCH v1] read-cache: speed up index load through parallelization Duy Nguyen
  1 sibling, 2 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-08-24 15:57 UTC (permalink / raw)
  To: Ben Peart; +Cc: Junio C Hamano, Ben Peart, Git Mailing List

On Fri, Aug 24, 2018 at 05:37:20PM +0200, Duy Nguyen wrote:
> Since we're cutting corners to speed things up, could you try
> something like this?
> 
> I notice that reading v4 is significantly slower than v2 and
> apparently strlen() (at least from glibc) is much cleverer and at
> least gives me a few percentage time saving.
> 
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..d10cccaed0 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1755,8 +1755,7 @@ static unsigned long expand_name_field(struct
> strbuf *name, const char *cp_)
>         if (name->len < len)
>                 die("malformed name field in the index");
>         strbuf_remove(name, name->len - len, len);
> -       for (ep = cp; *ep; ep++)
> -               ; /* find the end */
> +       ep = cp + strlen(cp);
>         strbuf_add(name, cp, ep - cp);
>         return (const char *)ep + 1 - cp_;
>  }

No, try this instead. It's halfway back to v2 numbers for me (tested
with "test-tool read-cache 100" on webkit.git). For the record, v4 is
about 30% slower than v2 in my tests.

We could probably do better too. Instead of preparing the string in a
separate buffer (previous_name_buf), we could just assemble it directly
to the newly allocated "ce".

-- 8< --
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..237f60a76c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1754,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen(cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
-- 8< --

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 15:57       ` Duy Nguyen
@ 2018-08-24 17:28         ` Ben Peart
  2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
  1 sibling, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-24 17:28 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Ben Peart, Git Mailing List



On 8/24/2018 11:57 AM, Duy Nguyen wrote:
> On Fri, Aug 24, 2018 at 05:37:20PM +0200, Duy Nguyen wrote:
>> Since we're cutting corners to speed things up, could you try
>> something like this?
>>
>> I notice that reading v4 is significantly slower than v2 and
>> apparently strlen() (at least from glibc) is much cleverer and at
>> least gives me a few percentage time saving.
>>
>> diff --git a/read-cache.c b/read-cache.c
>> index 7b1354d759..d10cccaed0 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -1755,8 +1755,7 @@ static unsigned long expand_name_field(struct
>> strbuf *name, const char *cp_)
>>          if (name->len < len)
>>                  die("malformed name field in the index");
>>          strbuf_remove(name, name->len - len, len);
>> -       for (ep = cp; *ep; ep++)
>> -               ; /* find the end */
>> +       ep = cp + strlen(cp);
>>          strbuf_add(name, cp, ep - cp);
>>          return (const char *)ep + 1 - cp_;
>>   }
> 
> No try this instead. It's half way back to v2 numbers for me (tested
> with "test-tool read-cache 100" on webkit.git). For the record, v4 is
> about 30% slower than v2 in my tests.
> 

Thanks Duy, this helped on my system as well.

Interestingly, simply reading the cache tree extension in read_one() now 
takes about double the CPU on the primary thread as does 
load_cache_entries().

Hmm, that gives me an idea.  I could kick off another thread to load 
that extension in parallel and cut off another ~160 ms.  I'll add that 
to my list of future patches to investigate...

> We could probably do better too. Instead of preparing the string in a
> separate buffer (previous_name_buf), we could just assemble it directly
> to the newly allocated "ce".
> 
> -- 8< --
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..237f60a76c 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1754,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
>   
>   	if (name->len < len)
>   		die("malformed name field in the index");
> -	strbuf_remove(name, name->len - len, len);
> -	for (ep = cp; *ep; ep++)
> -		; /* find the end */
> +	strbuf_setlen(name, name->len - len);
> +	ep = cp + strlen(cp);
>   	strbuf_add(name, cp, ep - cp);
>   	return (const char *)ep + 1 - cp_;
>   }
> -- 8< --
> 

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 15:37     ` Duy Nguyen
  2018-08-24 15:57       ` Duy Nguyen
@ 2018-08-24 18:20       ` Duy Nguyen
  2018-08-24 18:40         ` Ben Peart
  1 sibling, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-08-24 18:20 UTC (permalink / raw)
  To: Ben Peart; +Cc: Junio C Hamano, Ben Peart, Git Mailing List

On Fri, Aug 24, 2018 at 5:37 PM Duy Nguyen <pclouds@gmail.com> wrote:
> On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
> > > Nice to see this done without a new index extension that records
> > > offsets, so that we can load existing index files in parallel.
> > >
> >
> > Yes, I prefer this simpler model as well.  I wasn't sure it would
> > produce a significant improvement given the primary thread still has to
> > run through the variable length cache entries but was pleasantly surprised.
>
> Out of curiosity, how much time saving could we gain by recording
> offsets as an extension (I assume we need, like 4 offsets if the
> system has 4 cores)? Much much more than this simpler model (which may
> justify the complexity) or just "meh" compared to this?

To answer my own question, I ran a patched git to precalculate
individual thread parameters, removed the scheduler code and hard
coded these parameters (I ran just 4 threads, one per core). I got
0m2.949s (webkit.git, 275k files, 100 read-cache runs). Compared to
0m4.996s from Ben's patch (same test settings of course) I think it's
definitely worth adding some extra complexity.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-23 17:31 ` Stefan Beller
  2018-08-23 19:44   ` Ben Peart
@ 2018-08-24 18:40   ` Duy Nguyen
  2018-08-28 14:53     ` Ben Peart
  1 sibling, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-08-24 18:40 UTC (permalink / raw)
  To: Stefan Beller; +Cc: Ben Peart, Git Mailing List, Junio C Hamano

On Thu, Aug 23, 2018 at 7:33 PM Stefan Beller <sbeller@google.com> wrote:
> > +core.fastIndex::
> > +       Enable parallel index loading
> > ++
> > +This can speed up operations like 'git diff' and 'git status' especially
> > +when the index is very large.  When enabled, Git will do the index
> > +loading from the on disk format to the in-memory format in parallel.
> > +Defaults to true.
>
> "fast" is a non-descriptive word as we try to be fast in any operation?
> Maybe core.parallelIndexReading as that just describes what it
> turns on/off, without second guessing its effects?

Another option is index.threads (the "index" section currently only
has one item, index.version). The value could be the same as
grep.threads or pack.threads.

(and if you're thinking about parallelizing write as well but it
should be tuned differently, then perhaps index.readThreads, but I
don't think we need to go that far)
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 18:20       ` [PATCH v1] read-cache: speed up index load through parallelization Duy Nguyen
@ 2018-08-24 18:40         ` Ben Peart
  2018-08-24 19:00           ` Duy Nguyen
  0 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-08-24 18:40 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Ben Peart, Git Mailing List



On 8/24/2018 2:20 PM, Duy Nguyen wrote:
> On Fri, Aug 24, 2018 at 5:37 PM Duy Nguyen <pclouds@gmail.com> wrote:
>> On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
>>>> Nice to see this done without a new index extension that records
>>>> offsets, so that we can load existing index files in parallel.
>>>>
>>>
>>> Yes, I prefer this simpler model as well.  I wasn't sure it would
>>> produce a significant improvement given the primary thread still has to
>>> run through the variable length cache entries but was pleasantly surprised.
>>
>> Out of curiosity, how much time saving could we gain by recording
>> offsets as an extension (I assume we need, like 4 offsets if the
>> system has 4 cores)? Much much more than this simpler model (which may
>> justify the complexity) or just "meh" compared to this?
> 
> To answer my own question, I ran a patched git to precalculate
> individual thread parameters, removed the scheduler code and hard
> coded these parameters (I ran just 4 threads, one per core). I got
> 0m2.949s (webkit.git, 275k files, 100 read-cache runs). Compared to
> 0m4.996s from Ben's patch (same test settings of course) I think it's
> definitely worth adding some extra complexity.
> 

I took a run at doing that last year [1] but that was before the 
mem_pool work that allowed us to avoid the thread contention on the heap 
so the numbers aren't an apples to apples comparison (they would be 
better today).

The trade-off is the additional complexity to be able to load the index 
extension without having to parse through all the variable length cache 
entries.  My patch worked but there was feedback requested to make it 
more generic and robust that I haven't gotten around to yet.

This patch series went for simplicity over absolutely the best possible 
performance.

[1] 
https://public-inbox.org/git/20171109141737.47976-1-benpeart@microsoft.com/

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 18:40         ` Ben Peart
@ 2018-08-24 19:00           ` Duy Nguyen
  2018-08-24 19:57             ` Ben Peart
  0 siblings, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-08-24 19:00 UTC (permalink / raw)
  To: Ben Peart; +Cc: Junio C Hamano, Ben Peart, Git Mailing List

On Fri, Aug 24, 2018 at 8:40 PM Ben Peart <peartben@gmail.com> wrote:
>
>
>
> On 8/24/2018 2:20 PM, Duy Nguyen wrote:
> > On Fri, Aug 24, 2018 at 5:37 PM Duy Nguyen <pclouds@gmail.com> wrote:
> >> On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
> >>>> Nice to see this done without a new index extension that records
> >>>> offsets, so that we can load existing index files in parallel.
> >>>>
> >>>
> >>> Yes, I prefer this simpler model as well.  I wasn't sure it would
> >>> produce a significant improvement given the primary thread still has to
> >>> run through the variable length cache entries but was pleasantly surprised.
> >>
> >> Out of curiosity, how much time saving could we gain by recording
> >> offsets as an extension (I assume we need, like 4 offsets if the
> >> system has 4 cores)? Much much more than this simpler model (which may
> >> justify the complexity) or just "meh" compared to this?
> >
> > To answer my own question, I ran a patched git to precalculate
> > individual thread parameters, removed the scheduler code and hard
> > coded these parameters (I ran just 4 threads, one per core). I got
> > 0m2.949s (webkit.git, 275k files, 100 read-cache runs). Compared to
> > 0m4.996s from Ben's patch (same test settings of course) I think it's
> > definitely worth adding some extra complexity.
> >
>
> I took a run at doing that last year [1] but that was before the
> mem_pool work that allowed us to avoid the thread contention on the heap
> so the numbers aren't an apples to apples comparison (they would be
> better today).

Ah.. sorry I was not aware. A big chunk of 2017 is blank to me when it
comes to git.

> The trade-off is the additional complexity to be able to load the index
> extension without having to parse through all the variable length cache
> entries.  My patch worked but there was feedback requested to make it
> more generic and robust that I haven't gotten around to yet.

One more comment. Instead of forcing this special index at the bottom,
add a generic one that gives positions of all extensions and put that
one at the bottom. Then you can still quickly locate your offset table
extension, and you could load UNTR and TREE extensions in parallel too
(those scale up to worktree size)

> This patch series went for simplicity over absolutely the best possible
> performance.

Well, you know my stance on this now :) Not that it really matters.

> [1]
> https://public-inbox.org/git/20171109141737.47976-1-benpeart@microsoft.com/

PS. I still think it's worth bring v4's performance back to v2. It's
low hanging fruit because I'm pretty sure Junio did not add v4 code
with cpu performance in mind. It was about file size at that time and
cpu consumption was still dwarfed by hashing.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 19:00           ` Duy Nguyen
@ 2018-08-24 19:57             ` Ben Peart
  0 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-24 19:57 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Ben Peart, Git Mailing List



On 8/24/2018 3:00 PM, Duy Nguyen wrote:
> On Fri, Aug 24, 2018 at 8:40 PM Ben Peart <peartben@gmail.com> wrote:
>>
>>
>>
>> On 8/24/2018 2:20 PM, Duy Nguyen wrote:
>>> On Fri, Aug 24, 2018 at 5:37 PM Duy Nguyen <pclouds@gmail.com> wrote:
>>>> On Thu, Aug 23, 2018 at 10:36 PM Ben Peart <peartben@gmail.com> wrote:
>>>>>> Nice to see this done without a new index extension that records
>>>>>> offsets, so that we can load existing index files in parallel.
>>>>>>
>>>>>
>>>>> Yes, I prefer this simpler model as well.  I wasn't sure it would
>>>>> produce a significant improvement given the primary thread still has to
>>>>> run through the variable length cache entries but was pleasantly surprised.
>>>>
>>>> Out of curiosity, how much time saving could we gain by recording
>>>> offsets as an extension (I assume we need, like 4 offsets if the
>>>> system has 4 cores)? Much much more than this simpler model (which may
>>>> justify the complexity) or just "meh" compared to this?
>>>
>>> To answer my own question, I ran a patched git to precalculate
>>> individual thread parameters, removed the scheduler code and hard
>>> coded these parameters (I ran just 4 threads, one per core). I got
>>> 0m2.949s (webkit.git, 275k files, 100 read-cache runs). Compared to
>>> 0m4.996s from Ben's patch (same test settings of course) I think it's
>>> definitely worth adding some extra complexity.
>>>
>>
>> I took a run at doing that last year [1] but that was before the
>> mem_pool work that allowed us to avoid the thread contention on the heap
>> so the numbers aren't an apples to apples comparison (they would be
>> better today).
> 
> Ah.. sorry I was not aware. A big chunk of 2017 is blank to me when it
> comes to git.
> 
>> The trade-off is the additional complexity to be able to load the index
>> extension without having to parse through all the variable length cache
>> entries.  My patch worked but there was feedback requested to make it
>> more generic and robust that I haven't gotten around to yet.
> 
> One more comment. Instead of forcing this special index at the bottom,
> add a generic one that gives positions of all extensions and put that
> one at the bottom. Then you can still quickly locate your offset table
> extension, and you could load UNTR and TREE extensions in parallel too
> (those scale up to worktree size)
> 

That is pretty much what Junio's feedback was and what I was referring 
to as making it "more generic."  The "more robust" was the request to 
add a SHA to the extension ensure it wasn't corrupt and was a valid 
extension.

>> This patch series went for simplicity over absolutely the best possible
>> performance.
> 
> Well, you know my stance on this now :) Not that it really matters.
> 
>> [1]
>> https://public-inbox.org/git/20171109141737.47976-1-benpeart@microsoft.com/
> 
> PS. I still think it's worth bring v4's performance back to v2. It's
> low hanging fruit because I'm pretty sure Junio did not add v4 code
> with cpu performance in mind. It was about file size at that time and
> cpu consumption was still dwarfed by hashing.
> 

I see that as a nice follow up patch.  If the extension exists, use it 
and jump directly to the blocks and spin up threads.  If it doesn't 
exist, fall back to the code in this patch that has to find/compute the 
blocks on the fly.


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH] read-cache.c: optimize reading index format v4
  2018-08-24 15:57       ` Duy Nguyen
  2018-08-24 17:28         ` Ben Peart
@ 2018-08-25  6:44         ` Nguyễn Thái Ngọc Duy
  2018-08-27 19:36           ` Junio C Hamano
  2018-09-02 13:19           ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
  1 sibling, 2 replies; 87+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-08-25  6:44 UTC (permalink / raw)
  To: pclouds; +Cc: Ben.Peart, git, gitster, peartben

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in 
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

PS. I notice that v4 does not pad to align entries at 4 byte boundary
like v2/v3. This could cause a slight slow down on x86 and segfault on
some other platforms. We need to fix this in v5 when we introduce
SHA-256 support in the index.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 124 +++++++++++++++++++++++----------------------------
 1 file changed, 56 insertions(+), 68 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..5c04c8f200 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,63 +1713,16 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
-/*
- * Adjacent cache entries tend to share the leading paths, so it makes
- * sense to only store the differences in later entries.  In the v4
- * on-disk format of the index, each on-disk cache entry stores the
- * number of bytes to be stripped from the end of the previous name,
- * and the bytes to append to the result, to come up with its name.
- */
-static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
-{
-	const unsigned char *ep, *cp = (const unsigned char *)cp_;
-	size_t len = decode_varint(&cp);
-
-	if (name->len < len)
-		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
-	strbuf_add(name, cp, ep - cp);
-	return (const char *)ep + 1 - cp_;
-}
-
 static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t strip_len;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1782,28 +1735,61 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 		extended_flags = get_be16(&ondisk2->flags2) << 16;
 		/* We do not yet understand any bit out of CE_EXTENDED_FLAGS */
 		if (extended_flags & ~CE_EXTENDED_FLAGS)
-			die("Unknown index entry format %08x", extended_flags);
+			die(_("unknown index entry format %08x"), extended_flags);
 		flags |= extended_flags;
 		name = ondisk2->name;
 	}
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	if (previous_ce) {
+		const unsigned char *cp = (const unsigned char *)name;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		strip_len = decode_varint(&cp);
+		if (previous_ce->ce_namelen < strip_len)
+			die(_("malformed name field in the index, path '%s'"),
+			    previous_ce->name);
+		name = (const char *)cp;
+	}
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (previous_ce)
+			len += previous_ce->ce_namelen - strip_len;
+	}
+
+	ce = mem_pool__ce_alloc(mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
+
+	if (previous_ce) {
+		size_t copy_len = previous_ce->ce_namelen - strip_len;
+		memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1898,7 +1884,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	const struct cache_entry *previous_ce = NULL;
+	struct cache_entry *dummy_entry = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,11 +1923,10 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
+		previous_ce = dummy_entry = make_empty_transient_cache_entry(0);
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
@@ -1952,12 +1938,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		if (previous_ce)
+			previous_ce = ce;
 	}
-	strbuf_release(&previous_name_buf);
+	free(dummy_entry);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.19.0.rc0.337.ge906d732e7


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
@ 2018-08-27 19:36           ` Junio C Hamano
  2018-08-28 19:25             ` Duy Nguyen
  2018-09-04 16:08             ` Duy Nguyen
  2018-09-02 13:19           ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
  1 sibling, 2 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-08-27 19:36 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: Ben.Peart, git, peartben

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Running "test-tool read-cache 100" on webkit.git (275k files), reading
> v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
> time. The patch reduces read time on v4 to 4.319 seconds.

Nice.

> PS. I notice that v4 does not pad to align entries at 4 byte boundary
> like v2/v3. This could cause a slight slow down on x86 and segfault on
> some other platforms.

Care to elaborate?  

Long time ago, we used to mmap and read directly from the index file
contents, requiring either an unaligned read or padded entries.  But
that was eons ago and we first read and convert from on-disk using
get_be32() etc. to in-core structure, so I am not sure what you mean
by "segfault" here.

> @@ -1782,28 +1735,61 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
>  		extended_flags = get_be16(&ondisk2->flags2) << 16;
>  		/* We do not yet understand any bit out of CE_EXTENDED_FLAGS */
>  		if (extended_flags & ~CE_EXTENDED_FLAGS)
> -			die("Unknown index entry format %08x", extended_flags);
> +			die(_("unknown index entry format %08x"), extended_flags);

Do this as a separate preparation patch that is not controversial
and can sail through without waiting for the rest of this patch.

In other words, don't slip in unrelated changes.

> -	if (!previous_name) {
> -		/* v3 and earlier */
> -		if (len == CE_NAMEMASK)
> -			len = strlen(name);
> -		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
> +	/*
> +	 * Adjacent cache entries tend to share the leading paths, so it makes
> +	 * sense to only store the differences in later entries.  In the v4
> +	 * on-disk format of the index, each on-disk cache entry stores the
> +	 * number of bytes to be stripped from the end of the previous name,
> +	 * and the bytes to append to the result, to come up with its name.
> +	 */
> +	if (previous_ce) {
> +		const unsigned char *cp = (const unsigned char *)name;
>  
> -		*ent_size = ondisk_ce_size(ce);
> -	} else {
> -		unsigned long consumed;
> -		consumed = expand_name_field(previous_name, name);
> -		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
> -					     previous_name->buf,
> -					     previous_name->len);
> +		strip_len = decode_varint(&cp);
> +		if (previous_ce->ce_namelen < strip_len)
> +			die(_("malformed name field in the index, path '%s'"),
> +			    previous_ce->name);

The message is misleading; the previous is not the problematic one,
but the one that comes after it is.  Perhaps s/, path/, near path/
or something.

> +		name = (const char *)cp;
> +	}
>  
> -		*ent_size = (name - ((char *)ondisk)) + consumed;
> +	if (len == CE_NAMEMASK) {
> +		len = strlen(name);
> +		if (previous_ce)
> +			len += previous_ce->ce_namelen - strip_len;

Nicely done.  If the result fits in that 12-bit truncated name, then
it is full so we do not need to adjust for strip.  Otherwise, we
know the length of this name is the sum of the part that is shared
with the previous one and the part that is unique to this one.

> +	}
> +
> +	ce = mem_pool__ce_alloc(mem_pool, len);
> +	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
> +	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
> +	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
> +	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
> +	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
> +	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
> +	ce->ce_mode  = get_be32(&ondisk->mode);
> +	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
> +	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
> +	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
> +	ce->ce_flags = flags & ~CE_NAMEMASK;
> +	ce->ce_namelen = len;
> +	ce->index = 0;
> +	hashcpy(ce->oid.hash, ondisk->sha1);

Again, nice.  Now two callsites (both in this function) that call
cache_entry_from_ondisk() with slightly different parameters are
unified, there is no strong reason to have it as a single caller
helper function.

> @@ -1898,7 +1884,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  	struct cache_header *hdr;
>  	void *mmap;
>  	size_t mmap_size;
> -	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	const struct cache_entry *previous_ce = NULL;
> +	struct cache_entry *dummy_entry = NULL;
>  
>  	if (istate->initialized)
>  		return istate->cache_nr;
> @@ -1936,11 +1923,10 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  	istate->initialized = 1;
>  
>  	if (istate->version == 4) {
> -		previous_name = &previous_name_buf;
> +		previous_ce = dummy_entry = make_empty_transient_cache_entry(0);

I do like the idea of passing the previous ce around to tell the
next one what the previous name was, but I would have preferred to
see this done a bit more cleanly without requiring us to support "a
dummy entry with name whose length is 0"; a real cache entry never
has zero-length name, and our code may want to enforce it as a
sanity check.

I think we can just call create_from_disk() with NULL set to
previous_ce in the first round; of course, the logic to assign the
one we just created to previous_ce must check istate->version,
instead of "is previous_ce NULL?" (which is an indirect way to check
the same thing used in this patch).

Other than that, looks quite nice.


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v1] read-cache: speed up index load through parallelization
  2018-08-24 18:40   ` Duy Nguyen
@ 2018-08-28 14:53     ` Ben Peart
  0 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-28 14:53 UTC (permalink / raw)
  To: Duy Nguyen, Stefan Beller; +Cc: Ben Peart, Git Mailing List, Junio C Hamano



On 8/24/2018 2:40 PM, Duy Nguyen wrote:
> On Thu, Aug 23, 2018 at 7:33 PM Stefan Beller <sbeller@google.com> wrote:
>>> +core.fastIndex::
>>> +       Enable parallel index loading
>>> ++
>>> +This can speed up operations like 'git diff' and 'git status' especially
>>> +when the index is very large.  When enabled, Git will do the index
>>> +loading from the on disk format to the in-memory format in parallel.
>>> +Defaults to true.
>> "fast" is a non-descriptive word as we try to be fast in any operation?
>> Maybe core.parallelIndexReading as that just describes what it
>> turns on/off, without second guessing its effects?
> Another option is index.threads (the "index" section currently only
> has one item, index.version). The value could be the same as
> grep.threads or pack.threads.
>
> (and if you're thinking about parallelizing write as well but it
> should be tuned differently, then perhaps index.readThreads, but I
> don't think we need to go that far)

I like that.  I'll switch to index.threads and make 'true' or '0' mean 
"automatically determine the number of threads to use" similar to 
pack.threads.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-27 19:36           ` Junio C Hamano
@ 2018-08-28 19:25             ` Duy Nguyen
  2018-08-28 23:54               ` Ben Peart
  2018-08-29 17:14               ` Junio C Hamano
  2018-09-04 16:08             ` Duy Nguyen
  1 sibling, 2 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-08-28 19:25 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben Peart, Git Mailing List, Ben Peart

On Mon, Aug 27, 2018 at 9:36 PM Junio C Hamano <gitster@pobox.com> wrote:
> > PS. I notice that v4 does not pad to align entries at 4 byte boundary
> > like v2/v3. This could cause a slight slow down on x86 and segfault on
> > some other platforms.
>
> Care to elaborate?
>
> Long time ago, we used to mmap and read directly from the index file
> contents, requiring either an unaligned read or padded entries.  But
> that was eons ago and we first read and convert from on-disk using
> get_be32() etc. to in-core structure, so I am not sure what you mean
> by "segfault" here.
>

My bad. I saw this line

#define get_be16(p) ntohs(*(unsigned short *)(p))

and jumped to conclusion without realizing that block is for safe
unaligned access.

> > @@ -1898,7 +1884,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
> >       struct cache_header *hdr;
> >       void *mmap;
> >       size_t mmap_size;
> > -     struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> > +     const struct cache_entry *previous_ce = NULL;
> > +     struct cache_entry *dummy_entry = NULL;
> >
> >       if (istate->initialized)
> >               return istate->cache_nr;
> > @@ -1936,11 +1923,10 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
> >       istate->initialized = 1;
> >
> >       if (istate->version == 4) {
> > -             previous_name = &previous_name_buf;
> > +             previous_ce = dummy_entry = make_empty_transient_cache_entry(0);
>
> I do like the idea of passing the previous ce around to tell the
> next one what the previous name was, but I would have preferred to
> see this done a bit more cleanly without requiring us to support "a
> dummy entry with name whose length is 0"; a real cache entry never
> has zero-length name, and our code may want to enforce it as a
> sanity check.
>
> I think we can just call create_from_disk() with NULL set to
> previous_ce in the first round; of course, the logic to assign the
> one we just created to previous_ce must check istate->version,
> instead of "is previous_ce NULL?" (which is an indirect way to check
> the same thing used in this patch).

Yeah I kinda hated dummy_entry too but the feeling wasn't strong
enough to move towards the index->version check. I guess I'm going to
do it now.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-28 19:25             ` Duy Nguyen
@ 2018-08-28 23:54               ` Ben Peart
  2018-08-29 17:14               ` Junio C Hamano
  1 sibling, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-28 23:54 UTC (permalink / raw)
  To: Duy Nguyen, Junio C Hamano; +Cc: Ben Peart, Git Mailing List



On 8/28/2018 3:25 PM, Duy Nguyen wrote:
> On Mon, Aug 27, 2018 at 9:36 PM Junio C Hamano <gitster@pobox.com> wrote:
>>> PS. I notice that v4 does not pad to align entries at 4 byte boundary
>>> like v2/v3. This could cause a slight slow down on x86 and segfault on
>>> some other platforms.
>>
>> Care to elaborate?
>>
>> Long time ago, we used to mmap and read directly from the index file
>> contents, requiring either an unaligned read or padded entries.  But
>> that was eons ago and we first read and convert from on-disk using
>> get_be32() etc. to in-core structure, so I am not sure what you mean
>> by "segfault" here.
>>
> 
> My bad. I saw this line
> 
> #define get_be16(p) ntohs(*(unsigned short *)(p))
> 
> and jumped to conclusion without realizing that block is for safe
> unaligned access.
> 
>>> @@ -1898,7 +1884,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>>        struct cache_header *hdr;
>>>        void *mmap;
>>>        size_t mmap_size;
>>> -     struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>>> +     const struct cache_entry *previous_ce = NULL;
>>> +     struct cache_entry *dummy_entry = NULL;
>>>
>>>        if (istate->initialized)
>>>                return istate->cache_nr;
>>> @@ -1936,11 +1923,10 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>>        istate->initialized = 1;
>>>
>>>        if (istate->version == 4) {
>>> -             previous_name = &previous_name_buf;
>>> +             previous_ce = dummy_entry = make_empty_transient_cache_entry(0);
>>
>> I do like the idea of passing the previous ce around to tell the
>> next one what the previous name was, but I would have preferred to
>> see this done a bit more cleanly without requiring us to support "a
>> dummy entry with name whose length is 0"; a real cache entry never
>> has zero-length name, and our code may want to enforce it as a
>> sanity check.
>>
>> I think we can just call create_from_disk() with NULL set to
>> previous_ce in the first round; of course, the logic to assign the
>> one we just created to previous_ce must check istate->version,
>> instead of "is previous_ce NULL?" (which is an indirect way to check
>> the same thing used in this patch).
> 
> Yeah I kinda hated dummy_entry too but the feeling wasn't strong
> enough to move towards the index->version check. I guess I'm going to
> do it now.
> 

I ran some perf tests using p0002-read-cache.sh to compare V4 
performance before and after this patch so I could get a feel for how 
much it helps.

100,000 files

Test                                  HEAD~1   HEAD
------------------------------------------------------------
read_cache/discard_cache 1000 times    14.12    10.75 -23.9%

1,000,000 files

Test                                  HEAD~1   HEAD
------------------------------------------------------------
read_cache/discard_cache 1000 times   202.81   170.33 -16.0%


This provides a nice speedup and IMO simplifies the code as well. 
Nicely done.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v2 0/3] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
  2018-08-23 17:31 ` Stefan Beller
  2018-08-23 18:06 ` Junio C Hamano
@ 2018-08-29 15:25 ` " Ben Peart
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
                     ` (2 more replies)
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
                   ` (2 subsequent siblings)
  5 siblings, 3 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-29 15:25 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

The big changes in this iteration are:

- Switched to index.threads to provide control over the use of threading

- Added another worker thread to load the index extensions in parallel

- Applied optimization expand_name_field() suggested by Duy

The net result of these optimizations is a savings of 25.8% (1,000,000 files)
to 38.1% (100,000 files) as measured by p0002-read-cache.sh.

This patch conflicts with Duy's patch to remove the double memory copy and
pass in the previous ce instead.  The two will need to be merged/reconciled
once they settle down a bit.


Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/39f2b0f5fe
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v2 && git checkout 39f2b0f5fe


### Interdiff (v1..v2):

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 3344685cc4..79f8296d9c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -899,14 +899,6 @@ relatively high IO latencies.  When enabled, Git will do the
 index comparison to the filesystem data in parallel, allowing
 overlapping IO's.  Defaults to true.
 
-core.fastIndex::
-       Enable parallel index loading
-+
-This can speed up operations like 'git diff' and 'git status' especially
-when the index is very large.  When enabled, Git will do the index
-loading from the on disk format to the in-memory format in parallel.
-Defaults to true.
-
 core.createObject::
 	You can set this to 'link', in which case a hardlink followed by
 	a delete of the source are used to make sure that object creation
@@ -2399,6 +2391,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPU's and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 883092fdd3..3bda124550 100644
--- a/config.c
+++ b/config.c
@@ -2289,17 +2289,18 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
-int git_config_get_fast_index(void)
+int git_config_get_index_threads(void)
 {
-	int val;
+	int is_bool, val;
 
-	if (!git_config_get_maybe_bool("core.fastindex", &val))
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
 			return val;
+	}
 
-	if (getenv("GIT_FASTINDEX_TEST"))
-		return 1;
-
-	return -1; /* default value */
+	return 0; /* auto-detect */
 }
 
 NORETURN
diff --git a/config.h b/config.h
index 74ca4e7db5..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,7 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
-extern int git_config_get_fast_index(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 0fa7e1a04c..f5e7c86c42 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -24,10 +24,6 @@
 #include "utf8.h"
 #include "fsmonitor.h"
 
-#ifndef min
-#define min(a,b) (((a) < (b)) ? (a) : (b))
-#endif
-
 /* Mask for the name length in ce_flags in the on-disk index */
 
 #define CE_NAMEMASK  (0x0fff)
@@ -1758,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
@@ -1893,7 +1888,13 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
-static unsigned long load_cache_entry_block(struct index_state *istate, struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap, unsigned long start_offset, struct strbuf *previous_name)
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
 {
 	int i;
 	unsigned long src_offset = start_offset;
@@ -1912,7 +1913,8 @@ static unsigned long load_cache_entry_block(struct index_state *istate, struct m
 	return src_offset - start_offset;
 }
 
-static unsigned long load_all_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	unsigned long consumed;
@@ -1927,7 +1929,8 @@ static unsigned long load_all_cache_entries(struct index_state *istate, void *mm
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
 
-	consumed = load_cache_entry_block(istate, istate->ce_mem_pool, 0, istate->cache_nr, mmap, src_offset, previous_name);
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
 	strbuf_release(&previous_name_buf);
 	return consumed;
 }
@@ -1955,67 +1958,110 @@ struct load_cache_entries_thread_data
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
 	void *mmap;
+	size_t mmap_size;
 	unsigned long start_offset;
 	struct strbuf previous_name_buf;
 	struct strbuf *previous_name;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
-/*
-* A thread proc to run the load_cache_entries() computation
-* across multiple background threads.
-*/
 static void *load_cache_entries_thread(void *_data)
 {
 	struct load_cache_entries_thread_data *p = _data;
 
-	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool, p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static void *load_index_extensions_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+	unsigned long src_offset = p->start_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+								(const char *)p->mmap + src_offset,
+								(char *)p->mmap + src_offset + 8,
+								extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	p->consumed += src_offset - p->start_offset;
+
 	return NULL;
 }
 
-static unsigned long load_cache_entries(struct index_state *istate, void *mmap, size_t mmap_size, unsigned long src_offset)
+static unsigned long load_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_cache_entries_thread_data *data;
-	int threads, cpus, thread_nr;
+	int nr_threads, cpus, ce_per_thread;
 	unsigned long consumed;
 	int i, thread;
 
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads) {
 		cpus = online_cpus();
-	threads = istate->cache_nr / THREAD_COST;
-	if (threads > cpus)
-		threads = cpus;
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
 
 	/* enable testing with fewer than default minimum of entries */
-	if ((istate->cache_nr > 1) && (threads < 2) && getenv("GIT_FASTINDEX_TEST"))
-		threads = 2;
+	if ((istate->cache_nr > 1) && (nr_threads < 2) && git_env_bool("GIT_INDEX_THREADS_TEST", 0))
+		nr_threads = 2;
 
-	if (threads < 2 || !git_config_get_fast_index())
+	if (nr_threads < 2)
 		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		die("the name hash isn't thread safe");
+
 	mem_pool_init(&istate->ce_mem_pool, 0);
 	if (istate->version == 4)
 		previous_name = &previous_name_buf;
 	else
 		previous_name = NULL;
 
-	thread_nr = (istate->cache_nr + threads - 1) / threads;
-	data = xcalloc(threads, sizeof(struct load_cache_entries_thread_data));
+	/* allocate an extra thread for loading the index extensions */
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads + 1, sizeof(struct load_cache_entries_thread_data));
 
-	/* loop through index entries starting a thread for every thread_nr entries */
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries.
+	 */
 	consumed = thread = 0;
-	for (i = 0; ; i++) {
+	for (i = 0; i < istate->cache_nr; i++) {
 		struct ondisk_cache_entry *ondisk;
 		const char *name;
 		unsigned int flags;
 
-		/* we've reached the begining of a block of cache entries, kick off a thread to process them */
-		if (0 == i % thread_nr) {
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (0 == i % ce_per_thread) {
 			struct load_cache_entries_thread_data *p = &data[thread];
 
 			p->istate = istate;
 			p->offset = i;
-			p->nr = min(thread_nr, istate->cache_nr - i);
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
 
 			/* create a mem_pool for each thread */
 			if (istate->version == 4)
@@ -2034,8 +2080,8 @@ static unsigned long load_cache_entries(struct index_state *istate, void *mmap,
 
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
 				die("unable to create load_cache_entries_thread");
-			if (++thread == threads || p->nr != thread_nr)
-				break;
+
+			++thread;
 		}
 
 		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
@@ -2064,7 +2110,18 @@ static unsigned long load_cache_entries(struct index_state *istate, void *mmap,
 			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
 	}
 
-	for (i = 0; i < threads; i++) {
+	/* create a thread to load the index extensions */
+	struct load_cache_entries_thread_data *p = &data[thread];
+	p->istate = istate;
+	mem_pool_init(&p->ce_mem_pool, 0);
+	p->mmap = mmap;
+	p->mmap_size = mmap_size;
+	p->start_offset = src_offset;
+
+	if (pthread_create(&p->pthread, NULL, load_index_extensions_thread, p))
+		die("unable to create load_index_extensions_thread");
+
+	for (i = 0; i < nr_threads + 1; i++) {
 		struct load_cache_entries_thread_data *p = data + i;
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");


### Patches

Ben Peart (3):
  read-cache: speed up index load through parallelization
  read-cache: load cache extensions on worker thread
  read-cache: micro-optimize expand_name_field() to speed up V4 index
    parsing.

 Documentation/config.txt |   6 +
 config.c                 |  14 ++
 config.h                 |   1 +
 read-cache.c             | 281 +++++++++++++++++++++++++++++++++++----
 4 files changed, 275 insertions(+), 27 deletions(-)


base-commit: 29d9e3e2c47dd4b5053b0a98c891878d398463e3
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v2 1/3] read-cache: speed up index load through parallelization
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
@ 2018-08-29 15:25   ` " Ben Peart
  2018-08-29 17:14     ` Junio C Hamano
  2018-09-03 19:16     ` Duy Nguyen
  2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
  2018-08-29 15:25   ` [PATCH v2 3/3] read-cache: micro-optimize expand_name_field() to speed up V4 index parsing Ben Peart
  2 siblings, 2 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-29 15:25 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them. Once the
threads are complete and the cache entries are loaded, the rest of the
extensions can be loaded and processed normally on the primary thread.

I used p0002-read-cache.sh to generate some performance data:

100,000 entries

Test                                HEAD~3           HEAD~2
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.02(0.01+0.12) 9.81(0.01+0.07) -30.0%

1,000,000 entries

Test                                HEAD~3            HEAD~2
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.06(0.06+0.09) 155.72(0.03+0.06) -22.9%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |   6 +
 config.c                 |  14 +++
 config.h                 |   1 +
 read-cache.c             | 240 +++++++++++++++++++++++++++++++++++----
 4 files changed, 237 insertions(+), 24 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1c42364988..79f8296d9c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2391,6 +2391,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPU's and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 9a0b10d4bc..3bda124550 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,20 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+int git_config_get_index_threads(void)
+{
+	int is_bool, val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto-detect */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..c30346388a 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1889,16 +1889,229 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifdef NO_PTHREADS
+
+#define load_cache_entries load_all_cache_entries
+
+#else
+
+#include "thread-utils.h"
+
+/*
+* Mostly randomly chosen maximum thread counts: we
+* cap the parallelism to online_cpus() threads, and we want
+* to have at least 7500 cache entries per thread for it to
+* be worth starting a thread.
+*/
+#define THREAD_COST		(7500)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+* A thread proc to run the load_cache_entries() computation
+* across multiple background threads.
+*/
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int nr_threads, cpus, ce_per_thread;
+	unsigned long consumed;
+	int i, thread;
+
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads) {
+		cpus = online_cpus();
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
+
+	/* enable testing with fewer than default minimum of entries */
+	if ((istate->cache_nr > 1) && (nr_threads < 2) && git_env_bool("GIT_INDEX_THREADS_TEST", 0))
+		nr_threads = 2;
+
+	if (nr_threads < 2)
+		return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		die("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries. Exit the loop when we've created the final thread (no need
+	 * to parse the remaining entries.
+	 */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (0 == i % ce_per_thread) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+						  estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+						  estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+
+			/* exit the loop when we've created the last thread */
+			if (++thread == nr_threads)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1935,29 +2148,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
 	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
+	src_offset += load_cache_entries(istate, mmap, mmap_size, src_offset);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
@ 2018-08-29 15:25   ` Ben Peart
  2018-08-29 17:12     ` Junio C Hamano
  2018-09-03 19:21     ` Duy Nguyen
  2018-08-29 15:25   ` [PATCH v2 3/3] read-cache: micro-optimize expand_name_field() to speed up V4 index parsing Ben Peart
  2 siblings, 2 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-29 15:25 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data on the
cumulative impact:

100,000 entries

Test                                HEAD~3           HEAD~2
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

1,000,000 entries

Test                                HEAD~3            HEAD~2
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 60 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 12 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index c30346388a..f768004617 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1959,16 +1959,13 @@ struct load_cache_entries_thread_data
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
 	void *mmap;
+	size_t mmap_size;
 	unsigned long start_offset;
 	struct strbuf previous_name_buf;
 	struct strbuf *previous_name;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
-/*
-* A thread proc to run the load_cache_entries() computation
-* across multiple background threads.
-*/
 static void *load_cache_entries_thread(void *_data)
 {
 	struct load_cache_entries_thread_data *p = _data;
@@ -1978,6 +1975,36 @@ static void *load_cache_entries_thread(void *_data)
 	return NULL;
 }
 
+static void *load_index_extensions_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+	unsigned long src_offset = p->start_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+								(const char *)p->mmap + src_offset,
+								(char *)p->mmap + src_offset + 8,
+								extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	p->consumed += src_offset - p->start_offset;
+
+	return NULL;
+}
+
 static unsigned long load_cache_entries(struct index_state *istate,
 			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
@@ -2012,16 +2039,16 @@ static unsigned long load_cache_entries(struct index_state *istate,
 	else
 		previous_name = NULL;
 
+	/* allocate an extra thread for loading the index extensions */
 	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
-	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+	data = xcalloc(nr_threads + 1, sizeof(struct load_cache_entries_thread_data));
 
 	/*
 	 * Loop through index entries starting a thread for every ce_per_thread
-	 * entries. Exit the loop when we've created the final thread (no need
-	 * to parse the remaining entries.
+	 * entries.
 	 */
 	consumed = thread = 0;
-	for (i = 0; ; i++) {
+	for (i = 0; i < istate->cache_nr; i++) {
 		struct ondisk_cache_entry *ondisk;
 		const char *name;
 		unsigned int flags;
@@ -2055,9 +2082,7 @@ static unsigned long load_cache_entries(struct index_state *istate,
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
 				die("unable to create load_cache_entries_thread");
 
-			/* exit the loop when we've created the last thread */
-			if (++thread == nr_threads)
-				break;
+			++thread;
 		}
 
 		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
@@ -2086,7 +2111,18 @@ static unsigned long load_cache_entries(struct index_state *istate,
 			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
 	}
 
-	for (i = 0; i < nr_threads; i++) {
+	/* create a thread to load the index extensions */
+	struct load_cache_entries_thread_data *p = &data[thread];
+	p->istate = istate;
+	mem_pool_init(&p->ce_mem_pool, 0);
+	p->mmap = mmap;
+	p->mmap_size = mmap_size;
+	p->start_offset = src_offset;
+
+	if (pthread_create(&p->pthread, NULL, load_index_extensions_thread, p))
+		die("unable to create load_index_extensions_thread");
+
+	for (i = 0; i < nr_threads + 1; i++) {
 		struct load_cache_entries_thread_data *p = data + i;
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v2 3/3] read-cache: micro-optimize expand_name_field() to speed up V4 index parsing.
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
  2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
@ 2018-08-29 15:25   ` Ben Peart
  2 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-29 15:25 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

I used p0002-read-cache.sh to generate some performance data on the
cumulative impact:

100,000 files

Test                                HEAD~3           HEAD
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.03+0.09) 8.71(0.01+0.09) -38.1%

1,000,000 files

Test                                HEAD~3            HEAD
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 201.77(0.03+0.07) 149.68(0.04+0.07) -25.8%

Suggested by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index f768004617..f5e7c86c42 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1754,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
@ 2018-08-29 17:12     ` Junio C Hamano
  2018-08-29 21:42       ` Ben Peart
  2018-09-03 19:21     ` Duy Nguyen
  1 sibling, 1 reply; 87+ messages in thread
From: Junio C Hamano @ 2018-08-29 17:12 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, pclouds

Ben Peart <Ben.Peart@microsoft.com> writes:

> This is possible because the current extensions don't access the cache
> entries in the index_state structure so are OK that they don't all exist
> yet.
>
> The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
> extensions don't even get a pointer to the index so don't have access to the
> cache entries.
>
> CACHE_EXT_LINK only uses the index_state to initialize the split index.
> CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
> update and dirty flags.

Good to see such an analysis here.  Once we define an extension
section, which requires us to have the cache entries before
populating it, this scheme would falls down, of course, but the
extension mechanism is all about protecting ourselves from the
future changes, so we'd at least need a good feel for how we read an
unknown extension from the future with the current code.  Perhaps
just like the main cache entries were pre-scanned to apportion them
to worker threads, we can pre-scan the sections and compare them
with a white-list built into our binary before deciding that it is
safe to read them in parallel (and otherwise, we ask the last thread
for reading extensions to wait until the workers that read the main
index all return)?

> -/*
> -* A thread proc to run the load_cache_entries() computation
> -* across multiple background threads.
> -*/

This one was mis-indented (lacking SP before '*') but they are gone
so ... ;-)

> @@ -1978,6 +1975,36 @@ static void *load_cache_entries_thread(void *_data)
>  	return NULL;
>  }
>  
> +static void *load_index_extensions_thread(void *_data)
> +{
> +	struct load_cache_entries_thread_data *p = _data;
> +	unsigned long src_offset = p->start_offset;
> +
> +	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> +		/* After an array of active_nr index entries,
> +		 * there can be arbitrary number of extended
> +		 * sections, each of which is prefixed with
> +		 * extension name (4-byte) and section length
> +		 * in 4-byte network byte order.
> +		 */
> +		uint32_t extsize;
> +		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
> +		extsize = ntohl(extsize);
> +		if (read_index_extension(p->istate,
> +								(const char *)p->mmap + src_offset,
> +								(char *)p->mmap + src_offset + 8,
> +								extsize) < 0) {

Overly deep indentation.  Used a wrong tab-width?

> +	/* allocate an extra thread for loading the index extensions */
>  	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
> -	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
> +	data = xcalloc(nr_threads + 1, sizeof(struct load_cache_entries_thread_data));
>  
>  	/*
>  	 * Loop through index entries starting a thread for every ce_per_thread
> -	 * entries. Exit the loop when we've created the final thread (no need
> -	 * to parse the remaining entries.
> +	 * entries.
>  	 */

I see.  Now the pre-parsing process needs to go through all the
cache entries to find the beginning of the extensions section.

>  	consumed = thread = 0;
> -	for (i = 0; ; i++) {
> +	for (i = 0; i < istate->cache_nr; i++) {
>  		struct ondisk_cache_entry *ondisk;
>  		const char *name;
>  		unsigned int flags;
> @@ -2055,9 +2082,7 @@ static unsigned long load_cache_entries(struct index_state *istate,
>  			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
>  				die("unable to create load_cache_entries_thread");
>  
> -			/* exit the loop when we've created the last thread */
> -			if (++thread == nr_threads)
> -				break;
> +			++thread;

This is not C++, and in (void) context, the codebase always prefers
post-increment.

> @@ -2086,7 +2111,18 @@ static unsigned long load_cache_entries(struct index_state *istate,
>  			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
>  	}
>  
> -	for (i = 0; i < nr_threads; i++) {
> +	/* create a thread to load the index extensions */
> +	struct load_cache_entries_thread_data *p = &data[thread];

This probably triggers decl-after-statement.

> +	p->istate = istate;
> +	mem_pool_init(&p->ce_mem_pool, 0);
> +	p->mmap = mmap;
> +	p->mmap_size = mmap_size;
> +	p->start_offset = src_offset;
> +
> +	if (pthread_create(&p->pthread, NULL, load_index_extensions_thread, p))
> +		die("unable to create load_index_extensions_thread");
> +
> +	for (i = 0; i < nr_threads + 1; i++) {
>  		struct load_cache_entries_thread_data *p = data + i;
>  		if (pthread_join(p->pthread, NULL))
>  			die("unable to join load_cache_entries_thread");

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-28 19:25             ` Duy Nguyen
  2018-08-28 23:54               ` Ben Peart
@ 2018-08-29 17:14               ` Junio C Hamano
  1 sibling, 0 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-08-29 17:14 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> Yeah I kinda hated dummy_entry too but the feeling wasn't strong
> enough to move towards the index->version check. I guess I'm going to
> do it now.

Sounds like a plan.  Thanks again for a pleasant read.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 1/3] read-cache: speed up index load through parallelization
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
@ 2018-08-29 17:14     ` Junio C Hamano
  2018-08-29 21:35       ` Ben Peart
  2018-09-03 19:16     ` Duy Nguyen
  1 sibling, 1 reply; 87+ messages in thread
From: Junio C Hamano @ 2018-08-29 17:14 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, pclouds

Ben Peart <Ben.Peart@microsoft.com> writes:

> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 1c42364988..79f8296d9c 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -2391,6 +2391,12 @@ imap::
>  	The configuration variables in the 'imap' section are described
>  	in linkgit:git-imap-send[1].
>  
> +index.threads::
> +	Specifies the number of threads to spawn when loading the index.
> +	This is meant to reduce index load time on multiprocessor machines.
> +	Specifying 0 or 'true' will cause Git to auto-detect the number of
> +	CPU's and set the number of threads accordingly. Defaults to 'true'.

"0 or 'true' means 'auto'" made me go "Huh?"

The "Huh?"  I initially felt comes from the fact that usually 0 and
false are interchangeable, but for this particular application,
"disabling" the threading means setting the count to one (not zero),
leaving us zero as a usable "special value" to signal 'auto'.

So the end result does make sense, especially with this bit ...

> diff --git a/config.c b/config.c
> index 9a0b10d4bc..3bda124550 100644
> --- a/config.c
> +++ b/config.c
> @@ -2289,6 +2289,20 @@ int git_config_get_fsmonitor(void)
> ...
> +	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
> +		if (is_bool)
> +			return val ? 0 : 1;
> +		else
> +			return val;

... which says "'0' and 'true' are the same and yields 0, '1' and
'false' yields 1, and '2' and above will give the int".  

Adding something like

	You can disable multi-threaded code by setting this variable
	to 'false' (or 1).

may reduce the risk of a similar "Huh?" reaction by other readers.

> +struct load_cache_entries_thread_data
> +{
> +	pthread_t pthread;
> +	struct index_state *istate;
> +	struct mem_pool *ce_mem_pool;
> +	int offset, nr;
> +	void *mmap;
> +	unsigned long start_offset;
> +	struct strbuf previous_name_buf;
> +	struct strbuf *previous_name;
> +	unsigned long consumed;	/* return # of bytes in index file processed */
> +};

We saw that Duy's "let's not use strbuf to remember the previous
name but instead use the previous ce" approach gave us a nice
performance boost; I wonder if we can build on that idea here?

One possible approach might be to create one ce per "block" in the
pre-scanning thread and use that ce as the "previous one" in the
per-thread data before spawning a worker.

> +static unsigned long load_cache_entries(struct index_state *istate,
> +			void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	struct load_cache_entries_thread_data *data;
> +	int nr_threads, cpus, ce_per_thread;
> +	unsigned long consumed;
> +	int i, thread;
> +
> +	nr_threads = git_config_get_index_threads();
> +	if (!nr_threads) {
> +		cpus = online_cpus();
> +		nr_threads = istate->cache_nr / THREAD_COST;

Here, nr_threads could become 0 with a small index, but any value
below 2 makes us call load_all_cache_entries() by the main thread
(and the value of nr_threads is not used anymore), it is fine.  Of
course, forced test will set it to 2 so there is no problem, either.

OK.

> +	/* a little sanity checking */
> +	if (istate->name_hash_initialized)
> +		die("the name hash isn't thread safe");

If it is a programming error to call into this codepath without
initializing the name_hash, which I think is the case, this is
better done with BUG("").

The remainder of the patch looked good.  Thanks.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 1/3] read-cache: speed up index load through parallelization
  2018-08-29 17:14     ` Junio C Hamano
@ 2018-08-29 21:35       ` Ben Peart
  0 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-08-29 21:35 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds



On 8/29/2018 1:14 PM, Junio C Hamano wrote:
> Ben Peart <Ben.Peart@microsoft.com> writes:
> 
>> diff --git a/Documentation/config.txt b/Documentation/config.txt
>> index 1c42364988..79f8296d9c 100644
>> --- a/Documentation/config.txt
>> +++ b/Documentation/config.txt
>> @@ -2391,6 +2391,12 @@ imap::
>>   	The configuration variables in the 'imap' section are described
>>   	in linkgit:git-imap-send[1].
>>   

> Adding something like
> 
> 	You can disable multi-threaded code by setting this variable
> 	to 'false' (or 1).
> 
> may reduce the risk of a similar "Huh?" reaction by other readers.
> 

Will do

>> +struct load_cache_entries_thread_data
>> +{
>> +	pthread_t pthread;
>> +	struct index_state *istate;
>> +	struct mem_pool *ce_mem_pool;
>> +	int offset, nr;
>> +	void *mmap;
>> +	unsigned long start_offset;
>> +	struct strbuf previous_name_buf;
>> +	struct strbuf *previous_name;
>> +	unsigned long consumed;	/* return # of bytes in index file processed */
>> +};
> 
> We saw that Duy's "let's not use strbuf to remember the previous
> name but instead use the previous ce" approach gave us a nice
> performance boost; I wonder if we can build on that idea here?
> 
> One possible approach might be to create one ce per "block" in the
> pre-scanning thread and use that ce as the "previous one" in the
> per-thread data before spawning a worker.
> 

Yes, I believe this can be done.  I was planning to wait until both 
patches settled down a bit before adapting it to threads.  It's a little 
trickier because the previous ce doesn't yet exist but I believe one can 
be fabricated enough to make the optimization work.

>> +static unsigned long load_cache_entries(struct index_state *istate,
>> +			void *mmap, size_t mmap_size, unsigned long src_offset)
>> +{
>> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>> +	struct load_cache_entries_thread_data *data;
>> +	int nr_threads, cpus, ce_per_thread;
>> +	unsigned long consumed;
>> +	int i, thread;
>> +
>> +	nr_threads = git_config_get_index_threads();
>> +	if (!nr_threads) {
>> +		cpus = online_cpus();
>> +		nr_threads = istate->cache_nr / THREAD_COST;
> 
> Here, nr_threads could become 0 with a small index, but any value
> below 2 makes us call load_all_cache_entries() by the main thread
> (and the value of nr_threads is not used anymore), it is fine.  Of
> course, forced test will set it to 2 so there is no problem, either.
> 
> OK.
> 
>> +	/* a little sanity checking */
>> +	if (istate->name_hash_initialized)
>> +		die("the name hash isn't thread safe");
> 
> If it is a programming error to call into this codepath without
> initializing the name_hash, which I think is the case, this is
> better done with BUG("").
> 

Will do

> The remainder of the patch looked good.  Thanks.
> 

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 17:12     ` Junio C Hamano
@ 2018-08-29 21:42       ` Ben Peart
  2018-08-29 22:19         ` Junio C Hamano
  0 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-08-29 21:42 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds



On 8/29/2018 1:12 PM, Junio C Hamano wrote:
> Ben Peart <Ben.Peart@microsoft.com> writes:
> 
>> This is possible because the current extensions don't access the cache
>> entries in the index_state structure so are OK that they don't all exist
>> yet.
>>
>> The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
>> extensions don't even get a pointer to the index so don't have access to the
>> cache entries.
>>
>> CACHE_EXT_LINK only uses the index_state to initialize the split index.
>> CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
>> update and dirty flags.
> 
> Good to see such an analysis here.  Once we define an extension
> section, which requires us to have the cache entries before
> populating it, this scheme would fall down, of course, but the
> extension mechanism is all about protecting ourselves from the
> future changes, so we'd at least need a good feel for how we read an
> unknown extension from the future with the current code.  Perhaps
> just like the main cache entries were pre-scanned to apportion them
> to worker threads, we can pre-scan the sections and compare them
> with a white-list built into our binary before deciding that it is
> safe to read them in parallel (and otherwise, we ask the last thread
> for reading extensions to wait until the workers that read the main
> index all return)?
> 

Yes, when we add a new extension that requires the cache entries to 
exist and be parsed, we will need to add a mechanism to ensure that 
happens for that extension.  I agree a white list is probably the right 
way to deal with it.  Until we have that need, it would just add 
unnecessary complexity so I think we should wait till it is actually needed.

There isn't any change in behavior with unknown extensions and this 
patch.  If an unknown extension exists it will just get ignored and 
reported as an "unknown extension" or "die" if it is marked as "required."

I'll fix the rest of your suggestions - thanks for the close review.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 21:42       ` Ben Peart
@ 2018-08-29 22:19         ` Junio C Hamano
  0 siblings, 0 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-08-29 22:19 UTC (permalink / raw)
  To: Ben Peart; +Cc: Ben Peart, git\, pclouds\

Ben Peart <peartben@gmail.com> writes:

> There isn't any change in behavior with unknown extensions and this
> patch.  If an unknown extension exists it will just get ignored and
> reported as an "unknown extension" or "die" if it is marked as
> "required."

OK.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v2 0/1] optimize reading index format v4
  2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
  2018-08-27 19:36           ` Junio C Hamano
@ 2018-09-02 13:19           ` " Nguyễn Thái Ngọc Duy
  2018-09-02 13:19             ` [PATCH v2 1/1] read-cache.c: " Nguyễn Thái Ngọc Duy
  1 sibling, 1 reply; 87+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-09-02 13:19 UTC (permalink / raw)
  To: pclouds; +Cc: Ben.Peart, git, gitster, peartben

v2 removes unrelated changes and the dummy_entry. strip_len is also
replaced with copy_len to reduce repeated subtraction calculation.
Diff: 

diff --git a/read-cache.c b/read-cache.c
index 5c04c8f200..8628d0f3a8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,7 +1713,7 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct index_state *istate,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
 					    const struct cache_entry *previous_ce)
@@ -1722,7 +1722,15 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	size_t len;
 	const char *name;
 	unsigned int flags;
-	size_t strip_len;
+	size_t copy_len;
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	int expand_name_field = istate->version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1735,37 +1743,37 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 		extended_flags = get_be16(&ondisk2->flags2) << 16;
 		/* We do not yet understand any bit out of CE_EXTENDED_FLAGS */
 		if (extended_flags & ~CE_EXTENDED_FLAGS)
-			die(_("unknown index entry format %08x"), extended_flags);
+			die("Unknown index entry format %08x", extended_flags);
 		flags |= extended_flags;
 		name = ondisk2->name;
 	}
 	else
 		name = ondisk->name;
 
-	/*
-	 * Adjacent cache entries tend to share the leading paths, so it makes
-	 * sense to only store the differences in later entries.  In the v4
-	 * on-disk format of the index, each on-disk cache entry stores the
-	 * number of bytes to be stripped from the end of the previous name,
-	 * and the bytes to append to the result, to come up with its name.
-	 */
-	if (previous_ce) {
+	if (expand_name_field) {
 		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
 		strip_len = decode_varint(&cp);
-		if (previous_ce->ce_namelen < strip_len)
-			die(_("malformed name field in the index, path '%s'"),
-			    previous_ce->name);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
 		name = (const char *)cp;
 	}
 
 	if (len == CE_NAMEMASK) {
 		len = strlen(name);
-		if (previous_ce)
-			len += previous_ce->ce_namelen - strip_len;
+		if (expand_name_field)
+			len += copy_len;
 	}
 
-	ce = mem_pool__ce_alloc(mem_pool, len);
+	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
 
 	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
 	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
@@ -1782,9 +1790,9 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	ce->index = 0;
 	hashcpy(ce->oid.hash, ondisk->sha1);
 
-	if (previous_ce) {
-		size_t copy_len = previous_ce->ce_namelen - strip_len;
-		memcpy(ce->name, previous_ce->name, copy_len);
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
 		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
 		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
 	} else {
@@ -1885,7 +1893,6 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	void *mmap;
 	size_t mmap_size;
 	const struct cache_entry *previous_ce = NULL;
-	struct cache_entry *dummy_entry = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1923,7 +1930,6 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_ce = dummy_entry = make_empty_transient_cache_entry(0);
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
@@ -1938,14 +1944,12 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_ce);
+		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
-		if (previous_ce)
-			previous_ce = ce;
+		previous_ce = ce;
 	}
-	free(dummy_entry);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 


Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 read-cache.c | 128 ++++++++++++++++++++++++---------------------------
 1 file changed, 60 insertions(+), 68 deletions(-)

-- 
2.19.0.rc0.337.ge906d732e7


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v2 1/1] read-cache.c: optimize reading index format v4
  2018-09-02 13:19           ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
@ 2018-09-02 13:19             ` " Nguyễn Thái Ngọc Duy
  2018-09-04 18:58               ` Junio C Hamano
  2018-09-04 19:31               ` Junio C Hamano
  0 siblings, 2 replies; 87+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-09-02 13:19 UTC (permalink / raw)
  To: pclouds; +Cc: Ben.Peart, git, gitster, peartben

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 read-cache.c | 128 ++++++++++++++++++++++++---------------------------
 1 file changed, 60 insertions(+), 68 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..8628d0f3a8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1713,63 +1713,24 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
-/*
- * Adjacent cache entries tend to share the leading paths, so it makes
- * sense to only store the differences in later entries.  In the v4
- * on-disk format of the index, each on-disk cache entry stores the
- * number of bytes to be stripped from the end of the previous name,
- * and the bytes to append to the result, to come up with its name.
- */
-static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
-{
-	const unsigned char *ep, *cp = (const unsigned char *)cp_;
-	size_t len = decode_varint(&cp);
-
-	if (name->len < len)
-		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
-	strbuf_add(name, cp, ep - cp);
-	return (const char *)ep + 1 - cp_;
-}
-
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct index_state *istate,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len;
+	/*
+	 * Adjacent cache entries tend to share the leading paths, so it makes
+	 * sense to only store the differences in later entries.  In the v4
+	 * on-disk format of the index, each on-disk cache entry stores the
+	 * number of bytes to be stripped from the end of the previous name,
+	 * and the bytes to append to the result, to come up with its name.
+	 */
+	int expand_name_field = istate->version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1789,21 +1750,54 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (expand_name_field)
+			len += copy_len;
+	}
+
+	ce = mem_pool__ce_alloc(istate->ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1898,7 +1892,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	const struct cache_entry *previous_ce = NULL;
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1936,11 +1930,9 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->initialized = 1;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 			      estimate_cache_size(mmap_size, istate->cache_nr));
 	}
@@ -1952,12 +1944,12 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(istate, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.19.0.rc0.337.ge906d732e7


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 1/3] read-cache: speed up index load through parallelization
  2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
  2018-08-29 17:14     ` Junio C Hamano
@ 2018-09-03 19:16     ` Duy Nguyen
  1 sibling, 0 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-09-03 19:16 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano

On Wed, Aug 29, 2018 at 5:25 PM Ben Peart <Ben.Peart@microsoft.com> wrote:
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..c30346388a 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1889,16 +1889,229 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>         return ondisk_size + entries * per_entry;
>  }
>
> +/*
> + * A helper function that will load the specified range of cache entries
> + * from the memory mapped file and add them to the given index.
> + */
> +static unsigned long load_cache_entry_block(struct index_state *istate,
> +                       struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
> +                       unsigned long start_offset, struct strbuf *previous_name)
> +{
> +       int i;
> +       unsigned long src_offset = start_offset;
> +
> +       for (i = offset; i < offset + nr; i++) {

It may be micro optimization, but since we're looping a lot and can't
trust the compiler to optimize this, maybe just calculate this upper
limit and store it in a local variable to make it clear the upper limit
is known; there is no point in recalculating it at every iteration.

> +               struct ondisk_cache_entry *disk_ce;
> +               struct cache_entry *ce;
> +               unsigned long consumed;
> +
> +               disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +               ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
> +               set_index_entry(istate, i, ce);
> +
> +               src_offset += consumed;
> +       }
> +       return src_offset - start_offset;
> +}
> +
> +static unsigned long load_all_cache_entries(struct index_state *istate,
> +                       void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +       struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +       unsigned long consumed;
> +
> +       if (istate->version == 4) {
> +               previous_name = &previous_name_buf;
> +               mem_pool_init(&istate->ce_mem_pool,
> +                             estimate_cache_size_from_compressed(istate->cache_nr));
> +       } else {
> +               previous_name = NULL;
> +               mem_pool_init(&istate->ce_mem_pool,
> +                             estimate_cache_size(mmap_size, istate->cache_nr));
> +       }
> +
> +       consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
> +                                       0, istate->cache_nr, mmap, src_offset, previous_name);
> +       strbuf_release(&previous_name_buf);
> +       return consumed;
> +}
> +
> +#ifdef NO_PTHREADS
> +
> +#define load_cache_entries load_all_cache_entries
> +
> +#else
> +
> +#include "thread-utils.h"

Don't include files in a middle of a file.

> +
> +/*
> +* Mostly randomly chosen maximum thread counts: we
> +* cap the parallelism to online_cpus() threads, and we want
> +* to have at least 7500 cache entries per thread for it to
> +* be worth starting a thread.
> +*/
> +#define THREAD_COST            (7500)

Isn't 7500 a bit too low? I'm still basing on webkit.git,  and 7500
entries take about 1.2ms on average. 100k files would take about 16ms
and may be more reasonable (still too low in my opinion).

> +
> +struct load_cache_entries_thread_data
> +{
> +       pthread_t pthread;
> +       struct index_state *istate;
> +       struct mem_pool *ce_mem_pool;
> +       int offset, nr;
> +       void *mmap;
> +       unsigned long start_offset;
> +       struct strbuf previous_name_buf;
> +       struct strbuf *previous_name;
> +       unsigned long consumed; /* return # of bytes in index file processed */
> +};
> +
> +/*
> +* A thread proc to run the load_cache_entries() computation
> +* across multiple background threads.
> +*/
> +static void *load_cache_entries_thread(void *_data)
> +{
> +       struct load_cache_entries_thread_data *p = _data;
> +
> +       p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
> +               p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
> +       return NULL;
> +}
> +
> +static unsigned long load_cache_entries(struct index_state *istate,
> +                       void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +       struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +       struct load_cache_entries_thread_data *data;
> +       int nr_threads, cpus, ce_per_thread;
> +       unsigned long consumed;
> +       int i, thread;
> +
> +       nr_threads = git_config_get_index_threads();
> +       if (!nr_threads) {
> +               cpus = online_cpus();
> +               nr_threads = istate->cache_nr / THREAD_COST;
> +               if (nr_threads > cpus)
> +                       nr_threads = cpus;
> +       }
> +
> +       /* enable testing with fewer than default minimum of entries */
> +       if ((istate->cache_nr > 1) && (nr_threads < 2) && git_env_bool("GIT_INDEX_THREADS_TEST", 0))
> +               nr_threads = 2;

Please don't add more '()' than necessary. It's just harder to read.
Maybe break that "if" into two lines since it's getting long.

> +
> +       if (nr_threads < 2)
> +               return load_all_cache_entries(istate, mmap, mmap_size, src_offset);
> +
> +       /* a little sanity checking */
> +       if (istate->name_hash_initialized)
> +               die("the name hash isn't thread safe");
> +
> +       mem_pool_init(&istate->ce_mem_pool, 0);
> +       if (istate->version == 4)
> +               previous_name = &previous_name_buf;
> +       else
> +               previous_name = NULL;
> +
> +       ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
> +       data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
> +
> +       /*
> +        * Loop through index entries starting a thread for every ce_per_thread
> +        * entries. Exit the loop when we've created the final thread (no need
> +        * to parse the remaining entries.
> +        */
> +       consumed = thread = 0;
> +       for (i = 0; ; i++) {
> +               struct ondisk_cache_entry *ondisk;
> +               const char *name;
> +               unsigned int flags;
> +
> +               /*
> +                * we've reached the beginning of a block of cache entries,
> +                * kick off a thread to process them
> +                */
> +               if (0 == i % ce_per_thread) {

I don't get why people keep putting constants in reversed order like
this. Perhaps in the old days, it helps catch "a = 0" mistakes, but
compilers nowadays are smart enough to complain about that and this is
just hard to read.

> +                       struct load_cache_entries_thread_data *p = &data[thread];
> +
> +                       p->istate = istate;
> +                       p->offset = i;
> +                       p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
> +
> +                       /* create a mem_pool for each thread */
> +                       if (istate->version == 4)
> +                               mem_pool_init(&p->ce_mem_pool,
> +                                                 estimate_cache_size_from_compressed(p->nr));
> +                       else
> +                               mem_pool_init(&p->ce_mem_pool,
> +                                                 estimate_cache_size(mmap_size, p->nr));
> +
> +                       p->mmap = mmap;
> +                       p->start_offset = src_offset;
> +                       if (previous_name) {
> +                               strbuf_addbuf(&p->previous_name_buf, previous_name);
> +                               p->previous_name = &p->previous_name_buf;
> +                       }
> +
> +                       if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
> +                               die("unable to create load_cache_entries_thread");
> +
> +                       /* exit the loop when we've created the last thread */
> +                       if (++thread == nr_threads)
> +                               break;

I still think it's better to have an extension to avoid looping
through like this. How much time does this "for (i = 0; ; i++)" loop
cost? The first thread can't start until you've scanned to the second
block, when you have zillion of entries and about 4 cores, that could
be significant delay. Unless you break smaller blocks and have one
thread handles multiple blocks, but then you pay the cost for
synchronization. Other threads may overlap a bit, but starting all
threads at the same time would benefit more. You also can't start
loading the extensions until you've scanned through all this.

> +               }
> +
> +               ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +
> +               /* On-disk flags are just 16 bits */
> +               flags = get_be16(&ondisk->flags);
> +
> +               if (flags & CE_EXTENDED) {
> +                       struct ondisk_cache_entry_extended *ondisk2;
> +                       ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
> +                       name = ondisk2->name;
> +               } else
> +                       name = ondisk->name;
> +
> +               if (!previous_name) {
> +                       size_t len;
> +
> +                       /* v3 and earlier */
> +                       len = flags & CE_NAMEMASK;
> +                       if (len == CE_NAMEMASK)
> +                               len = strlen(name);
> +                       src_offset += (flags & CE_EXTENDED) ?
> +                               ondisk_cache_entry_extended_size(len) :
> +                               ondisk_cache_entry_size(len);
> +               } else
> +                       src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
> +       }
> +
> +       for (i = 0; i < nr_threads; i++) {
> +               struct load_cache_entries_thread_data *p = data + i;
> +               if (pthread_join(p->pthread, NULL))
> +                       die("unable to join load_cache_entries_thread");

_()

> +               mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
> +               strbuf_release(&p->previous_name_buf);
> +               consumed += p->consumed;
> +       }
> +
> +       free(data);
> +       strbuf_release(&previous_name_buf);
> +
> +       return consumed;
> +}
> +
> +#endif
> +
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
  2018-08-29 17:12     ` Junio C Hamano
@ 2018-09-03 19:21     ` Duy Nguyen
  2018-09-03 19:27       ` Duy Nguyen
  1 sibling, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-03 19:21 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano

On Wed, Aug 29, 2018 at 5:25 PM Ben Peart <Ben.Peart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by loading
> the cache extensions on a worker thread in parallel with loading the cache
> entries.
>
> This is possible because the current extensions don't access the cache
> entries in the index_state structure so are OK that they don't all exist
> yet.
>
> The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
> extensions don't even get a pointer to the index so don't have access to the
> cache entries.
>
> CACHE_EXT_LINK only uses the index_state to initialize the split index.
> CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
> update and dirty flags.
>
> I used p0002-read-cache.sh to generate some performance data on the
> cumulative impact:
>
> 100,000 entries
>
> Test                                HEAD~3           HEAD~2
> ---------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

This is misleading (if I read it correctly). 1/3 already drops
execution time down to 9.81, so this patch alone only has about 6%
saving. Have you measured how much time is spent on loading extensions
in single threaded mode? I'm just curious if we could hide that
completely (provided that we have enough cores) while we load the
index.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 2/3] read-cache: load cache extensions on worker thread
  2018-09-03 19:21     ` Duy Nguyen
@ 2018-09-03 19:27       ` Duy Nguyen
  0 siblings, 0 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-09-03 19:27 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano

On Mon, Sep 3, 2018 at 9:21 PM Duy Nguyen <pclouds@gmail.com> wrote:
> > I used p0002-read-cache.sh to generate some performance data on the
> > cumulative impact:
> >
> > 100,000 entries
> >
> > Test                                HEAD~3           HEAD~2
> > ---------------------------------------------------------------------------
> > read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%
>
> This is misleading (if I read it correctly). 1/3 already drops
> execution time down to 9.81, so this patch alone only has about 6%
> saving.

I may have miscalculated that. 1/3 says -30% saving, here it's -31%,
so I guess it's 1% extra saving (or ~3% on 1m entries)? That's
definitely not worth doing.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH] read-cache.c: optimize reading index format v4
  2018-08-27 19:36           ` Junio C Hamano
  2018-08-28 19:25             ` Duy Nguyen
@ 2018-09-04 16:08             ` Duy Nguyen
  1 sibling, 0 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-09-04 16:08 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben.Peart, git, peartben

On Mon, Aug 27, 2018 at 12:36:27PM -0700, Junio C Hamano wrote:
> > PS. I notice that v4 does not pad to align entries at 4 byte boundary
> > like v2/v3. This could cause a slight slow down on x86 and segfault on
> > some other platforms.
> 
> Care to elaborate?  
> 
> Long time ago, we used to mmap and read directly from the index file
> contents, requiring either an unaligned read or padded entries.  But
> that was eons ago and we first read and convert from on-disk using
> get_be32() etc. to in-core structure, so I am not sure what you mean
> by "segfault" here.

To conclude this unalignment thing (since I plan more changes in the
index to keep its size down, which may increase unaligned access), I
ran with the following patch on amd64 (still webkit.git, 275k files,
100 runs), the index version that does not make unaligned access does
not give noticeable differences. Still roughly around 4.2s.

Running with NO_UNALIGNED_LOADS defined is clearly slower, in 4.3s
range. So in theory if we avoid unaligned access in the index and
avoid slow get_beXX versions, we could bring performance back to 4.2s
range for those platforms.

But on the other hand, padding the index increases the index size by
~1MB (v4 version before padding is 21MB) and this may add more cost at
update time because of the trailer hash.

So, yeah it's probably ok to keep living with unaligned access and not
pad more. At least until those on "no unaligned access" platforms yell
up.

diff --git a/read-cache.c b/read-cache.c
index 8628d0f3a8..33ee35fb81 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1794,7 +1794,7 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 		if (copy_len)
 			memcpy(ce->name, previous_ce->name, copy_len);
 		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
-		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+		*ent_size = ((name - ((char *)ondisk)) + len - copy_len + 8) & ~7;
 	} else {
 		memcpy(ce->name, name, len + 1);
 		*ent_size = ondisk_ce_size(ce);
@@ -2345,8 +2345,10 @@ static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
 			result = ce_write(c, fd, to_remove_vi, prefix_size);
 		if (!result)
 			result = ce_write(c, fd, ce->name + common, ce_namelen(ce) - common);
-		if (!result)
-			result = ce_write(c, fd, padding, 1);
+		if (!result) {
+			int len = prefix_size + ce_namelen(ce) - common;
+			result = ce_write(c, fd, padding, align_padding_size(size, len));
+		}
 
 		strbuf_splice(previous_name, common, to_remove,
 			      ce->name + common, ce_namelen(ce) - common);

--
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 1/1] read-cache.c: optimize reading index format v4
  2018-09-02 13:19             ` [PATCH v2 1/1] read-cache.c: " Nguyễn Thái Ngọc Duy
@ 2018-09-04 18:58               ` Junio C Hamano
  2018-09-04 19:31               ` Junio C Hamano
  1 sibling, 0 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-09-04 18:58 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: Ben.Peart, git, peartben

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> +static struct cache_entry *create_from_disk(struct index_state *istate,
>  					    struct ondisk_cache_entry *ondisk,
>  					    unsigned long *ent_size,
> -					    struct strbuf *previous_name)
> +					    const struct cache_entry *previous_ce)
>  {
>  	struct cache_entry *ce;
>  	size_t len;
>  	const char *name;
>  	unsigned int flags;
> +	size_t copy_len;

We should not have to, but let's initialize it to 0 here, because
...

> +	if (expand_name_field) {
> +...
> +		copy_len = previous_len - strip_len;
> +		name = (const char *)cp;
> +	}
> +
> +	if (len == CE_NAMEMASK) {
> +		len = strlen(name);
> +		if (expand_name_field)
> +			len += copy_len;
> ...
> +	}
> +	if (expand_name_field) {
> +		if (copy_len)
> +			memcpy(ce->name, previous_ce->name, copy_len);
> +		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
> +		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;

I am seeing a compiler getting confused, thinking that copy_len
could be used before getting assigned.

Humans can see that reference to copy_len are made only inside "if
(expand_name_field)", so we shouldn't have to.


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v2 1/1] read-cache.c: optimize reading index format v4
  2018-09-02 13:19             ` [PATCH v2 1/1] read-cache.c: " Nguyễn Thái Ngọc Duy
  2018-09-04 18:58               ` Junio C Hamano
@ 2018-09-04 19:31               ` Junio C Hamano
  1 sibling, 0 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-09-04 19:31 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: Ben.Peart, git, peartben

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Index format v4 requires some more computation to assemble a path
> based on a previous one. The current code is not very efficient
> because
>
>  - it doubles memory copy, we assemble the final path in a temporary
>    first before putting it back to a cache_entry
>
>  - strbuf_remove() in expand_name_field() is not exactly a good fit
>    for stripping a part at the end, _setlen() would do the same job
>    and is much cheaper.
>
>  - the open-coded loop to find the end of the string in
>    expand_name_field() can't beat an optimized strlen()
>
> This patch avoids the temporary buffer and writes directly to the new
> cache_entry, which addresses the first two points. The last point
> could also be avoided if the total string length fits in the first 12
> bits of ce_flags, if not we fall back to strlen().
>
> Running "test-tool read-cache 100" on webkit.git (275k files), reading
> v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
> time. The patch reduces read time on v4 to 4.319 seconds.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  read-cache.c | 128 ++++++++++++++++++++++++---------------------------
>  1 file changed, 60 insertions(+), 68 deletions(-)

Thanks; this round is much easier to read with a clearly named
"expand_name_field" boolean variable, etc.


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v3 0/4] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (2 preceding siblings ...)
  2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
@ 2018-09-06 21:03 ` Ben Peart
  2018-09-06 21:03   ` [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing Ben Peart
                     ` (4 more replies)
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
  5 siblings, 5 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

On further investigation with the previous patch, I noticed that my test
repos didn't contain the cache tree extension in their index. After doing a
commit to ensure they existed, I realized that in some instances, the time
to load the cache tree exceeded the time to load all the cache entries in
parallel.  Because the thread to read the cache tree was started last (due
to having to parse through all the cache entries first) we weren't always
getting optimal performance.

To better optimize for this case, I decided to write the EOIE extension
as suggested by Junio [1] in response to my earlier multithreading patch
series [2].  This enables me to spin up the thread to load the extensions
earlier as it no longer has to parse through all the cache entries first.

The big changes in this iteration are:

- add the EOIE extension
- update the index extension worker thread to start first

The absolute perf numbers don't look as good as the previous iteration
because not loading the cache tree at all is a lot faster than loading it in
parallel. These were measured with a V4 index that included a cache tree
extension.

I used p0002-read-cache.sh to generate some performance data on how the three
performance patches help:

p0002-read-cache.sh w/100,000 files                        
Baseline         expand_name_field()    Thread extensions       Thread entries
---------------------------------------------------------------------------------------
22.34(0.01+0.12) 21.14(0.03+0.01) -5.4% 20.71(0.03+0.03) -7.3%	13.93(0.04+0.04) -37.6%

p0002-read-cache.sh w/1,000,000 files                        
Baseline          expand_name_field()     Thread extensions        Thread entries
-------------------------------------------------------------------------------------------
306.44(0.04+0.07) 295.42(0.01+0.07) -3.6% 217.60(0.03+0.04) -29.0% 199.00(0.00+0.10) -35.1%

This patch conflicts with Duy's patch to remove the double memory copy and
pass in the previous ce instead.  The two will need to be merged/reconciled
once they settle down a bit.

[1] https://public-inbox.org/git/xmqq1sl017dw.fsf@gitster.mtv.corp.google.com/
[2] https://public-inbox.org/git/20171109141737.47976-1-benpeart@microsoft.com/


Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/325ec69299
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v3 && git checkout 325ec69299


### Patches

Ben Peart (4):
  read-cache: optimize expand_name_field() to speed up V4 index parsing.
  eoie: add End of Index Entry (EOIE) extension
  read-cache: load cache extensions on a worker thread
  read-cache: speed up index load through parallelization

 Documentation/config.txt                 |   6 +
 Documentation/technical/index-format.txt |  23 ++
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 476 ++++++++++++++++++++---
 t/README                                 |  11 +
 t/t1700-split-index.sh                   |   1 +
 7 files changed, 487 insertions(+), 49 deletions(-)


base-commit: 29d9e3e2c47dd4b5053b0a98c891878d398463e3
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing.
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-06 21:03   ` Ben Peart
  2018-09-06 21:03   ` [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

Optimize expand_name_field() to speed up V4 index parsing.

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

I used p0002-read-cache.sh to generate some performance data:

p0002-read-cache.sh w/100,000 files
Baseline         expand_name_field()
---------------------------------------
22.34(0.01+0.12) 21.14(0.03+0.01) -5.4%

p0002-read-cache.sh w/1,000,000 files
Baseline          expand_name_field()
-----------------------------------------
306.44(0.04+0.07) 295.42(0.01+0.07) -3.6%

Suggested by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..382cc16bdc 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1754,9 +1754,8 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
  2018-09-06 21:03   ` [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing Ben Peart
@ 2018-09-06 21:03   ` Ben Peart
  2018-09-07 17:55     ` Junio C Hamano
  2018-09-06 21:03   ` [PATCH v3 3/4] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 149 +++++++++++++++++++++--
 t/README                                 |   5 +
 t/t1700-split-index.sh                   |   1 +
 4 files changed, 170 insertions(+), 8 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the beginning of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 382cc16bdc..d0d2793780 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1888,6 +1892,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap, size_t mmap_size);
+#endif
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2197,11 +2206,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2444,7 +2457,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2453,6 +2466,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	unsigned long offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2519,11 +2533,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2534,7 +2550,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2544,7 +2560,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2555,7 +2571,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2566,7 +2582,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.
+	 */
+	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2977,3 +3009,104 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *index, *eoie = (const char *)mmap + mmap_size - GIT_SHA1_RAWSZ - EOIE_SIZE_WITH_HEADER;
+	uint32_t extsize;
+	unsigned long offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* validate the extension signature */
+	index = eoie;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if ((const char *)mmap + offset < (const char *)mmap + sizeof(struct cache_header))
+		return 0;
+	if ((const char *)mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *               "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, (const char *)mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (hashcmp(hash, (unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+#endif
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
diff --git a/t/README b/t/README
index 9028b47d92..d8754dd23a 100644
--- a/t/README
+++ b/t/README
@@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncommon pack-objects code
 path where deltas larger than this limit require extra memory
 allocation for bookkeeping.
 
+GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
+This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
+as they currently hard code SHA values for the index which are no longer
+valid due to the addition of the EOIE extension.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 39133bcbc8..f613dd72e3 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,6 +7,7 @@ test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
+export GIT_TEST_DISABLE_EOIE=true
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v3 3/4] read-cache: load cache extensions on a worker thread
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
  2018-09-06 21:03   ` [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing Ben Peart
  2018-09-06 21:03   ` [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-06 21:03   ` Ben Peart
  2018-09-07 21:10     ` Junio C Hamano
  2018-09-06 21:03   ` [PATCH v3 4/4] read-cache: speed up index load through parallelization Ben Peart
  2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
  4 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

p0002-read-cache.sh w/100,000 files
Baseline         Thread extensions
---------------------------------------
21.14(0.03+0.01) 20.71(0.03+0.03) -2.0%

p0002-read-cache.sh w/1,000,000 files
Baseline          Thread extensions
------------------------------------------
295.42(0.01+0.07) 217.60(0.03+0.04) -26.3%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |  6 +++
 config.c                 | 18 ++++++++
 config.h                 |  1 +
 read-cache.c             | 94 ++++++++++++++++++++++++++++++++--------
 4 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1c42364988..79f8296d9c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2391,6 +2391,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPU's and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 9a0b10d4bc..9bd79fb165 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+/*
+ * You can disable multi-threaded code by setting index.threads
+ * to 'false' (or 1)
+ */
+int git_config_get_index_threads(void)
+{
+	int is_bool, val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto-detect */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index d0d2793780..fcc776aaf0 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,10 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#ifndef NO_PTHREADS
+#include <pthread.h>
+#include <thread-utils.h>
+#endif
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1897,6 +1901,46 @@ static unsigned long read_eoie_extension(void *mmap, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	void *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+};
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+			(const char *)p->mmap + src_offset,
+			(char *)p->mmap + src_offset + 8,
+			extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1907,6 +1951,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	void *mmap;
 	size_t mmap_size;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_index_extensions p = { 0 };
+	unsigned long extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1943,6 +1992,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads >= 2) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			/* create a thread to load the index extensions */
+			p.src_offset = extension_offset;
+			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
+				die(_("unable to create load_index_extensions_thread"));
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
@@ -1969,23 +2038,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
-		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it; otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset && pthread_join(p.pthread, NULL))
+		die(_("unable to join load_index_extensions_thread"));
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap(mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v3 4/4] read-cache: speed up index load through parallelization
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
                     ` (2 preceding siblings ...)
  2018-09-06 21:03   ` [PATCH v3 3/4] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-06 21:03   ` Ben Peart
  2018-09-07  4:16     ` Torsten Bögershausen
  2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
  4 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-09-06 21:03 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them.

I used p0002-read-cache.sh to generate some performance data:

p0002-read-cache.sh w/100,000 files
Baseline           Thread entries
------------------------------------------
20.71(0.03+0.03)   13.93(0.04+0.04) -32.7%

p0002-read-cache.sh w/1,000,000 files
Baseline            Thread entries
-------------------------------------------
217.60(0.03+0.04)   199.00(0.00+0.10) -8.6%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 242 +++++++++++++++++++++++++++++++++++++++++++++------
 t/README     |   6 ++
 2 files changed, 220 insertions(+), 28 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index fcc776aaf0..8537a55750 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10,000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+#define THREAD_COST		(10000)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int ce_per_thread;
+	unsigned long consumed;
+	int i, thread;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries. Exit the loop when we've created the final thread (no need
+ * to parse the remaining entries).
+	 */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (i % ce_per_thread == 0) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+						estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+						estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+
+			/* exit the loop when we've created the last thread */
+			if (++thread == nr_threads)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int cpus, nr_threads;
 #endif
 
 	if (istate->initialized)
@@ -1996,10 +2188,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+	if (!nr_threads) {
+		cpus = online_cpus();
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
+
+	/* enable testing with fewer than default minimum of entries */
+	if (istate->cache_nr > 1 && nr_threads < 3 && git_env_bool("GIT_TEST_INDEX_THREADS", 0))
+		nr_threads = 3;
 
 	if (nr_threads >= 2) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2008,33 +2210,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			p.src_offset = extension_offset;
 			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
 				die(_("unable to create load_index_extensions_thread"));
+			nr_threads--;
 		}
 	}
+	if (nr_threads >= 2)
+		src_offset += load_cache_entries_threaded(nr_threads, istate, mmap, mmap_size, src_offset);
+	else
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 #endif
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
diff --git a/t/README b/t/README
index d8754dd23a..59015f7150 100644
--- a/t/README
+++ b/t/README
@@ -324,6 +324,12 @@ This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
 as they currently hard code SHA values for the index which are no longer
 valid due to the addition of the EOIE extension.
 
+GIT_TEST_INDEX_THREADS=<boolean> forces multi-threaded loading of
+the index cache entries and extensions for the whole test suite.
+
 Naming Tests
 ------------
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 4/4] read-cache: speed up index load through parallelization
  2018-09-06 21:03   ` [PATCH v3 4/4] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-07  4:16     ` Torsten Bögershausen
  2018-09-07 13:43       ` Ben Peart
  0 siblings, 1 reply; 87+ messages in thread
From: Torsten Bögershausen @ 2018-09-07  4:16 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, gitster, pclouds, Ben Peart


> diff --git a/read-cache.c b/read-cache.c
> index fcc776aaf0..8537a55750 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
>  	return NULL;
>  }
>  
> +/*
> + * A helper function that will load the specified range of cache entries
> + * from the memory mapped file and add them to the given index.
> + */
> +static unsigned long load_cache_entry_block(struct index_state *istate,
> +			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
> +			unsigned long start_offset, struct strbuf *previous_name)
> +{
> +	int i;
> +	unsigned long src_offset = start_offset;

I read an unsigned long here:
should that be a size_t instead ?

(And probably even everywhere else in this patch)

> +
> +	for (i = offset; i < offset + nr; i++) {
> +		struct ondisk_cache_entry *disk_ce;
> +		struct cache_entry *ce;
> +		unsigned long consumed;
> +
> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
> +		set_index_entry(istate, i, ce);
> +
> +		src_offset += consumed;
> +	}
> +	return src_offset - start_offset;
> +}
> +
> +static unsigned long load_all_cache_entries(struct index_state *istate,
> +			void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	unsigned long consumed;
> +
> +	if (istate->version == 4) {
> +		previous_name = &previous_name_buf;
> +		mem_pool_init(&istate->ce_mem_pool,
> +				estimate_cache_size_from_compressed(istate->cache_nr));
> +	} else {
> +		previous_name = NULL;
> +		mem_pool_init(&istate->ce_mem_pool,
> +				estimate_cache_size(mmap_size, istate->cache_nr));
> +	}
> +
> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
> +					0, istate->cache_nr, mmap, src_offset, previous_name);
> +	strbuf_release(&previous_name_buf);
> +	return consumed;
> +}
> +
> +#ifndef NO_PTHREADS
> +

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 4/4] read-cache: speed up index load through parallelization
  2018-09-07  4:16     ` Torsten Bögershausen
@ 2018-09-07 13:43       ` Ben Peart
  0 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-07 13:43 UTC (permalink / raw)
  To: Torsten Bögershausen, Ben Peart; +Cc: git, gitster, pclouds, Ben Peart



On 9/7/2018 12:16 AM, Torsten Bögershausen wrote:
> 
>> diff --git a/read-cache.c b/read-cache.c
>> index fcc776aaf0..8537a55750 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
>>   	return NULL;
>>   }
>>   
>> +/*
>> + * A helper function that will load the specified range of cache entries
>> + * from the memory mapped file and add them to the given index.
>> + */
>> +static unsigned long load_cache_entry_block(struct index_state *istate,
>> +			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
>> +			unsigned long start_offset, struct strbuf *previous_name)
>> +{
>> +	int i;
>> +	unsigned long src_offset = start_offset;
> 
> I read an unsigned long here:
> should that be a size_t instead ?
> 
> (And probably even everywhere else in this patch)
> 

It's a fair question.  The pre-patch code had a mix of unsigned long and 
size_t.  Both src_offset and consumed were unsigned long but mmap_size 
was a size_t.  I stuck with that pattern for consistency.

While it would be possible to convert everything to size_t as a step to 
enable index files >4 GB, I have a hard time believing that will be 
necessary for a very long time and would likely require more substantial 
changes to enable that to work.

>> +
>> +	for (i = offset; i < offset + nr; i++) {
>> +		struct ondisk_cache_entry *disk_ce;
>> +		struct cache_entry *ce;
>> +		unsigned long consumed;
>> +
>> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
>> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
>> +		set_index_entry(istate, i, ce);
>> +
>> +		src_offset += consumed;
>> +	}
>> +	return src_offset - start_offset;
>> +}
>> +
>> +static unsigned long load_all_cache_entries(struct index_state *istate,
>> +			void *mmap, size_t mmap_size, unsigned long src_offset)
>> +{
>> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>> +	unsigned long consumed;
>> +
>> +	if (istate->version == 4) {
>> +		previous_name = &previous_name_buf;
>> +		mem_pool_init(&istate->ce_mem_pool,
>> +				estimate_cache_size_from_compressed(istate->cache_nr));
>> +	} else {
>> +		previous_name = NULL;
>> +		mem_pool_init(&istate->ce_mem_pool,
>> +				estimate_cache_size(mmap_size, istate->cache_nr));
>> +	}
>> +
>> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
>> +					0, istate->cache_nr, mmap, src_offset, previous_name);
>> +	strbuf_release(&previous_name_buf);
>> +	return consumed;
>> +}
>> +
>> +#ifndef NO_PTHREADS
>> +

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 0/4] read-cache: speed up index load through parallelization
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
                     ` (3 preceding siblings ...)
  2018-09-06 21:03   ` [PATCH v3 4/4] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-07 17:21   ` " Junio C Hamano
  2018-09-07 18:31     ` Ben Peart
  2018-09-08 13:18     ` Duy Nguyen
  4 siblings, 2 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-09-07 17:21 UTC (permalink / raw)
  To: Ben Peart; +Cc: git\, pclouds\, Ben Peart

Ben Peart <benpeart@microsoft.com> writes:

> On further investigation with the previous patch, I noticed that my test
> repos didn't contain the cache tree extension in their index. After doing a
> commit to ensure they existed, I realized that in some instances, the time
> to load the cache tree exceeded the time to load all the cache entries in
> parallel.  Because the thread to read the cache tree was started last (due
> to having to parse through all the cache entries first) we weren't always
> getting optimal performance.
>
> To better optimize for this case, I decided to write the EOIE extension
> as suggested by Junio [1] in response to my earlier multithreading patch
> series [2].  This enables me to spin up the thread to load the extensions
> earlier as it no longer has to parse through all the cache entries first.

Hmph. I kinda liked the simplicity of the previous one, but if we
need to start reading the extension sections sooner by eliminating
the overhead to scan the cache entries, perhaps we should bite the
bullet now.

> The big changes in this iteration are:
>
> - add the EOIE extension
> - update the index extension worker thread to start first

I guess I'd need to see the actual patch to find this out, but once
we rely on a new extension, then we could omit scanning the main
index even to partition the work among workers (i.e. like the topic
long ago, you can have list of pointers into the main index to help
partitioning, plus reset the prefix compression used in v4).  I
think you didn't get that far in this round, which is good.  If the
gain with EOIE alone (and starting the worker for the extension
section early) is large enough without such a pre-computed work
partition table, the simplicity of this round may give us a good
stopping point.

> This patch conflicts with Duy's patch to remove the double memory copy and
> pass in the previous ce instead.  The two will need to be merged/reconciled
> once they settle down a bit.

Thanks.  I have a feeling that 67922abb ("read-cache.c: optimize
reading index format v4", 2018-09-02) is already 'next'-worthy
and ready to be built on, but I'd prefer to hear from Duy to double
check.


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-06 21:03   ` [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-07 17:55     ` Junio C Hamano
  2018-09-07 20:23       ` Ben Peart
  0 siblings, 1 reply; 87+ messages in thread
From: Junio C Hamano @ 2018-09-07 17:55 UTC (permalink / raw)
  To: Ben Peart; +Cc: git\, pclouds\, Ben Peart

Ben Peart <benpeart@microsoft.com> writes:

> The extension consists of:
>
> - 32-bit offset to the end of the index entries
>
> - 160-bit SHA-1 over the extension types and their sizes (but not
> their contents).  E.g. if we have "TREE" extension that is N-bytes
> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> then the hash would be:
>
> SHA-1("TREE" + <binary representation of N> +
> 	"REUC" + <binary representation of M>)

I didn't look at the documentation patch in the larger context, but
please make sure that it is clear to the readers that these fixed
width integers "binary representations" use network byte order.

I briefly wondered if the above should include

    + "EOIE" + <binary representation of (32+160)/8 = 24>

as it is pretty much common file format design to include the header
part of the checksum record (with checksum values padded out with NUL
bytes) when you define a record to hold the checksum of the entire
file.  Since this does not protect the contents of each section from
bit-flipping, adding the data for EOIE itself in the sum does not
give us much (iow, what I am adding above is a constant that does
not contribute any entropy).

How big is a typical TREE extension in _your_ work repository
housing Windows sources?  I am guessing that replacing SHA-1 with
something faster (as this is not about security but is about disk
corruption) and instead hash also the contents of these sections
would NOT help all that much in the performance department, as
having to page them in to read through would already consume
non-trivial amount of time, and that is why you are not hashing the
contents.

> +	/*
> +	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1

s/SHA1/trailing checksum/ or something so that we can withstand
NewHash world order?

> +	 * so that it can be found and processed before all the index entries are
> +	 * read.
> +	 */
> +	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
> +		struct strbuf sb = STRBUF_INIT;
> +
> +		write_eoie_extension(&sb, &eoie_c, offset);
> +		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
>  			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
>  		strbuf_release(&sb);
>  		if (err)

OK.

> +#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
> +
> +#ifndef NO_PTHREADS
> +static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
> +{
> +	/*
> +	 * The end of index entries (EOIE) extension is guaranteed to be last
> +	 * so that it can be found by scanning backwards from the EOF.
> +	 *
> +	 * "EOIE"
> +	 * <4-byte length>
> +	 * <4-byte offset>
> +	 * <20-byte hash>
> +	 */
> +	const char *index, *eoie = (const char *)mmap + mmap_size - GIT_SHA1_RAWSZ - EOIE_SIZE_WITH_HEADER;
> +	uint32_t extsize;
> +	unsigned long offset, src_offset;
> +	unsigned char hash[GIT_MAX_RAWSZ];
> +	git_hash_ctx c;
> +
> +	/* validate the extension signature */
> +	index = eoie;
> +	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
> +		return 0;
> +	index += sizeof(uint32_t);
> +
> +	/* validate the extension size */
> +	extsize = get_be32(index);
> +	if (extsize != EOIE_SIZE)
> +		return 0;
> +	index += sizeof(uint32_t);

Do we know we have at least 8-byte to consume to perform the above
two checks, or is that something we need to verify at the beginning
of this function?  Better yet, as we know that a correct EOIE with
its own header is 28-byte long, we probably should abort if
mmap_size is smaller than that.

> +	/*
> +	 * Validate the offset we're going to look for the first extension
> +	 * signature is after the index header and before the eoie extension.
> +	 */
> +	offset = get_be32(index);
> +	if ((const char *)mmap + offset < (const char *)mmap + sizeof(struct cache_header))
> +		return 0;

Claims that the end is before the beginning, which we reject as bogus.  Good.

> +	if ((const char *)mmap + offset >= eoie)
> +		return 0;

Claims that the end is beyond the EOIE marker we should have placed
after it, which we reject as bogus.  Good.

> +	index += sizeof(uint32_t);
> +
> +	/*
> +	 * The hash is computed over extension types and their sizes (but not
> +	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
> +	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
> +	 * then the hash would be:
> +	 *
> +	 * SHA-1("TREE" + <binary representation of N> +
> +	 *               "REUC" + <binary representation of M>)
> +	 */
> +	src_offset = offset;
> +	the_hash_algo->init_fn(&c);
> +	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
> +		/* After an array of active_nr index entries,
(Style nit).
> +		 * there can be arbitrary number of extended
> +		 * sections, each of which is prefixed with
> +		 * extension name (4-byte) and section length
> +		 * in 4-byte network byte order.
> +		 */
> +		uint32_t extsize;
> +		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
> +		extsize = ntohl(extsize);

Earlier we were using get_be32() but now we use memcpy with ntohl()?
How are we choosing which one to use?

I think you meant to cast mmap to (const char *) here.  It may make it
easier to write and read if we started this function like so:

	static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
	{
		const char *mmap = mmap_;

then we do not have to keep casting mmap and cast to a wrong type by
mistake.

> +
> +		/* verify the extension size isn't so large it will wrap around */
> +		if (src_offset + 8 + extsize < src_offset)
> +			return 0;

Good.

> +		the_hash_algo->update_fn(&c, (const char *)mmap + src_offset, 8);
> +
> +		src_offset += 8;
> +		src_offset += extsize;
> +	}
> +	the_hash_algo->final_fn(hash, &c);
> +	if (hashcmp(hash, (unsigned char *)index))
> +		return 0;
> +
> +	/* Validate that the extension offsets returned us back to the eoie extension. */
> +	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
> +		return 0;

Very good.

> +	return offset;
> +}
> +#endif

Overall it looks like it is carefully done.
Thanks.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 0/4] read-cache: speed up index load through parallelization
  2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
@ 2018-09-07 18:31     ` Ben Peart
  2018-09-08 13:18     ` Duy Nguyen
  1 sibling, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-07 18:31 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds, Ben Peart



On 9/7/2018 1:21 PM, Junio C Hamano wrote:
> Ben Peart <benpeart@microsoft.com> writes:
> 
>> On further investigation with the previous patch, I noticed that my test
>> repos didn't contain the cache tree extension in their index. After doing a
>> commit to ensure they existed, I realized that in some instances, the time
>> to load the cache tree exceeded the time to load all the cache entries in
>> parallel.  Because the thread to read the cache tree was started last (due
>> to having to parse through all the cache entries first) we weren't always
>> getting optimal performance.
>>
>> To better optimize for this case, I decided to write the EOIE extension
>> as suggested by Junio [1] in response to my earlier multithreading patch
>> series [2].  This enables me to spin up the thread to load the extensions
>> earlier as it no longer has to parse through all the cache entries first.
> 
> Hmph. I kinda liked the simplicity of the previous one, but if we
> need to start reading the extension sections sooner by eliminating
> the overhead to scan the cache entries, perhaps we should bite the
> bullet now.
> 

I preferred the simplicity as well but when I was profiling the code and 
found out that loading the extensions was most often the last thread to 
complete, I took this intermediate step to speed things up.

>> The big changes in this iteration are:
>>
>> - add the EOIE extension
>> - update the index extension worker thread to start first
> 
> I guess I'd need to see the actual patch to find this out, but once
> we rely on a new extension, then we could omit scanning the main
> index even to partition the work among workers (i.e. like the topic
> long ago, you can have list of pointers into the main index to help
> partitioning, plus reset the prefix compression used in v4).  I
> think you didn't get that far in this round, which is good.  If the
> gain with EOIE alone (and starting the worker for the extension
> section early) is large enough without such a pre-computed work
> partition table, the simplicity of this round may give us a good
> stopping point.
> 

Agreed.  I didn't go that far in this series as it doesn't appear to be 
necessary.  We could always add that later if it turned out to be worth 
the additional complexity.

>> This patch conflicts with Duy's patch to remove the double memory copy and
>> pass in the previous ce instead.  The two will need to be merged/reconciled
>> once they settle down a bit.
> 
> Thanks.  I have a feeling that 67922abb ("read-cache.c: optimize
> reading index format v4", 2018-09-02) is already 'next'-worthy
> and ready to be built on, but I'd prefer to hear from Duy to double
> check.
> 

I'll take a closer look at what this will entail. It gets more 
complicated as we don't actually have a previous expanded cache entry 
when starting each thread.  I'll see how complex it makes the code and 
how much additional performance it gives.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-07 17:55     ` Junio C Hamano
@ 2018-09-07 20:23       ` Ben Peart
  2018-09-08  6:29         ` Martin Ågren
  0 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-09-07 20:23 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds, Ben Peart



On 9/7/2018 1:55 PM, Junio C Hamano wrote:
> Ben Peart <benpeart@microsoft.com> writes:
> 
>> The extension consists of:
>>
>> - 32-bit offset to the end of the index entries
>>
>> - 160-bit SHA-1 over the extension types and their sizes (but not
>> their contents).  E.g. if we have "TREE" extension that is N-bytes
>> long, "REUC" extension that is M-bytes long, followed by "EOIE",
>> then the hash would be:
>>
>> SHA-1("TREE" + <binary representation of N> +
>> 	"REUC" + <binary representation of M>)
> 
> I didn't look at the documentation patch in the larger context, but
> please make sure that it is clear to the readers that these fixed
> width integers "binary representations" use network byte order.
> 

At the top of the documentation it says "All binary numbers are in 
network byte order" and that is not repeated for any of the other 
sections that are documenting the file format.

> I briefly wondered if the above should include
> 
>      + "EOIE" + <binary representation of (32+160)/8 = 24>
> 
> as it is pretty much common file format design to include the header
> part of the checksum record (with checksum values padded out with NUL
> bytes) when you define a record to hold the checksum of the entire
> file.  Since this does not protect the contents of each section from
> bit-flipping, adding the data for EOIE itself in the sum does not
> give us much (iow, what I am adding above is a constant that does
> not contribute any entropy).
> 
> How big is a typical TREE extension in _your_ work repository
> housing Windows sources?  I am guessing that replacing SHA-1 with
> something faster (as this is not about security but is about disk
> corruption) and instead hash also the contents of these sections
> would NOT help all that much in the performance department, as
> having to page them in to read through would already consume
> non-trivial amount of time, and that is why you are not hashing the
> contents.
> 

The purpose of the SHA isn't to detect disk corruption (we already have 
a SHA for the entire index that can serve that purpose) but to help 
ensure that this was actually a valid EOIE extension and not a lucky 
random set of bytes.  I had used leading and trailing signature bytes 
along with the length and version bytes to validate it was an actual 
EOIE extension but you suggested [1] that I use a SHA of the 4 byte 
extension type + 4 byte extension length instead so I rewrote it that way.

[1] 
https://public-inbox.org/git/xmqq1sl017dw.fsf@gitster.mtv.corp.google.com/

>> +	/*
>> +	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
> 
> s/SHA1/trailing checksum/ or something so that we can withstand
> NewHash world order?
> 

I thought about this but in the document elsewhere it refers to it as 
"160-bit SHA-1 over the content of the index file before this checksum." 
and there are at least a dozen other references to "SHA-1" so I figured 
we can fix them all at the same time when we have a new/better name. :-)

>> +	 * so that it can be found and processed before all the index entries are
>> +	 * read.
>> +	 */
>> +	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
>> +		struct strbuf sb = STRBUF_INIT;
>> +
>> +		write_eoie_extension(&sb, &eoie_c, offset);
>> +		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
>>   			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
>>   		strbuf_release(&sb);
>>   		if (err)
> 
> OK.
> 
>> +#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
>> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
>> +
>> +#ifndef NO_PTHREADS
>> +static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
>> +{
>> +	/*
>> +	 * The end of index entries (EOIE) extension is guaranteed to be last
>> +	 * so that it can be found by scanning backwards from the EOF.
>> +	 *
>> +	 * "EOIE"
>> +	 * <4-byte length>
>> +	 * <4-byte offset>
>> +	 * <20-byte hash>
>> +	 */
>> +	const char *index, *eoie = (const char *)mmap + mmap_size - GIT_SHA1_RAWSZ - EOIE_SIZE_WITH_HEADER;
>> +	uint32_t extsize;
>> +	unsigned long offset, src_offset;
>> +	unsigned char hash[GIT_MAX_RAWSZ];
>> +	git_hash_ctx c;
>> +
>> +	/* validate the extension signature */
>> +	index = eoie;
>> +	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
>> +		return 0;
>> +	index += sizeof(uint32_t);
>> +
>> +	/* validate the extension size */
>> +	extsize = get_be32(index);
>> +	if (extsize != EOIE_SIZE)
>> +		return 0;
>> +	index += sizeof(uint32_t);
> 
> Do we know we have at least 8-byte to consume to perform the above
> two checks, or is that something we need to verify at the beginning
> of this function?  Better yet, as we know that a correct EOIE with
> its own header is 28-byte long, we probably should abort if
> mmap_size is smaller than that.
> 

I'll add that additional test.

>> +	/*
>> +	 * Validate the offset we're going to look for the first extension
>> +	 * signature is after the index header and before the eoie extension.
>> +	 */
>> +	offset = get_be32(index);
>> +	if ((const char *)mmap + offset < (const char *)mmap + sizeof(struct cache_header))
>> +		return 0;
> 
> Claims that the end is before the beginning, which we reject as bogus.  Good.
> 
>> +	if ((const char *)mmap + offset >= eoie)
>> +		return 0;
> 
> Claims that the end is beyond the EOIE marker we should have placed
> after it, which we reject as bogus.  Good.
> 
>> +	index += sizeof(uint32_t);
>> +
>> +	/*
>> +	 * The hash is computed over extension types and their sizes (but not
>> +	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
>> +	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
>> +	 * then the hash would be:
>> +	 *
>> +	 * SHA-1("TREE" + <binary representation of N> +
>> +	 *               "REUC" + <binary representation of M>)
>> +	 */
>> +	src_offset = offset;
>> +	the_hash_algo->init_fn(&c);
>> +	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
>> +		/* After an array of active_nr index entries,
> (Style nit).
>> +		 * there can be arbitrary number of extended
>> +		 * sections, each of which is prefixed with
>> +		 * extension name (4-byte) and section length
>> +		 * in 4-byte network byte order.
>> +		 */
>> +		uint32_t extsize;
>> +		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
>> +		extsize = ntohl(extsize);
> 
> Earlier we were using get_be32() but now we use memcpy with ntohl()?
> How are we choosing which one to use?
> 

I literally copy/pasted this logic from the code that actually loads the 
extensions then removed the call to load the extension and replaced it 
with the call to update the hash.  I kept it the same to facilitate 
consistency for any future fixes or changes.

> I think you meant to cast mmap to (const char *) here.  It may make it
> easier to write and read if we started this function like so:
> 
> 	static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
> 	{
> 		const char *mmap = mmap_;
> 
> then we do not have to keep casting mmap and cast to a wrong type by
> mistake.
> 

Good suggestion.

>> +
>> +		/* verify the extension size isn't so large it will wrap around */
>> +		if (src_offset + 8 + extsize < src_offset)
>> +			return 0;
> 
> Good.
> 
>> +		the_hash_algo->update_fn(&c, (const char *)mmap + src_offset, 8);
>> +
>> +		src_offset += 8;
>> +		src_offset += extsize;
>> +	}
>> +	the_hash_algo->final_fn(hash, &c);
>> +	if (hashcmp(hash, (unsigned char *)index))
>> +		return 0;
>> +
>> +	/* Validate that the extension offsets returned us back to the eoie extension. */
>> +	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
>> +		return 0;
> 
> Very good.
> 
>> +	return offset;
>> +}
>> +#endif
> 
> Overall it looks like it is carefully done.

Thanks for the careful review!

> Thanks.
> 

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 3/4] read-cache: load cache extensions on a worker thread
  2018-09-06 21:03   ` [PATCH v3 3/4] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-07 21:10     ` Junio C Hamano
  2018-09-08 14:56       ` Ben Peart
  0 siblings, 1 reply; 87+ messages in thread
From: Junio C Hamano @ 2018-09-07 21:10 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, pclouds, Ben Peart

Ben Peart <benpeart@microsoft.com> writes:

> +struct load_index_extensions
> +{
> +#ifndef NO_PTHREADS
> +	pthread_t pthread;
> +#endif
> +	struct index_state *istate;
> +	void *mmap;
> +	size_t mmap_size;
> +	unsigned long src_offset;

If the file format only allows uint32_t on any platform, perhaps
this is better specified as uint32_t?  Or if this is offset into
a mmap'ed region of memory, size_t may be more appropriate.

Same comment applies to "extension_offset" we see below (which in
turn means the returned type of read_eoie_extension() function may
want to match).

> + };

Space before '}'??

> +
> +static void *load_index_extensions(void *_data)
> +{
> +	struct load_index_extensions *p = _data;

Perhaps we are being superstitious, but I think our code try to
avoid leading underscore when able, i.e.

	load_index_extensions(void *data_)
	{
		struct load_index_extensions *p = data;

> +	unsigned long src_offset = p->src_offset;
> +
> +	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> +		/* After an array of active_nr index entries,
> +		 * there can be arbitrary number of extended
> +		 * sections, each of which is prefixed with
> +		 * extension name (4-byte) and section length
> +		 * in 4-byte network byte order.
> +		 */
> +		uint32_t extsize;
> +		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
> +		extsize = ntohl(extsize);

The same "ntohl(), not get_be32()?" question as the one for the
previous step applies here, too.  I think the answer is "the
original was written that way" and that is acceptable, but once this
series lands, we may want to review the whole file and see if it is
worth making them consistent with a separate clean-up patch.

I think mmap() and munmap() are the only places that wants p->mmap
and mmap parameters passed around in various callchains to be of
type "void *"---I wonder if it is simpler to use "const char *"
throughout and only cast it to "void *" when necessary (I suspect
that there is nowhere we need to cast to or from "void *" explicitly
if we did so---assignment and argument passing would give us an
appropriate cast for free)?

> +		if (read_index_extension(p->istate,
> +			(const char *)p->mmap + src_offset,
> +			(char *)p->mmap + src_offset + 8,
> +			extsize) < 0) {
> +			munmap(p->mmap, p->mmap_size);
> +			die("index file corrupt");
> +		}
> +	...
> @@ -1907,6 +1951,11 @@ ...
> ...
> +	p.mmap = mmap;
> +	p.mmap_size = mmap_size;
> +
> +#ifndef NO_PTHREADS
> +	nr_threads = git_config_get_index_threads();
> +	if (!nr_threads)
> +		nr_threads = online_cpus();
> +
> +	if (nr_threads >= 2) {
> +		extension_offset = read_eoie_extension(mmap, mmap_size);
> +		if (extension_offset) {
> +			/* create a thread to load the index extensions */
> +			p.src_offset = extension_offset;
> +			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
> +				die(_("unable to create load_index_extensions_thread"));
> +		}
> +	}
> +#endif

Makes sense.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-07 20:23       ` Ben Peart
@ 2018-09-08  6:29         ` Martin Ågren
  2018-09-08 14:03           ` Ben Peart
  0 siblings, 1 reply; 87+ messages in thread
From: Martin Ågren @ 2018-09-08  6:29 UTC (permalink / raw)
  To: Ben Peart
  Cc: Junio C Hamano, Ben Peart, Git Mailing List,
	Nguyễn Thái Ngọc Duy, Ben Peart

On Fri, 7 Sep 2018 at 22:24, Ben Peart <peartben@gmail.com> wrote:
> > Ben Peart <benpeart@microsoft.com> writes:

> >> - 160-bit SHA-1 over the extension types and their sizes (but not
> >> their contents).  E.g. if we have "TREE" extension that is N-bytes
> >> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> >> then the hash would be:

> The purpose of the SHA isn't to detect disk corruption (we already have
> a SHA for the entire index that can serve that purpose) but to help
> ensure that this was actually a valid EOIE extension and not a lucky
> random set of bytes. [...]

> >> +#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */

> >> +    the_hash_algo->init_fn(&c);
> >> +    while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
[...]
> >> +    the_hash_algo->final_fn(hash, &c);
> >> +    if (hashcmp(hash, (unsigned char *)index))
> >> +            return 0;
> >> +
> >> +    /* Validate that the extension offsets returned us back to the eoie extension. */
> >> +    if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
> >> +            return 0;

Besides the issue you and Junio discussed with "should we document this
as being SHA-1 or NewHash" (or "the hash algo"), it seems to me that
this implementation is living somewhere between using SHA-1 and "the
hash algo". The hashing uses `the_hash_algo`, but the hash size is
hardcoded at 20 bytes.

Maybe it all works out, e.g., so that when someone (brian) merges a
NewHash and runs the testsuite, this will fail consistently and in a
safe way. But I wonder if it would be too hard to avoid the hardcoded 24
already now.

Martin

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 0/4] read-cache: speed up index load through parallelization
  2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
  2018-09-07 18:31     ` Ben Peart
@ 2018-09-08 13:18     ` Duy Nguyen
  1 sibling, 0 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-09-08 13:18 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben Peart, Git Mailing List, Ben Peart

On Fri, Sep 7, 2018 at 7:21 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Ben Peart <benpeart@microsoft.com> writes:
>
> > On further investigation with the previous patch, I noticed that my test
> > repos didn't contain the cache tree extension in their index. After doing a
> > commit to ensure they existed, I realized that in some instances, the time
> > to load the cache tree exceeded the time to load all the cache entries in
> > parallel.  Because the thread to read the cache tree was started last (due
> > to having to parse through all the cache entries first) we weren't always
> > getting optimal performance.
> >
> > To better optimize for this case, I decided to write the EOIE extension
> > as suggested by Junio [1] in response to my earlier multithreading patch
> > series [2].  This enables me to spin up the thread to load the extensions
> > earlier as it no longer has to parse through all the cache entries first.
>
> Hmph. I kinda liked the simplicity of the previous one, but if we
> need to start reading the extension sections sooner by eliminating
> the overhead to scan the cache entries, perhaps we should bite the
> bullet now.

My view is slightly different. If we have to optimize might as well
try to squeeze the best out of it. Simplicity is already out of the
window at this point (but maintainability remains).

> > The big changes in this iteration are:
> >
> > - add the EOIE extension
> > - update the index extension worker thread to start first
>
> I guess I'd need to see the actual patch to find this out, but once
> we rely on a new extension, then we could omit scanning the main
> index even to partition the work among workers (i.e. like the topic
> long ago, you can have list of pointers into the main index to help
> partitioning, plus reset the prefix compression used in v4).  I
> think you didn't get that far in this round, which is good.  If the
> gain with EOIE alone (and starting the worker for the extension
> section early) is large enough without such a pre-computed work
> partition table, the simplicity of this round may give us a good
> stopping point.

I suspect the reduced gain in 1M files case compared to 100k files in
4/4 is because of scanning the index to split work to worker threads.
Since the index is now larger, the scanning takes more time before we
can start worker threads and we gain less from parallelization. I have
not experimented to see if this is true or there is something else.

> > This patch conflicts with Duy's patch to remove the double memory copy and
> > pass in the previous ce instead.  The two will need to be merged/reconciled
> > once they settle down a bit.
>
> Thanks.  I have a feeling that 67922abb ("read-cache.c: optimize
> reading index format v4", 2018-09-02) is already 'next'-worthy
> and ready to be built on, but I'd prefer to hear from Duy to double
> check.

Yes I think it's good. I ran the entire test suite with v4 just to
double check (and thinking of testing v4 version in travis too).
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-08  6:29         ` Martin Ågren
@ 2018-09-08 14:03           ` Ben Peart
  2018-09-08 17:08             ` Martin Ågren
  0 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-09-08 14:03 UTC (permalink / raw)
  To: Martin Ågren
  Cc: Junio C Hamano, Ben Peart, Git Mailing List,
	Nguyễn Thái Ngọc Duy, Ben Peart



On 9/8/2018 2:29 AM, Martin Ågren wrote:
> On Fri, 7 Sep 2018 at 22:24, Ben Peart <peartben@gmail.com> wrote:
>>> Ben Peart <benpeart@microsoft.com> writes:
> 
>>>> - 160-bit SHA-1 over the extension types and their sizes (but not
>>>> their contents).  E.g. if we have "TREE" extension that is N-bytes
>>>> long, "REUC" extension that is M-bytes long, followed by "EOIE",
>>>> then the hash would be:
> 
>> The purpose of the SHA isn't to detect disk corruption (we already have
>> a SHA for the entire index that can serve that purpose) but to help
>> ensure that this was actually a valid EOIE extension and not a lucky
>> random set of bytes. [...]
> 
>>>> +#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
> 
>>>> +    the_hash_algo->init_fn(&c);
>>>> +    while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
> [...]
>>>> +    the_hash_algo->final_fn(hash, &c);
>>>> +    if (hashcmp(hash, (unsigned char *)index))
>>>> +            return 0;
>>>> +
>>>> +    /* Validate that the extension offsets returned us back to the eoie extension. */
>>>> +    if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
>>>> +            return 0;
> 
> Besides the issue you and Junio discussed with "should we document this
> as being SHA-1 or NewHash" (or "the hash algo"), it seems to me that
> this implementation is living somewhere between using SHA-1 and "the
> hash algo". The hashing uses `the_hash_algo`, but the hash size is
> hardcoded at 20 bytes.
> 
> Maybe it all works out, e.g., so that when someone (brian) merges a
> NewHash and runs the testsuite, this will fail consistently and in a
> safe way. But I wonder if it would be too hard to avoid the hardcoded 24
> already now.
> 
> Martin
> 

I can certainly change this to be:

#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ)

which should (hopefully) make it easier to find this hard coded hash 
length in the sea of hard coded "20" and "160" (bits) littered through 
the codebase.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 3/4] read-cache: load cache extensions on a worker thread
  2018-09-07 21:10     ` Junio C Hamano
@ 2018-09-08 14:56       ` Ben Peart
  0 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-08 14:56 UTC (permalink / raw)
  To: Junio C Hamano, Ben Peart; +Cc: git, pclouds, Ben Peart



On 9/7/2018 5:10 PM, Junio C Hamano wrote:
> Ben Peart <benpeart@microsoft.com> writes:
> 
>> +struct load_index_extensions
>> +{
>> +#ifndef NO_PTHREADS
>> +	pthread_t pthread;
>> +#endif
>> +	struct index_state *istate;
>> +	void *mmap;
>> +	size_t mmap_size;
>> +	unsigned long src_offset;
> 
> If the file format only allows uint32_t on any platform, perhaps
> this is better specified as uint32_t?  Or if this is offset into
> a mmap'ed region of memory, size_t may be more appropriate.
> 
> Same comment applies to "extension_offset" we see below (which in
> turn means the returned type of read_eoie_extension() function may
> want to match).
> 
>> + };
> 
> Space before '}'??
> 
>> +
>> +static void *load_index_extensions(void *_data)
>> +{
>> +	struct load_index_extensions *p = _data;
> 
> Perhaps we are being superstitious, but I think our code try to
> avoid leading underscore when able, i.e.
> 
> 	load_index_extensions(void *data_)
> 	{
> 		struct load_index_extensions *p = data;

That's what I get for copying code from elsewhere in the source. :-)

static void *preload_thread(void *_data)
{
	int nr;
	struct thread_data *p = _data;

since there isn't any need for the underscore at all, I'll just make it:

static void *load_index_extensions(void *data)
{
	struct load_index_extensions *p = data;

> 
>> +	unsigned long src_offset = p->src_offset;
>> +
>> +	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
>> +		/* After an array of active_nr index entries,
>> +		 * there can be arbitrary number of extended
>> +		 * sections, each of which is prefixed with
>> +		 * extension name (4-byte) and section length
>> +		 * in 4-byte network byte order.
>> +		 */
>> +		uint32_t extsize;
>> +		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
>> +		extsize = ntohl(extsize);
> 
> The same "ntohl(), not get_be32()?" question as the one for the
> previous step applies here, too.  I think the answer is "the
> original was written that way" and that is acceptable, but once this
> series lands, we may want to review the whole file and see if it is
> worth making them consistent with a separate clean-up patch.
> 

Makes sense, I'll add a cleanup patch to fix the inconsistency and have 
them use get_be32().

> I think mmap() and munmap() are the only places that wants p->mmap
> and mmap parameters passed around in various callchains to be of
> type "void *"---I wonder if it is simpler to use "const char *"
> throughout and only cast it to "void *" when necessary (I suspect
> that there is nowhere we need to cast to or from "void *" explicitly
> if we did so---assignment and argument passing would give us an
> appropriate cast for free)?

Sure, I'll add minimizing the casting to the clean up patch.

> 
>> +		if (read_index_extension(p->istate,
>> +			(const char *)p->mmap + src_offset,
>> +			(char *)p->mmap + src_offset + 8,
>> +			extsize) < 0) {
>> +			munmap(p->mmap, p->mmap_size);
>> +			die("index file corrupt");
>> +		}
>> +	...
>> @@ -1907,6 +1951,11 @@ ...
>> ...
>> +	p.mmap = mmap;
>> +	p.mmap_size = mmap_size;
>> +
>> +#ifndef NO_PTHREADS
>> +	nr_threads = git_config_get_index_threads();
>> +	if (!nr_threads)
>> +		nr_threads = online_cpus();
>> +
>> +	if (nr_threads >= 2) {
>> +		extension_offset = read_eoie_extension(mmap, mmap_size);
>> +		if (extension_offset) {
>> +			/* create a thread to load the index extensions */
>> +			p.src_offset = extension_offset;
>> +			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
>> +				die(_("unable to create load_index_extensions_thread"));
>> +		}
>> +	}
>> +#endif
> 
> Makes sense.
> 

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension
  2018-09-08 14:03           ` Ben Peart
@ 2018-09-08 17:08             ` Martin Ågren
  0 siblings, 0 replies; 87+ messages in thread
From: Martin Ågren @ 2018-09-08 17:08 UTC (permalink / raw)
  To: Ben Peart
  Cc: Junio C Hamano, Ben Peart, Git Mailing List,
	Nguyễn Thái Ngọc Duy, Ben Peart

On Sat, 8 Sep 2018 at 16:04, Ben Peart <peartben@gmail.com> wrote:
> On 9/8/2018 2:29 AM, Martin Ågren wrote:
> > Maybe it all works out, e.g., so that when someone (brian) merges a
> > NewHash and runs the testsuite, this will fail consistently and in a
> > safe way. But I wonder if it would be too hard to avoid the hardcoded 24
> > already now.
>
> I can certainly change this to be:
>
> #define EOIE_SIZE (4 + GIT_SHA1_RAWSZ)
>
> which should (hopefully) make it easier to find this hard coded hash
> length in the sea of hard coded "20" and "160" (bits) littered through
> the codebase.

Yeah, that seems more grep-friendly.

Martin

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v4 0/5] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (3 preceding siblings ...)
  2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-11 23:26 ` " Ben Peart
  2018-09-11 23:26   ` [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (5 more replies)
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
  5 siblings, 6 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

This version of the patch merges in Duy's work to speed up index v4 decoding.
I had to massage it a bit to get it to work with the multi-threading but its
still largely his code. It helps a little (3%-4%) when the cache entry thread(s)
take the longest and not when the index extensions loading is the long thread.

I also added a minor cleanup patch to minimize the casting required when
working with the memory mapped index and other minor changes based on the
feedback received.

Base Ref: v2.19.0
Web-Diff: https://github.com/benpeart/git/commit/9d31d5fb20
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v4 && git checkout 9d31d5fb20


### Patches

Ben Peart (4):
  eoie: add End of Index Entry (EOIE) extension
  read-cache: load cache extensions on a worker thread
  read-cache: speed up index load through parallelization
  read-cache: clean up casting and byte decoding

Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 Documentation/config.txt                 |   6 +
 Documentation/technical/index-format.txt |  23 +
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 581 +++++++++++++++++++----
 5 files changed, 531 insertions(+), 98 deletions(-)


base-commit: 1d4361b0f344188ab5eec6dcea01f61a3a3a1670
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-11 23:26   ` [PATCH v4 2/5] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 154 +++++++++++++++++++++--
 2 files changed, 169 insertions(+), 8 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the beginning of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..2abac0a7a2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+#endif
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	unsigned long offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2535,7 +2551,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2545,7 +2561,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2556,7 +2572,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2567,7 +2583,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.
+	 */
+	if (!strip_extensions && offset) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2978,3 +3010,109 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *mmap = mmap_;
+	const char *index, *eoie;
+	uint32_t extsize;
+	unsigned long offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
+	/* validate the extension signature */
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if (mmap + offset < mmap + sizeof(struct cache_header))
+		return 0;
+	if (mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *               "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (hashcmp(hash, (const unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+#endif
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v4 2/5] read-cache: load cache extensions on a worker thread
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
  2018-09-11 23:26   ` [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-11 23:26   ` [PATCH v4 3/5] read-cache: speed up index load through parallelization Ben Peart
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files                Baseline         Parallel Extensions
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

Test w/1,000,000 files              Baseline         Parallel Extensions
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |  6 +++
 config.c                 | 18 ++++++++
 config.h                 |  1 +
 read-cache.c             | 94 ++++++++++++++++++++++++++++++++--------
 4 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index eb66a11975..d0d8075978 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2400,6 +2400,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPUs and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 3461993f0a..f7ebf149fc 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+/*
+ * You can disable multi-threaded code by setting index.threads
+ * to 'false' (or 1)
+ */
+int git_config_get_index_threads(void)
+{
+	int is_bool, val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto-detect */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 2abac0a7a2..9b97c29f5b 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,10 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#ifndef NO_PTHREADS
+#include <pthread.h>
+#include <thread-utils.h>
+#endif
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1898,6 +1902,46 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	void *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+};
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+			(const char *)p->mmap + src_offset,
+			(char *)p->mmap + src_offset + 8,
+			extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1908,6 +1952,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	void *mmap;
 	size_t mmap_size;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_index_extensions p = { 0 };
+	unsigned long extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads >= 2) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			/* create a thread to load the index extensions */
+			p.src_offset = extension_offset;
+			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
+				die(_("unable to create load_index_extensions_thread"));
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
@@ -1970,23 +2039,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
-		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset && pthread_join(p.pthread, NULL))
+		die(_("unable to join load_index_extensions_thread"));
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap(mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v4 3/5] read-cache: speed up index load through parallelization
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
  2018-09-11 23:26   ` [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-09-11 23:26   ` [PATCH v4 2/5] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-11 23:26   ` [PATCH v4 4/5] read-cache.c: optimize reading index format v4 Ben Peart
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files                Baseline         Parallel entries
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

Test w/1,000,000 files              Baseline         Parallel entries
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 240 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 212 insertions(+), 28 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 9b97c29f5b..c01d34a71d 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1942,20 +1942,210 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool, istate->cache_nr * (sizeof(struct cache_entry) + CACHE_ENTRY_PATH_LENGTH));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool, estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+#define THREAD_COST		(10000)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int ce_per_thread;
+	unsigned long consumed;
+	int i, thread;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries. Exit the loop when we've created the final thread (no need
+	 * to parse the remaining entries).
+	 */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (i % ce_per_thread == 0) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+					estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+					estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+
+			/* exit the loop when we've created the last thread */
+			if (++thread == nr_threads)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int cpus, nr_threads;
 #endif
 
 	if (istate->initialized)
@@ -1997,10 +2187,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+	if (!nr_threads) {
+		cpus = online_cpus();
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
+
+	/* enable testing with fewer than default minimum of entries */
+	if (istate->cache_nr > 1 && nr_threads < 3 && git_env_bool("GIT_INDEX_THREADS_TEST", 0))
+		nr_threads = 3;
 
 	if (nr_threads >= 2) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2009,33 +2209,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			p.src_offset = extension_offset;
 			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
 				die(_("unable to create load_index_extensions_thread"));
+			nr_threads--;
 		}
 	}
+	if (nr_threads >= 2)
+		src_offset += load_cache_entries_threaded(nr_threads, istate, mmap, mmap_size, src_offset);
+	else
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 #endif
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v4 4/5] read-cache.c: optimize reading index format v4
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
                     ` (2 preceding siblings ...)
  2018-09-11 23:26   ` [PATCH v4 3/5] read-cache: speed up index load through parallelization Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-11 23:26   ` [PATCH v4 5/5] read-cache: clean up casting and byte decoding Ben Peart
  2018-09-12 14:34   ` [PATCH v4 0/5] read-cache: speed up index load through parallelization Ben Peart
  5 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles memory copy, we assemble the final path in a temporary
   first before putting it back to a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 read-cache.c | 136 +++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 65 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index c01d34a71d..d21ccb5e67 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1721,33 +1721,6 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
 /*
  * Adjacent cache entries tend to share the leading paths, so it makes
  * sense to only store the differences in later entries.  In the v4
@@ -1762,22 +1735,24 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
 
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1797,21 +1772,54 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (len == CE_NAMEMASK) {
+		len = strlen(name);
+		if (expand_name_field)
+			len += copy_len;
+	}
+
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
+
+	if (expand_name_field) {
+		if (copy_len)
+			memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1948,7 +1956,7 @@ static void *load_index_extensions(void *_data)
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
 			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
-			unsigned long start_offset, struct strbuf *previous_name)
+			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
 	unsigned long src_offset = start_offset;
@@ -1959,10 +1967,11 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
 	return src_offset - start_offset;
 }
@@ -1970,20 +1979,16 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 static unsigned long load_all_cache_entries(struct index_state *istate,
 			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	unsigned long consumed;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool, istate->cache_nr * (sizeof(struct cache_entry) + CACHE_ENTRY_PATH_LENGTH));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool, estimate_cache_size(mmap_size, istate->cache_nr));
 	}
 
 	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
-					0, istate->cache_nr, mmap, src_offset, previous_name);
-	strbuf_release(&previous_name_buf);
+					0, istate->cache_nr, mmap, src_offset, NULL);
 	return consumed;
 }
 
@@ -2005,8 +2010,7 @@ struct load_cache_entries_thread_data
 	int offset, nr;
 	void *mmap;
 	unsigned long start_offset;
-	struct strbuf previous_name_buf;
-	struct strbuf *previous_name;
+	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
@@ -2019,7 +2023,7 @@ static void *load_cache_entries_thread(void *_data)
 	struct load_cache_entries_thread_data *p = _data;
 
 	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
-		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_ce);
 	return NULL;
 }
 
@@ -2066,20 +2070,23 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 			p->istate = istate;
 			p->offset = i;
 			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+			p->mmap = mmap;
+			p->start_offset = src_offset;
 
 			/* create a mem_pool for each thread */
-			if (istate->version == 4)
+			if (istate->version == 4) {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size_from_compressed(p->nr));
-			else
+
+				/* create a previous ce entry for this block of cache entries */
+				if (previous_name->len) {
+					p->previous_ce = mem_pool__ce_alloc(p->ce_mem_pool, previous_name->len);
+					p->previous_ce->ce_namelen = previous_name->len;
+					memcpy(p->previous_ce->name, previous_name->buf, previous_name->len);
+				}
+			} else {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size(mmap_size, p->nr));
-
-			p->mmap = mmap;
-			p->start_offset = src_offset;
-			if (previous_name) {
-				strbuf_addbuf(&p->previous_name_buf, previous_name);
-				p->previous_name = &p->previous_name_buf;
 			}
 
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
@@ -2102,7 +2109,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		} else
 			name = ondisk->name;
 
-		if (!previous_name) {
+		if (istate->version != 4) {
 			size_t len;
 
 			/* v3 and earlier */
@@ -2121,7 +2128,6 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");
 		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
-		strbuf_release(&p->previous_name_buf);
 		consumed += p->consumed;
 	}
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v4 5/5] read-cache: clean up casting and byte decoding
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
                     ` (3 preceding siblings ...)
  2018-09-11 23:26   ` [PATCH v4 4/5] read-cache.c: optimize reading index format v4 Ben Peart
@ 2018-09-11 23:26   ` Ben Peart
  2018-09-12 14:34   ` [PATCH v4 0/5] read-cache: speed up index load through parallelization Ben Peart
  5 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-11 23:26 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch does a clean up pass to minimize the casting required to work
with the memory mapped index (mmap).

It also makes the decoding of network byte order more consistent by using
get_be32() where possible.

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index d21ccb5e67..6220abc491 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1655,7 +1655,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1679,7 +1679,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1906,7 +1906,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 }
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
@@ -1916,14 +1916,14 @@ struct load_index_extensions
 	pthread_t pthread;
 #endif
 	struct index_state *istate;
-	void *mmap;
+	const char *mmap;
 	size_t mmap_size;
 	unsigned long src_offset;
 };
 
-static void *load_index_extensions(void *_data)
+static void *load_index_extensions(void *data)
 {
-	struct load_index_extensions *p = _data;
+	struct load_index_extensions *p = data;
 	unsigned long src_offset = p->src_offset;
 
 	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
@@ -1934,13 +1934,12 @@ static void *load_index_extensions(void *_data)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(p->mmap + src_offset + 4);
 		if (read_index_extension(p->istate,
-			(const char *)p->mmap + src_offset,
-			(char *)p->mmap + src_offset + 8,
+			p->mmap + src_offset,
+			p->mmap + src_offset + 8,
 			extsize) < 0) {
-			munmap(p->mmap, p->mmap_size);
+			munmap((void *)p->mmap, p->mmap_size);
 			die("index file corrupt");
 		}
 		src_offset += 8;
@@ -1955,7 +1954,7 @@ static void *load_index_extensions(void *_data)
  * from the memory mapped file and add them to the given index.
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
-			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
 			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
@@ -1966,7 +1965,7 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
 		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
@@ -1977,7 +1976,7 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 }
 
 static unsigned long load_all_cache_entries(struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	unsigned long consumed;
 
@@ -2008,7 +2007,7 @@ struct load_cache_entries_thread_data
 	struct index_state *istate;
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
-	void *mmap;
+	const char *mmap;
 	unsigned long start_offset;
 	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
@@ -2028,7 +2027,7 @@ static void *load_cache_entries_thread(void *_data)
 }
 
 static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_cache_entries_thread_data *data;
@@ -2097,7 +2096,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 				break;
 		}
 
-		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ondisk = (struct ondisk_cache_entry *)(mmap + src_offset);
 
 		/* On-disk flags are just 16 bits */
 		flags = get_be16(&ondisk->flags);
@@ -2145,8 +2144,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
@@ -2178,7 +2177,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -2238,11 +2237,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		p.src_offset = src_offset;
 		load_index_extensions(&p);
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
@@ -3265,7 +3264,7 @@ int should_validate_cache_entries(void)
 #define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size)
 {
 	/*
 	 * The end of index entries (EOIE) extension is guaranteed to be last
@@ -3276,7 +3275,6 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
 	 * <4-byte offset>
 	 * <20-byte hash>
 	 */
-	const char *mmap = mmap_;
 	const char *index, *eoie;
 	uint32_t extsize;
 	unsigned long offset, src_offset;
@@ -3329,8 +3327,7 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 
 		/* verify the extension size isn't so large it will wrap around */
 		if (src_offset + 8 + extsize < src_offset)
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v4 0/5] read-cache: speed up index load through parallelization
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
                     ` (4 preceding siblings ...)
  2018-09-11 23:26   ` [PATCH v4 5/5] read-cache: clean up casting and byte decoding Ben Peart
@ 2018-09-12 14:34   ` Ben Peart
  5 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-12 14:34 UTC (permalink / raw)
  To: Ben Peart, git; +Cc: gitster, pclouds, Ben Peart



On 9/11/2018 7:26 PM, Ben Peart wrote:
> This version of the patch merges in Duy's work to speed up index v4 decoding.
> I had to massage it a bit to get it to work with the multi-threading but its
> still largely his code. It helps a little (3%-4%) when the cache entry thread(s)
> take the longest and not when the index extensions loading is the long thread.
> 
> I also added a minor cleanup patch to minimize the casting required when
> working with the memory mapped index and other minor changes based on the
> feedback received.
> 
> Base Ref: v2.19.0
> Web-Diff: https://github.com/benpeart/git/commit/9d31d5fb20
> Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v4 && git checkout 9d31d5fb20
> 
> 

A bad merge (a mistake on my part, not a bug) means this is missing some 
of the changes from V3.  Please ignore it; I'll send an updated series to 
address this.

> ### Patches
> 
> Ben Peart (4):
>    eoie: add End of Index Entry (EOIE) extension
>    read-cache: load cache extensions on a worker thread
>    read-cache: speed up index load through parallelization
>    read-cache: clean up casting and byte decoding
> 
> Nguyễn Thái Ngọc Duy (1):
>    read-cache.c: optimize reading index format v4
> 
>   Documentation/config.txt                 |   6 +
>   Documentation/technical/index-format.txt |  23 +
>   config.c                                 |  18 +
>   config.h                                 |   1 +
>   read-cache.c                             | 581 +++++++++++++++++++----
>   5 files changed, 531 insertions(+), 98 deletions(-)
> 
> 
> base-commit: 1d4361b0f344188ab5eec6dcea01f61a3a3a1670
> 

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v5 0/5] read-cache: speed up index load through parallelization
  2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
                   ` (4 preceding siblings ...)
  2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
@ 2018-09-12 16:18 ` " Ben Peart
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
                     ` (4 more replies)
  5 siblings, 5 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

This version of the patch merges in Duy's work to speed up index v4 decoding.
I had to massage it a bit to get it to work with the multi-threading but it is
still largely his code. I also responded to Junio's feedback on initializing
copy_len to avoid compiler warnings.

I also added a minor cleanup patch to minimize the casting required when
working with the memory mapped index and other minor changes based on the
feedback received.

Base Ref: master
Web-Diff: https://github.com/benpeart/git/commit/dcf62005f8
Checkout: git fetch https://github.com/benpeart/git read-index-multithread-v5 && git checkout dcf62005f8


### Interdiff (v3..v5):

diff --git a/read-cache.c b/read-cache.c
index 8537a55750..c05e887fc9 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1655,7 +1655,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1679,7 +1679,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1721,33 +1721,6 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
 /*
  * Adjacent cache entries tend to share the leading paths, so it makes
  * sense to only store the differences in later entries.  In the v4
@@ -1768,15 +1741,18 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 	return (const char *)ep + 1 - cp_;
 }
 
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len = 0;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1796,21 +1772,50 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
+
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
 	if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+		len = strlen(name) + copy_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1897,7 +1902,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 }
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap, size_t mmap_size);
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
@@ -1907,14 +1912,14 @@ struct load_index_extensions
 	pthread_t pthread;
 #endif
 	struct index_state *istate;
-	void *mmap;
+	const char *mmap;
 	size_t mmap_size;
 	unsigned long src_offset;
 };
 
-static void *load_index_extensions(void *_data)
+static void *load_index_extensions(void *data)
 {
-	struct load_index_extensions *p = _data;
+	struct load_index_extensions *p = data;
 	unsigned long src_offset = p->src_offset;
 
 	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
@@ -1925,13 +1930,12 @@ static void *load_index_extensions(void *_data)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(p->mmap + src_offset + 4);
 		if (read_index_extension(p->istate,
-			(const char *)p->mmap + src_offset,
-			(char *)p->mmap + src_offset + 8,
+			p->mmap + src_offset,
+			p->mmap + src_offset + 8,
 			extsize) < 0) {
-			munmap(p->mmap, p->mmap_size);
+			munmap((void *)p->mmap, p->mmap_size);
 			die("index file corrupt");
 		}
 		src_offset += 8;
@@ -1946,8 +1950,8 @@ static void *load_index_extensions(void *_data)
  * from the memory mapped file and add them to the given index.
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
-			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
-			unsigned long start_offset, struct strbuf *previous_name)
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
+			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
 	unsigned long src_offset = start_offset;
@@ -1957,34 +1961,31 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
 	return src_offset - start_offset;
 }
 
 static unsigned long load_all_cache_entries(struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	unsigned long consumed;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size(mmap_size, istate->cache_nr));
 	}
 
 	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
-					0, istate->cache_nr, mmap, src_offset, previous_name);
-	strbuf_release(&previous_name_buf);
+					0, istate->cache_nr, mmap, src_offset, NULL);
 	return consumed;
 }
 
@@ -1993,7 +1994,7 @@ static unsigned long load_all_cache_entries(struct index_state *istate,
 /*
  * Mostly randomly chosen maximum thread counts: we
  * cap the parallelism to online_cpus() threads, and we want
- * to have at least 100000 cache entries per thread for it to
+ * to have at least 10000 cache entries per thread for it to
  * be worth starting a thread.
  */
 #define THREAD_COST		(10000)
@@ -2004,10 +2005,9 @@ struct load_cache_entries_thread_data
 	struct index_state *istate;
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
-	void *mmap;
+	const char *mmap;
 	unsigned long start_offset;
-	struct strbuf previous_name_buf;
-	struct strbuf *previous_name;
+	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
@@ -2020,12 +2020,12 @@ static void *load_cache_entries_thread(void *_data)
 	struct load_cache_entries_thread_data *p = _data;
 
 	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
-		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_ce);
 	return NULL;
 }
 
 static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_cache_entries_thread_data *data;
@@ -2067,20 +2067,23 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 			p->istate = istate;
 			p->offset = i;
 			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+			p->mmap = mmap;
+			p->start_offset = src_offset;
 
 			/* create a mem_pool for each thread */
-			if (istate->version == 4)
+			if (istate->version == 4) {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size_from_compressed(p->nr));
-			else
+
+				/* create a previous ce entry for this block of cache entries */
+				if (previous_name->len) {
+					p->previous_ce = mem_pool__ce_alloc(p->ce_mem_pool, previous_name->len);
+					p->previous_ce->ce_namelen = previous_name->len;
+					memcpy(p->previous_ce->name, previous_name->buf, previous_name->len);
+				}
+			} else {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size(mmap_size, p->nr));
-
-			p->mmap = mmap;
-			p->start_offset = src_offset;
-			if (previous_name) {
-				strbuf_addbuf(&p->previous_name_buf, previous_name);
-				p->previous_name = &p->previous_name_buf;
 			}
 
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
@@ -2091,7 +2094,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 				break;
 		}
 
-		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ondisk = (struct ondisk_cache_entry *)(mmap + src_offset);
 
 		/* On-disk flags are just 16 bits */
 		flags = get_be16(&ondisk->flags);
@@ -2103,7 +2106,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		} else
 			name = ondisk->name;
 
-		if (!previous_name) {
+		if (istate->version != 4) {
 			size_t len;
 
 			/* v3 and earlier */
@@ -2122,7 +2125,6 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");
 		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
-		strbuf_release(&p->previous_name_buf);
 		consumed += p->consumed;
 	}
 
@@ -2140,8 +2142,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
@@ -2173,7 +2175,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -2233,11 +2235,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		p.src_offset = src_offset;
 		load_index_extensions(&p);
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
@@ -3256,11 +3258,11 @@ int should_validate_cache_entries(void)
 	return validate_index_cache_entries;
 }
 
-#define EOIE_SIZE 24 /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
 #define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size)
 {
 	/*
 	 * The end of index entries (EOIE) extension is guaranteed to be last
@@ -3271,14 +3273,18 @@ static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
 	 * <4-byte offset>
 	 * <20-byte hash>
 	 */
-	const char *index, *eoie = (const char *)mmap + mmap_size - GIT_SHA1_RAWSZ - EOIE_SIZE_WITH_HEADER;
+	const char *index, *eoie;
 	uint32_t extsize;
 	unsigned long offset, src_offset;
 	unsigned char hash[GIT_MAX_RAWSZ];
 	git_hash_ctx c;
 
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
 	/* validate the extension signature */
-	index = eoie;
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
 	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
 		return 0;
 	index += sizeof(uint32_t);
@@ -3294,9 +3300,9 @@ static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
 	 * signature is after the index header and before the eoie extension.
 	 */
 	offset = get_be32(index);
-	if ((const char *)mmap + offset < (const char *)mmap + sizeof(struct cache_header))
+	if (mmap + offset < mmap + sizeof(struct cache_header))
 		return 0;
-	if ((const char *)mmap + offset >= eoie)
+	if (mmap + offset >= eoie)
 		return 0;
 	index += sizeof(uint32_t);
 
@@ -3319,20 +3325,19 @@ static unsigned long read_eoie_extension(void *mmap, size_t mmap_size)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 
 		/* verify the extension size isn't so large it will wrap around */
 		if (src_offset + 8 + extsize < src_offset)
 			return 0;
 
-		the_hash_algo->update_fn(&c, (const char *)mmap + src_offset, 8);
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
 
 		src_offset += 8;
 		src_offset += extsize;
 	}
 	the_hash_algo->final_fn(hash, &c);
-	if (hashcmp(hash, (unsigned char *)index))
+	if (hashcmp(hash, (const unsigned char *)index))
 		return 0;
 
 	/* Validate that the extension offsets returned us back to the eoie extension. */
diff --git a/t/README b/t/README
index 59015f7150..69c695ad8e 100644
--- a/t/README
+++ b/t/README
@@ -326,9 +326,6 @@ valid due to the addition of the EOIE extension.
 
 GIT_TEST_INDEX_THREADS=<boolean> forces multi-threaded loading of
 the index cache entries and extensions for the whole test suite.
-Currently tests 1, 4-9 in t1700-split-index.sh fail as they hard
-code SHA values for the index which are no longer valid due to the
-addition of the EOIE extension.
 
 Naming Tests
 ------------


### Patches

Ben Peart (4):
  eoie: add End of Index Entry (EOIE) extension
  read-cache: load cache extensions on a worker thread
  read-cache: load cache entries on worker threads
  read-cache: clean up casting and byte decoding

Nguyễn Thái Ngọc Duy (1):
  read-cache.c: optimize reading index format v4

 Documentation/config.txt                 |   6 +
 Documentation/technical/index-format.txt |  23 +
 config.c                                 |  18 +
 config.h                                 |   1 +
 read-cache.c                             | 579 +++++++++++++++++++----
 t/README                                 |   8 +
 t/t1700-split-index.sh                   |   1 +
 7 files changed, 538 insertions(+), 98 deletions(-)


base-commit: 29d9e3e2c47dd4b5053b0a98c891878d398463e3
-- 
2.18.0.windows.1



^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  2018-09-13 22:44     ` Junio C Hamano
  2018-09-15 10:02     ` Duy Nguyen
  2018-09-12 16:18   ` [PATCH v5 2/5] read-cache: load cache extensions on a worker thread Ben Peart
                     ` (3 subsequent siblings)
  4 siblings, 2 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 154 +++++++++++++++++++++--
 t/README                                 |   5 +
 t/t1700-split-index.sh                   |   1 +
 4 files changed, 175 insertions(+), 8 deletions(-)

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the beginning of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..858935f123 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+#endif
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	unsigned long offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2535,7 +2551,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2545,7 +2561,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2556,7 +2572,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2567,7 +2583,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.
+	 */
+	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2978,3 +3010,109 @@ int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *mmap = mmap_;
+	const char *index, *eoie;
+	uint32_t extsize;
+	unsigned long offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
+	/* validate the extension signature */
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if (mmap + offset < mmap + sizeof(struct cache_header))
+		return 0;
+	if (mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *               "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (hashcmp(hash, (const unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+#endif
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
diff --git a/t/README b/t/README
index 9028b47d92..d8754dd23a 100644
--- a/t/README
+++ b/t/README
@@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncommon pack-objects code
 path where deltas larger than this limit require extra memory
 allocation for bookkeeping.
 
+GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
+This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
+as they currently hard code SHA values for the index which are no longer
+valid due to the addition of the EOIE extension.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 39133bcbc8..f613dd72e3 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,6 +7,7 @@ test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
+export GIT_TEST_DISABLE_EOIE=true
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  2018-09-15 10:22     ` Duy Nguyen
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by loading
the cache extensions on a worker thread in parallel with loading the cache
entries.

In some cases, loading the extensions takes longer than loading the
cache entries so this patch utilizes the new EOIE to start the thread to
load the extensions before loading all the cache entries in parallel.

This is possible because the current extensions don't access the cache
entries in the index_state structure so are OK that they don't all exist
yet.

The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
extensions don't even get a pointer to the index so don't have access to the
cache entries.

CACHE_EXT_LINK only uses the index_state to initialize the split index.
CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
update and dirty flags.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files                Baseline         Parallel Extensions
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

Test w/1,000,000 files              Baseline         Parallel Extensions
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/config.txt |  6 +++
 config.c                 | 18 ++++++++
 config.h                 |  1 +
 read-cache.c             | 94 ++++++++++++++++++++++++++++++++--------
 4 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1c42364988..79f8296d9c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2391,6 +2391,12 @@ imap::
 	The configuration variables in the 'imap' section are described
 	in linkgit:git-imap-send[1].
 
+index.threads::
+	Specifies the number of threads to spawn when loading the index.
+	This is meant to reduce index load time on multiprocessor machines.
+	Specifying 0 or 'true' will cause Git to auto-detect the number of
+	CPUs and set the number of threads accordingly. Defaults to 'true'.
+
 index.version::
 	Specify the version with which new index files should be
 	initialized.  This does not affect existing repositories.
diff --git a/config.c b/config.c
index 9a0b10d4bc..9bd79fb165 100644
--- a/config.c
+++ b/config.c
@@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
 	return 0;
 }
 
+/*
+ * You can disable multi-threaded code by setting index.threads
+ * to 'false' (or 1)
+ */
+int git_config_get_index_threads(void)
+{
+	int is_bool, val;
+
+	if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
+		if (is_bool)
+			return val ? 0 : 1;
+		else
+			return val;
+	}
+
+	return 0; /* auto-detect */
+}
+
 NORETURN
 void git_die_config_linenr(const char *key, const char *filename, int linenr)
 {
diff --git a/config.h b/config.h
index ab46e0165d..a06027e69b 100644
--- a/config.h
+++ b/config.h
@@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
 extern int git_config_get_split_index(void);
 extern int git_config_get_max_percent_split_change(void);
 extern int git_config_get_fsmonitor(void);
+extern int git_config_get_index_threads(void);
 
 /* This dies if the configured or default date is in the future */
 extern int git_config_get_expiry(const char *key, const char **output);
diff --git a/read-cache.c b/read-cache.c
index 858935f123..b203eebb44 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,10 @@
 #include "split-index.h"
 #include "utf8.h"
 #include "fsmonitor.h"
+#ifndef NO_PTHREADS
+#include <pthread.h>
+#include <thread-utils.h>
+#endif
 
 /* Mask for the name length in ce_flags in the on-disk index */
 
@@ -1898,6 +1902,46 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
+struct load_index_extensions
+{
+#ifndef NO_PTHREADS
+	pthread_t pthread;
+#endif
+	struct index_state *istate;
+	void *mmap;
+	size_t mmap_size;
+	unsigned long src_offset;
+};
+
+static void *load_index_extensions(void *_data)
+{
+	struct load_index_extensions *p = _data;
+	unsigned long src_offset = p->src_offset;
+
+	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+		if (read_index_extension(p->istate,
+			(const char *)p->mmap + src_offset,
+			(char *)p->mmap + src_offset + 8,
+			extsize) < 0) {
+			munmap(p->mmap, p->mmap_size);
+			die("index file corrupt");
+		}
+		src_offset += 8;
+		src_offset += extsize;
+	}
+
+	return NULL;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1908,6 +1952,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	void *mmap;
 	size_t mmap_size;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_index_extensions p = { 0 };
+	unsigned long extension_offset = 0;
+#ifndef NO_PTHREADS
+	int nr_threads;
+#endif
 
 	if (istate->initialized)
 		return istate->cache_nr;
@@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
+	p.istate = istate;
+	p.mmap = mmap;
+	p.mmap_size = mmap_size;
+
+#ifndef NO_PTHREADS
+	nr_threads = git_config_get_index_threads();
+	if (!nr_threads)
+		nr_threads = online_cpus();
+
+	if (nr_threads >= 2) {
+		extension_offset = read_eoie_extension(mmap, mmap_size);
+		if (extension_offset) {
+			/* create a thread to load the index extensions */
+			p.src_offset = extension_offset;
+			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
+				die(_("unable to create load_index_extensions_thread"));
+		}
+	}
+#endif
+
 	if (istate->version == 4) {
 		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
@@ -1970,23 +2039,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
-	while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
-		/* After an array of active_nr index entries,
-		 * there can be arbitrary number of extended
-		 * sections, each of which is prefixed with
-		 * extension name (4-byte) and section length
-		 * in 4-byte network byte order.
-		 */
-		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
-		if (read_index_extension(istate,
-					 (const char *) mmap + src_offset,
-					 (char *) mmap + src_offset + 8,
-					 extsize) < 0)
-			goto unmap;
-		src_offset += 8;
-		src_offset += extsize;
+	/* if we created a thread, join it otherwise load the extensions on the primary thread */
+#ifndef NO_PTHREADS
+	if (extension_offset && pthread_join(p.pthread, NULL))
+		die(_("unable to join load_index_extensions_thread"));
+#endif
+	if (!extension_offset) {
+		p.src_offset = src_offset;
+		load_index_extensions(&p);
 	}
 	munmap(mmap, mmap_size);
 	return istate->cache_nr;
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-09-12 16:18   ` [PATCH v5 2/5] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  2018-09-15 10:31     ` Duy Nguyen
                       ` (2 more replies)
  2018-09-12 16:18   ` [PATCH v5 4/5] read-cache.c: optimize reading index format v4 Ben Peart
  2018-09-12 16:18   ` [PATCH v5 5/5] read-cache: clean up casting and byte decoding Ben Peart
  4 siblings, 3 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index file
tracking the offset and (for V4 indexes) expanding the name. It creates a
thread to process each block of entries as it comes to them.

I used p0002-read-cache.sh to generate some performance data:

Test w/100,000 files                Baseline         Parallel entries
---------------------------------------------------------------------------
read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%

Test w/1,000,000 files              Baseline         Parallel entries
------------------------------------------------------------------------------
read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 242 +++++++++++++++++++++++++++++++++++++++++++++------
 t/README     |   3 +
 2 files changed, 217 insertions(+), 28 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index b203eebb44..880f627b4c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1942,20 +1942,212 @@ static void *load_index_extensions(void *_data)
 	return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			unsigned long start_offset, struct strbuf *previous_name)
+{
+	int i;
+	unsigned long src_offset = start_offset;
+
+	for (i = offset; i < offset + nr; i++) {
+		struct ondisk_cache_entry *disk_ce;
+		struct cache_entry *ce;
+		unsigned long consumed;
+
+		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		set_index_entry(istate, i, ce);
+
+		src_offset += consumed;
+	}
+	return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	unsigned long consumed;
+
+	if (istate->version == 4) {
+		previous_name = &previous_name_buf;
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
+		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+				estimate_cache_size(mmap_size, istate->cache_nr));
+	}
+
+	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+					0, istate->cache_nr, mmap, src_offset, previous_name);
+	strbuf_release(&previous_name_buf);
+	return consumed;
+}
+
+#ifndef NO_PTHREADS
+
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+#define THREAD_COST		(10000)
+
+struct load_cache_entries_thread_data
+{
+	pthread_t pthread;
+	struct index_state *istate;
+	struct mem_pool *ce_mem_pool;
+	int offset, nr;
+	void *mmap;
+	unsigned long start_offset;
+	struct strbuf previous_name_buf;
+	struct strbuf *previous_name;
+	unsigned long consumed;	/* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+	struct load_cache_entries_thread_data *p = _data;
+
+	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+	return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
+			void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+	struct load_cache_entries_thread_data *data;
+	int ce_per_thread;
+	unsigned long consumed;
+	int i, thread;
+
+	/* a little sanity checking */
+	if (istate->name_hash_initialized)
+		BUG("the name hash isn't thread safe");
+
+	mem_pool_init(&istate->ce_mem_pool, 0);
+	if (istate->version == 4)
+		previous_name = &previous_name_buf;
+	else
+		previous_name = NULL;
+
+	ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+	data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+	/*
+	 * Loop through index entries starting a thread for every ce_per_thread
+	 * entries. Exit the loop when we've created the final thread (no need
+	 * to parse the remaining entries).
+	 */
+	consumed = thread = 0;
+	for (i = 0; ; i++) {
+		struct ondisk_cache_entry *ondisk;
+		const char *name;
+		unsigned int flags;
+
+		/*
+		 * we've reached the beginning of a block of cache entries,
+		 * kick off a thread to process them
+		 */
+		if (i % ce_per_thread == 0) {
+			struct load_cache_entries_thread_data *p = &data[thread];
+
+			p->istate = istate;
+			p->offset = i;
+			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+
+			/* create a mem_pool for each thread */
+			if (istate->version == 4)
+				mem_pool_init(&p->ce_mem_pool,
+					estimate_cache_size_from_compressed(p->nr));
+			else
+				mem_pool_init(&p->ce_mem_pool,
+					estimate_cache_size(mmap_size, p->nr));
+
+			p->mmap = mmap;
+			p->start_offset = src_offset;
+			if (previous_name) {
+				strbuf_addbuf(&p->previous_name_buf, previous_name);
+				p->previous_name = &p->previous_name_buf;
+			}
+
+			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
+				die("unable to create load_cache_entries_thread");
+
+			/* exit the loop when we've created the last thread */
+			if (++thread == nr_threads)
+				break;
+		}
+
+		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+
+		/* On-disk flags are just 16 bits */
+		flags = get_be16(&ondisk->flags);
+
+		if (flags & CE_EXTENDED) {
+			struct ondisk_cache_entry_extended *ondisk2;
+			ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
+			name = ondisk2->name;
+		} else
+			name = ondisk->name;
+
+		if (!previous_name) {
+			size_t len;
+
+			/* v3 and earlier */
+			len = flags & CE_NAMEMASK;
+			if (len == CE_NAMEMASK)
+				len = strlen(name);
+			src_offset += (flags & CE_EXTENDED) ?
+				ondisk_cache_entry_extended_size(len) :
+				ondisk_cache_entry_size(len);
+		} else
+			src_offset += (name - ((char *)ondisk)) + expand_name_field(previous_name, name);
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		struct load_cache_entries_thread_data *p = data + i;
+		if (pthread_join(p->pthread, NULL))
+			die("unable to join load_cache_entries_thread");
+		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
+		strbuf_release(&p->previous_name_buf);
+		consumed += p->consumed;
+	}
+
+	free(data);
+	strbuf_release(&previous_name_buf);
+
+	return consumed;
+}
+
+#endif
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
-	int fd, i;
+	int fd;
 	struct stat st;
 	unsigned long src_offset;
 	struct cache_header *hdr;
 	void *mmap;
 	size_t mmap_size;
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
 #ifndef NO_PTHREADS
-	int nr_threads;
+	int cpus, nr_threads;
 #endif
 
 	if (istate->initialized)
@@ -1997,10 +2189,20 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	p.mmap = mmap;
 	p.mmap_size = mmap_size;
 
+	src_offset = sizeof(*hdr);
+
 #ifndef NO_PTHREADS
 	nr_threads = git_config_get_index_threads();
-	if (!nr_threads)
-		nr_threads = online_cpus();
+	if (!nr_threads) {
+		cpus = online_cpus();
+		nr_threads = istate->cache_nr / THREAD_COST;
+		if (nr_threads > cpus)
+			nr_threads = cpus;
+	}
+
+	/* enable testing with fewer than default minimum of entries */
+	if (istate->cache_nr > 1 && nr_threads < 3 && git_env_bool("GIT_TEST_INDEX_THREADS", 0))
+		nr_threads = 3;
 
 	if (nr_threads >= 2) {
 		extension_offset = read_eoie_extension(mmap, mmap_size);
@@ -2009,33 +2211,17 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 			p.src_offset = extension_offset;
 			if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
 				die(_("unable to create load_index_extensions_thread"));
+			nr_threads--;
 		}
 	}
+	if (nr_threads >= 2)
+		src_offset += load_cache_entries_threaded(nr_threads, istate, mmap, mmap_size, src_offset);
+	else
+		src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
+#else
+	src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
 #endif
 
-	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size_from_compressed(istate->cache_nr));
-	} else {
-		previous_name = NULL;
-		mem_pool_init(&istate->ce_mem_pool,
-			      estimate_cache_size(mmap_size, istate->cache_nr));
-	}
-
-	src_offset = sizeof(*hdr);
-	for (i = 0; i < istate->cache_nr; i++) {
-		struct ondisk_cache_entry *disk_ce;
-		struct cache_entry *ce;
-		unsigned long consumed;
-
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
-		set_index_entry(istate, i, ce);
-
-		src_offset += consumed;
-	}
-	strbuf_release(&previous_name_buf);
 	istate->timestamp.sec = st.st_mtime;
 	istate->timestamp.nsec = ST_MTIME_NSEC(st);
 
diff --git a/t/README b/t/README
index d8754dd23a..69c695ad8e 100644
--- a/t/README
+++ b/t/README
@@ -324,6 +324,9 @@ This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
 as they currently hard code SHA values for the index which are no longer
 valid due to the addition of the EOIE extension.
 
+GIT_TEST_INDEX_THREADS=<boolean> forces multi-threaded loading of
+the index cache entries and extensions for the whole test suite.
+
 Naming Tests
 ------------
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v5 4/5] read-cache.c: optimize reading index format v4
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
                     ` (2 preceding siblings ...)
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  2018-09-12 16:18   ` [PATCH v5 5/5] read-cache: clean up casting and byte decoding Ben Peart
  4 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart

From: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Index format v4 requires some more computation to assemble a path
based on a previous one. The current code is not very efficient
because

 - it doubles the memory copy: we assemble the final path in a temporary
   buffer first before putting it back into a cache_entry

 - strbuf_remove() in expand_name_field() is not exactly a good fit
   for stripping a part at the end, _setlen() would do the same job
   and is much cheaper.

 - the open-coded loop to find the end of the string in
   expand_name_field() can't beat an optimized strlen()

This patch avoids the temporary buffer and writes directly to the new
cache_entry, which addresses the first two points. The last point
could also be avoided if the total string length fits in the first 12
bits of ce_flags, if not we fall back to strlen().

Running "test-tool read-cache 100" on webkit.git (275k files), reading
v2 only takes 4.226 seconds, while v4 takes 5.711 seconds, 35% more
time. The patch reduces read time on v4 to 4.319 seconds.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 read-cache.c | 132 ++++++++++++++++++++++++++-------------------------
 1 file changed, 67 insertions(+), 65 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 880f627b4c..40dc4723b2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1721,33 +1721,6 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
-						   struct ondisk_cache_entry *ondisk,
-						   unsigned int flags,
-						   const char *name,
-						   size_t len)
-{
-	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
-
-	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
-	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
-	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
-	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
-	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
-	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
-	ce->ce_mode  = get_be32(&ondisk->mode);
-	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
-	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
-	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
-	ce->ce_flags = flags & ~CE_NAMEMASK;
-	ce->ce_namelen = len;
-	ce->index = 0;
-	hashcpy(ce->oid.hash, ondisk->sha1);
-	memcpy(ce->name, name, len);
-	ce->name[len] = '\0';
-	return ce;
-}
-
 /*
  * Adjacent cache entries tend to share the leading paths, so it makes
  * sense to only store the differences in later entries.  In the v4
@@ -1762,22 +1735,24 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 
 	if (name->len < len)
 		die("malformed name field in the index");
-	strbuf_remove(name, name->len - len, len);
-	for (ep = cp; *ep; ep++)
-		; /* find the end */
+	strbuf_setlen(name, name->len - len);
+	ep = cp + strlen((const char *)cp);
 	strbuf_add(name, cp, ep - cp);
 	return (const char *)ep + 1 - cp_;
 }
 
-static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
+static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
+					    unsigned int version,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
-					    struct strbuf *previous_name)
+					    const struct cache_entry *previous_ce)
 {
 	struct cache_entry *ce;
 	size_t len;
 	const char *name;
 	unsigned int flags;
+	size_t copy_len = 0;
+	int expand_name_field = version == 4;
 
 	/* On-disk flags are just 16 bits */
 	flags = get_be16(&ondisk->flags);
@@ -1797,21 +1772,50 @@ static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 	else
 		name = ondisk->name;
 
-	if (!previous_name) {
-		/* v3 and earlier */
-		if (len == CE_NAMEMASK)
-			len = strlen(name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
+	if (expand_name_field) {
+		const unsigned char *cp = (const unsigned char *)name;
+		size_t strip_len, previous_len;
 
-		*ent_size = ondisk_ce_size(ce);
-	} else {
-		unsigned long consumed;
-		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
-					     previous_name->buf,
-					     previous_name->len);
+		previous_len = previous_ce ? previous_ce->ce_namelen : 0;
+		strip_len = decode_varint(&cp);
+		if (previous_len < strip_len) {
+			if (previous_ce)
+				die(_("malformed name field in the index, near path '%s'"),
+				    previous_ce->name);
+			else
+				die(_("malformed name field in the index in the first path"));
+		}
+		copy_len = previous_len - strip_len;
+		name = (const char *)cp;
+	}
+
+	if (len == CE_NAMEMASK)
+		len = strlen(name) + copy_len;
+
+	ce = mem_pool__ce_alloc(ce_mem_pool, len);
+
+	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
+	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
+	ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
+	ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
+	ce->ce_stat_data.sd_dev   = get_be32(&ondisk->dev);
+	ce->ce_stat_data.sd_ino   = get_be32(&ondisk->ino);
+	ce->ce_mode  = get_be32(&ondisk->mode);
+	ce->ce_stat_data.sd_uid   = get_be32(&ondisk->uid);
+	ce->ce_stat_data.sd_gid   = get_be32(&ondisk->gid);
+	ce->ce_stat_data.sd_size  = get_be32(&ondisk->size);
+	ce->ce_flags = flags & ~CE_NAMEMASK;
+	ce->ce_namelen = len;
+	ce->index = 0;
+	hashcpy(ce->oid.hash, ondisk->sha1);
 
-		*ent_size = (name - ((char *)ondisk)) + consumed;
+	if (expand_name_field) {
+		memcpy(ce->name, previous_ce->name, copy_len);
+		memcpy(ce->name + copy_len, name, len + 1 - copy_len);
+		*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
+	} else {
+		memcpy(ce->name, name, len + 1);
+		*ent_size = ondisk_ce_size(ce);
 	}
 	return ce;
 }
@@ -1948,7 +1952,7 @@ static void *load_index_extensions(void *_data)
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
 			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
-			unsigned long start_offset, struct strbuf *previous_name)
+			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
 	unsigned long src_offset = start_offset;
@@ -1959,10 +1963,11 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
+		previous_ce = ce;
 	}
 	return src_offset - start_offset;
 }
@@ -1970,22 +1975,18 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 static unsigned long load_all_cache_entries(struct index_state *istate,
 			void *mmap, size_t mmap_size, unsigned long src_offset)
 {
-	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	unsigned long consumed;
 
 	if (istate->version == 4) {
-		previous_name = &previous_name_buf;
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size_from_compressed(istate->cache_nr));
 	} else {
-		previous_name = NULL;
 		mem_pool_init(&istate->ce_mem_pool,
 				estimate_cache_size(mmap_size, istate->cache_nr));
 	}
 
 	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
-					0, istate->cache_nr, mmap, src_offset, previous_name);
-	strbuf_release(&previous_name_buf);
+					0, istate->cache_nr, mmap, src_offset, NULL);
 	return consumed;
 }
 
@@ -2007,8 +2008,7 @@ struct load_cache_entries_thread_data
 	int offset, nr;
 	void *mmap;
 	unsigned long start_offset;
-	struct strbuf previous_name_buf;
-	struct strbuf *previous_name;
+	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
 };
 
@@ -2021,7 +2021,7 @@ static void *load_cache_entries_thread(void *_data)
 	struct load_cache_entries_thread_data *p = _data;
 
 	p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
-		p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+		p->offset, p->nr, p->mmap, p->start_offset, p->previous_ce);
 	return NULL;
 }
 
@@ -2068,20 +2068,23 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 			p->istate = istate;
 			p->offset = i;
 			p->nr = ce_per_thread < istate->cache_nr - i ? ce_per_thread : istate->cache_nr - i;
+			p->mmap = mmap;
+			p->start_offset = src_offset;
 
 			/* create a mem_pool for each thread */
-			if (istate->version == 4)
+			if (istate->version == 4) {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size_from_compressed(p->nr));
-			else
+
+				/* create a previous ce entry for this block of cache entries */
+				if (previous_name->len) {
+					p->previous_ce = mem_pool__ce_alloc(p->ce_mem_pool, previous_name->len);
+					p->previous_ce->ce_namelen = previous_name->len;
+					memcpy(p->previous_ce->name, previous_name->buf, previous_name->len);
+				}
+			} else {
 				mem_pool_init(&p->ce_mem_pool,
 					estimate_cache_size(mmap_size, p->nr));
-
-			p->mmap = mmap;
-			p->start_offset = src_offset;
-			if (previous_name) {
-				strbuf_addbuf(&p->previous_name_buf, previous_name);
-				p->previous_name = &p->previous_name_buf;
 			}
 
 			if (pthread_create(&p->pthread, NULL, load_cache_entries_thread, p))
@@ -2104,7 +2107,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		} else
 			name = ondisk->name;
 
-		if (!previous_name) {
+		if (istate->version != 4) {
 			size_t len;
 
 			/* v3 and earlier */
@@ -2123,7 +2126,6 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 		if (pthread_join(p->pthread, NULL))
 			die("unable to join load_cache_entries_thread");
 		mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
-		strbuf_release(&p->previous_name_buf);
 		consumed += p->consumed;
 	}
 
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* [PATCH v5 5/5] read-cache: clean up casting and byte decoding
  2018-09-12 16:18 ` [PATCH v5 " Ben Peart
                     ` (3 preceding siblings ...)
  2018-09-12 16:18   ` [PATCH v5 4/5] read-cache.c: optimize reading index format v4 Ben Peart
@ 2018-09-12 16:18   ` Ben Peart
  4 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-12 16:18 UTC (permalink / raw)
  To: git; +Cc: gitster, pclouds, Ben Peart, Ben Peart

This patch does a clean up pass to minimize the casting required to work
with the memory mapped index (mmap).

It also makes the decoding of network byte order more consistent by using
get_be32() where possible.

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 read-cache.c | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 40dc4723b2..c05e887fc9 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1655,7 +1655,7 @@ int verify_index_checksum;
 /* Allow fsck to force verification of the cache entry order. */
 int verify_ce_order;
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr(const struct cache_header *hdr, unsigned long size)
 {
 	git_hash_ctx c;
 	unsigned char hash[GIT_MAX_RAWSZ];
@@ -1679,7 +1679,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size)
 }
 
 static int read_index_extension(struct index_state *istate,
-				const char *ext, void *data, unsigned long sz)
+				const char *ext, const char *data, unsigned long sz)
 {
 	switch (CACHE_EXT(ext)) {
 	case CACHE_EXT_TREE:
@@ -1902,7 +1902,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 }
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size);
 #endif
 static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
 
@@ -1912,14 +1912,14 @@ struct load_index_extensions
 	pthread_t pthread;
 #endif
 	struct index_state *istate;
-	void *mmap;
+	const char *mmap;
 	size_t mmap_size;
 	unsigned long src_offset;
 };
 
-static void *load_index_extensions(void *_data)
+static void *load_index_extensions(void *data)
 {
-	struct load_index_extensions *p = _data;
+	struct load_index_extensions *p = data;
 	unsigned long src_offset = p->src_offset;
 
 	while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
@@ -1930,13 +1930,12 @@ static void *load_index_extensions(void *_data)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(p->mmap + src_offset + 4);
 		if (read_index_extension(p->istate,
-			(const char *)p->mmap + src_offset,
-			(char *)p->mmap + src_offset + 8,
+			p->mmap + src_offset,
+			p->mmap + src_offset + 8,
 			extsize) < 0) {
-			munmap(p->mmap, p->mmap_size);
+			munmap((void *)p->mmap, p->mmap_size);
 			die("index file corrupt");
 		}
 		src_offset += 8;
@@ -1951,7 +1950,7 @@ static void *load_index_extensions(void *_data)
  * from the memory mapped file and add them to the given index.
  */
 static unsigned long load_cache_entry_block(struct index_state *istate,
-			struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+			struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
 			unsigned long start_offset, const struct cache_entry *previous_ce)
 {
 	int i;
@@ -1962,7 +1961,7 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 		struct cache_entry *ce;
 		unsigned long consumed;
 
-		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
 		ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
 		set_index_entry(istate, i, ce);
 
@@ -1973,7 +1972,7 @@ static unsigned long load_cache_entry_block(struct index_state *istate,
 }
 
 static unsigned long load_all_cache_entries(struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	unsigned long consumed;
 
@@ -2006,7 +2005,7 @@ struct load_cache_entries_thread_data
 	struct index_state *istate;
 	struct mem_pool *ce_mem_pool;
 	int offset, nr;
-	void *mmap;
+	const char *mmap;
 	unsigned long start_offset;
 	struct cache_entry *previous_ce;
 	unsigned long consumed;	/* return # of bytes in index file processed */
@@ -2026,7 +2025,7 @@ static void *load_cache_entries_thread(void *_data)
 }
 
 static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
-			void *mmap, size_t mmap_size, unsigned long src_offset)
+			const char *mmap, size_t mmap_size, unsigned long src_offset)
 {
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	struct load_cache_entries_thread_data *data;
@@ -2095,7 +2094,7 @@ static unsigned long load_cache_entries_threaded(int nr_threads, struct index_st
 				break;
 		}
 
-		ondisk = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+		ondisk = (struct ondisk_cache_entry *)(mmap + src_offset);
 
 		/* On-disk flags are just 16 bits */
 		flags = get_be16(&ondisk->flags);
@@ -2143,8 +2142,8 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	int fd;
 	struct stat st;
 	unsigned long src_offset;
-	struct cache_header *hdr;
-	void *mmap;
+	const struct cache_header *hdr;
+	const char *mmap;
 	size_t mmap_size;
 	struct load_index_extensions p = { 0 };
 	unsigned long extension_offset = 0;
@@ -2176,7 +2175,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		die_errno("unable to map index file");
 	close(fd);
 
-	hdr = mmap;
+	hdr = (const struct cache_header *)mmap;
 	if (verify_hdr(hdr, mmap_size) < 0)
 		goto unmap;
 
@@ -2236,11 +2235,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		p.src_offset = src_offset;
 		load_index_extensions(&p);
 	}
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	return istate->cache_nr;
 
 unmap:
-	munmap(mmap, mmap_size);
+	munmap((void *)mmap, mmap_size);
 	die("index file corrupt");
 }
 
@@ -3263,7 +3262,7 @@ int should_validate_cache_entries(void)
 #define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
 
 #ifndef NO_PTHREADS
-static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+static unsigned long read_eoie_extension(const char *mmap, size_t mmap_size)
 {
 	/*
 	 * The end of index entries (EOIE) extension is guaranteed to be last
@@ -3274,7 +3273,6 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
 	 * <4-byte offset>
 	 * <20-byte hash>
 	 */
-	const char *mmap = mmap_;
 	const char *index, *eoie;
 	uint32_t extsize;
 	unsigned long offset, src_offset;
@@ -3327,8 +3325,7 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
 		 * in 4-byte network byte order.
 		 */
 		uint32_t extsize;
-		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
-		extsize = ntohl(extsize);
+		extsize = get_be32(mmap + src_offset + 4);
 
 		/* verify the extension size isn't so large it will wrap around */
 		if (src_offset + 8 + extsize < src_offset)
-- 
2.18.0.windows.1


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
@ 2018-09-13 22:44     ` Junio C Hamano
  2018-09-15 10:02     ` Duy Nguyen
  1 sibling, 0 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-09-13 22:44 UTC (permalink / raw)
  To: Ben Peart; +Cc: git, pclouds, Ben Peart

Ben Peart <benpeart@microsoft.com> writes:

> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index 39133bcbc8..f613dd72e3 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>  # We need total control of index splitting here
>  sane_unset GIT_TEST_SPLIT_INDEX
>  sane_unset GIT_FSMONITOR_TEST
> +export GIT_TEST_DISABLE_EOIE=true
>  
>  test_expect_success 'enable split index' '
>  	git config splitIndex.maxPercentChange 100 &&

It is safer to squash the following in; we may want to revisit the
decision test-lint makes on this issue later, though.

-- >8 --
Subject: [PATCH] SQUASH???

http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#export

specifies how "export name[=word]" ought to work, but because
writing "name=word; export name" is not so much more cumbersome
and some older shells that do not understand the former do grok
the latter.  test-lint also recommends spelling it this way.
---
 t/t1700-split-index.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index f613dd72e3..dab97c2187 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,7 +7,7 @@ test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
-export GIT_TEST_DISABLE_EOIE=true
+GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
-- 
2.19.0


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
  2018-09-13 22:44     ` Junio C Hamano
@ 2018-09-15 10:02     ` Duy Nguyen
  2018-09-17 14:54       ` Ben Peart
  1 sibling, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-15 10:02 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> The End of Index Entry (EOIE) is used to locate the end of the variable
> length index entries and the beginning of the extensions. Code can take
> advantage of this to quickly locate the index extensions without having
> to parse through all of the index entries.
>
> Because it must be able to be loaded before the variable length cache
> entries and other index extensions, this extension must be written last.
> The signature for this extension is { 'E', 'O', 'I', 'E' }.
>
> The extension consists of:
>
> - 32-bit offset to the end of the index entries
>
> - 160-bit SHA-1 over the extension types and their sizes (but not
> their contents).  E.g. if we have "TREE" extension that is N-bytes
> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> then the hash would be:
>
> SHA-1("TREE" + <binary representation of N> +
>         "REUC" + <binary representation of M>)
>
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---
>  Documentation/technical/index-format.txt |  23 ++++
>  read-cache.c                             | 154 +++++++++++++++++++++--
>  t/README                                 |   5 +
>  t/t1700-split-index.sh                   |   1 +
>  4 files changed, 175 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
> index db3572626b..6bc2d90f7f 100644
> --- a/Documentation/technical/index-format.txt
> +++ b/Documentation/technical/index-format.txt
> @@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
>
>    - An ewah bitmap, the n-th bit indicates whether the n-th index entry
>      is not CE_FSMONITOR_VALID.
> +
> +== End of Index Entry
> +
> +  The End of Index Entry (EOIE) is used to locate the end of the variable
> +  length index entries and the beginning of the extensions. Code can take
> +  advantage of this to quickly locate the index extensions without having
> +  to parse through all of the index entries.
> +
> +  Because it must be able to be loaded before the variable length cache
> +  entries and other index extensions, this extension must be written last.
> +  The signature for this extension is { 'E', 'O', 'I', 'E' }.
> +
> +  The extension consists of:
> +
> +  - 32-bit offset to the end of the index entries
> +
> +  - 160-bit SHA-1 over the extension types and their sizes (but not
> +       their contents).  E.g. if we have "TREE" extension that is N-bytes
> +       long, "REUC" extension that is M-bytes long, followed by "EOIE",
> +       then the hash would be:
> +
> +       SHA-1("TREE" + <binary representation of N> +
> +               "REUC" + <binary representation of M>)
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..858935f123 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -43,6 +43,7 @@
>  #define CACHE_EXT_LINK 0x6c696e6b        /* "link" */
>  #define CACHE_EXT_UNTRACKED 0x554E5452   /* "UNTR" */
>  #define CACHE_EXT_FSMONITOR 0x46534D4E   /* "FSMN" */
> +#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945 /* "EOIE" */
>
>  /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
>  #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
> @@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
>         case CACHE_EXT_FSMONITOR:
>                 read_fsmonitor_extension(istate, data, sz);
>                 break;
> +       case CACHE_EXT_ENDOFINDEXENTRIES:
> +               /* already handled in do_read_index() */
> +               break;

Perhaps catch this extension when it's not written at the end (e.g. by
some other git implementation) and warn.

>         default:
>                 if (*ext < 'A' || 'Z' < *ext)
>                         return error("index uses %.4s extension, which we do not understand",
> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>         return ondisk_size + entries * per_entry;
>  }
>
> +#ifndef NO_PTHREADS
> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
> +#endif

Keep functions unconditionally built as much as possible. I don't see
why this read_eoie_extension() must be built only on multithread
platforms.

> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
> +
>  /* remember to discard_cache() before reading a different cache! */
>  int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  {
> @@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
>         return 0;
>  }
>
> -static int write_index_ext_header(git_hash_ctx *context, int fd,
> -                                 unsigned int ext, unsigned int sz)
> +static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
> +                                 int fd, unsigned int ext, unsigned int sz)
>  {
>         ext = htonl(ext);
>         sz = htonl(sz);
> +       if (eoie_context) {
> +               the_hash_algo->update_fn(eoie_context, &ext, 4);
> +               the_hash_algo->update_fn(eoie_context, &sz, 4);
> +       }
>         return ((ce_write(context, fd, &ext, 4) < 0) ||
>                 (ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
>  }
> @@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>  {
>         uint64_t start = getnanotime();
>         int newfd = tempfile->fd;
> -       git_hash_ctx c;
> +       git_hash_ctx c, eoie_c;
>         struct cache_header hdr;
>         int i, err = 0, removed, extended, hdr_version;
>         struct cache_entry **cache = istate->cache;
> @@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>         struct ondisk_cache_entry_extended ondisk;
>         struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>         int drop_cache_tree = istate->drop_cache_tree;
> +       unsigned long offset;
>
>         for (i = removed = extended = 0; i < entries; i++) {
>                 if (cache[i]->ce_flags & CE_REMOVE)
> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 return err;
>
>         /* Write extension data here */
> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> +       the_hash_algo->init_fn(&eoie_c);

Don't write (or even calculate to write it) unless it's needed. Which
means only do this when parallel reading is enabled and the index size
large enough, or when a test variable is set so you can force writing
this extension.

I briefly wondered if we should continue writing the extension if it's
already written. This way you can manually enable it with "git
update-index". But I don't think it's worth the complexity.

>         if (!strip_extensions && istate->split_index) {
>                 struct strbuf sb = STRBUF_INIT;
>
>                 err = write_link_extension(&sb, istate) < 0 ||
> -                       write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
> +                       write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
>                                                sb.len) < 0 ||
>                         ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2535,7 +2551,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 cache_tree_write(&sb, istate->cache_tree);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
>                 if (err)
> @@ -2545,7 +2561,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 resolve_undo_write(&sb, istate->resolve_undo);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
>                                              sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2556,7 +2572,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 write_untracked_extension(&sb, istate->untracked);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
>                                              sb.len) < 0 ||
>                         ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2567,7 +2583,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 write_fsmonitor_extension(&sb, istate);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
> +                       || ce_write(&c, newfd, sb.buf, sb.len) < 0;
> +               strbuf_release(&sb);
> +               if (err)
> +                       return -1;
> +       }
> +
> +       /*
> +        * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
> +        * so that it can be found and processed before all the index entries are
> +        * read.
> +        */
> +       if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
> +               struct strbuf sb = STRBUF_INIT;
> +
> +               write_eoie_extension(&sb, &eoie_c, offset);
> +               err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
>                 if (err)
> @@ -2978,3 +3010,109 @@ int should_validate_cache_entries(void)
>
>         return validate_index_cache_entries;
>  }
> +
> +#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
> +
> +#ifndef NO_PTHREADS
> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
> +{
> +       /*
> +        * The end of index entries (EOIE) extension is guaranteed to be last
> +        * so that it can be found by scanning backwards from the EOF.
> +        *
> +        * "EOIE"
> +        * <4-byte length>
> +        * <4-byte offset>
> +        * <20-byte hash>
> +        */
> +       const char *mmap = mmap_;
> +       const char *index, *eoie;
> +       uint32_t extsize;
> +       unsigned long offset, src_offset;
> +       unsigned char hash[GIT_MAX_RAWSZ];
> +       git_hash_ctx c;
> +
> +       /* ensure we have an index big enough to contain an EOIE extension */
> +       if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
> +               return 0;

All these "return 0" indicate an error in the EOIE extension. You
probably want to print some warning (much easier to track down why
parallel reading does not happen).

> +
> +       /* validate the extension signature */
> +       index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
> +       if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /* validate the extension size */
> +       extsize = get_be32(index);
> +       if (extsize != EOIE_SIZE)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /*
> +        * Validate the offset we're going to look for the first extension
> +        * signature is after the index header and before the eoie extension.
> +        */
> +       offset = get_be32(index);
> +       if (mmap + offset < mmap + sizeof(struct cache_header))
> +               return 0;
> +       if (mmap + offset >= eoie)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /*
> +        * The hash is computed over extension types and their sizes (but not
> +        * their contents).  E.g. if we have "TREE" extension that is N-bytes
> +        * long, "REUC" extension that is M-bytes long, followed by "EOIE",
> +        * then the hash would be:
> +        *
> +        * SHA-1("TREE" + <binary representation of N> +
> +        *               "REUC" + <binary representation of M>)
> +        */
> +       src_offset = offset;
> +       the_hash_algo->init_fn(&c);
> +       while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
> +               /* After an array of active_nr index entries,
> +                * there can be arbitrary number of extended
> +                * sections, each of which is prefixed with
> +                * extension name (4-byte) and section length
> +                * in 4-byte network byte order.
> +                */
> +               uint32_t extsize;
> +               memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
> +               extsize = ntohl(extsize);
> +
> +               /* verify the extension size isn't so large it will wrap around */
> +               if (src_offset + 8 + extsize < src_offset)
> +                       return 0;
> +
> +               the_hash_algo->update_fn(&c, mmap + src_offset, 8);
> +
> +               src_offset += 8;
> +               src_offset += extsize;
> +       }
> +       the_hash_algo->final_fn(hash, &c);
> +       if (hashcmp(hash, (const unsigned char *)index))
> +               return 0;
> +
> +       /* Validate that the extension offsets returned us back to the eoie extension. */
> +       if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
> +               return 0;
> +
> +       return offset;
> +}
> +#endif
> +
> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)

We normally just put function implementations before they're used to
avoid static forward declarations. Any special reason why it's not done
here?

> +{
> +       uint32_t buffer;
> +       unsigned char hash[GIT_MAX_RAWSZ];
> +
> +       /* offset */
> +       put_be32(&buffer, offset);
> +       strbuf_add(sb, &buffer, sizeof(uint32_t));
> +
> +       /* hash */
> +       the_hash_algo->final_fn(hash, eoie_context);
> +       strbuf_add(sb, hash, the_hash_algo->rawsz);
> +}
> diff --git a/t/README b/t/README
> index 9028b47d92..d8754dd23a 100644
> --- a/t/README
> +++ b/t/README
> @@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
>  path where deltas larger than this limit require extra memory
>  allocation for bookkeeping.
>
> +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
> +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed

I have a feeling that you won't have problems if you don't write eoie
extension by default in the first place. Then this could be switched
to GIT_TEST_ENABLE_EOIE instead. We may still have problem if both
eoie and split index are forced on when running through the test
suite, but that should be an easy fix.

> +as they currently hard code SHA values for the index which are no longer
> +valid due to the addition of the EOIE extension.
> +
>  Naming Tests
>  ------------
>
> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index 39133bcbc8..f613dd72e3 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>  # We need total control of index splitting here
>  sane_unset GIT_TEST_SPLIT_INDEX
>  sane_unset GIT_FSMONITOR_TEST
> +export GIT_TEST_DISABLE_EOIE=true
>
>  test_expect_success 'enable split index' '
>         git config splitIndex.maxPercentChange 100 &&
> --
> 2.18.0.windows.1
>


-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-12 16:18   ` [PATCH v5 2/5] read-cache: load cache extensions on a worker thread Ben Peart
@ 2018-09-15 10:22     ` Duy Nguyen
  2018-09-15 10:24       ` Duy Nguyen
                         ` (3 more replies)
  0 siblings, 4 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-09-15 10:22 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by loading
> the cache extensions on a worker thread in parallel with loading the cache
> entries.
>
> In some cases, loading the extensions takes longer than loading the
> cache entries so this patch utilizes the new EOIE to start the thread to
> load the extensions before loading all the cache entries in parallel.
>
> This is possible because the current extensions don't access the cache
> entries in the index_state structure so are OK that they don't all exist
> yet.
>
> The CACHE_EXT_TREE, CACHE_EXT_RESOLVE_UNDO, and CACHE_EXT_UNTRACKED
> extensions don't even get a pointer to the index so don't have access to the
> cache entries.
>
> CACHE_EXT_LINK only uses the index_state to initialize the split index.
> CACHE_EXT_FSMONITOR only uses the index_state to save the fsmonitor last
> update and dirty flags.
>
> I used p0002-read-cache.sh to generate some performance data:
>
> Test w/100,000 files                Baseline         Parallel Extensions
> ---------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%
>
> Test w/1,000,000 files              Baseline         Parallel Extensions
> ------------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%
>
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---
>  Documentation/config.txt |  6 +++
>  config.c                 | 18 ++++++++
>  config.h                 |  1 +
>  read-cache.c             | 94 ++++++++++++++++++++++++++++++++--------
>  4 files changed, 102 insertions(+), 17 deletions(-)
>
> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 1c42364988..79f8296d9c 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -2391,6 +2391,12 @@ imap::
>         The configuration variables in the 'imap' section are described
>         in linkgit:git-imap-send[1].
>
> +index.threads::
> +       Specifies the number of threads to spawn when loading the index.
> +       This is meant to reduce index load time on multiprocessor machines.
> +       Specifying 0 or 'true' will cause Git to auto-detect the number of
> +       CPU's and set the number of threads accordingly. Defaults to 'true'.

I'd rather this variable defaults to 0. Spawning threads has an
associated cost and most projects out there are small enough that this
multi-threading could just add more cost than gain. It only makes
sense to enable this on huge repos.

Wait there's no way to disable this parallel reading? Does not sound
right. And  if ordinary numbers mean the number of threads then 0
should mean no threading. Auto detection could have a new keyword,
like 'auto'.

> +
>  index.version::
>         Specify the version with which new index files should be
>         initialized.  This does not affect existing repositories.
> diff --git a/config.c b/config.c
> index 9a0b10d4bc..9bd79fb165 100644
> --- a/config.c
> +++ b/config.c
> @@ -2289,6 +2289,24 @@ int git_config_get_fsmonitor(void)
>         return 0;
>  }
>
> +/*
> + * You can disable multi-threaded code by setting index.threads
> + * to 'false' (or 1)
> + */
> +int git_config_get_index_threads(void)
> +{
> +       int is_bool, val;
> +
> +       if (!git_config_get_bool_or_int("index.threads", &is_bool, &val)) {
> +               if (is_bool)
> +                       return val ? 0 : 1;
> +               else
> +                       return val;
> +       }
> +
> +       return 0; /* auto-detect */
> +}
> +
>  NORETURN
>  void git_die_config_linenr(const char *key, const char *filename, int linenr)
>  {
> diff --git a/config.h b/config.h
> index ab46e0165d..a06027e69b 100644
> --- a/config.h
> +++ b/config.h
> @@ -250,6 +250,7 @@ extern int git_config_get_untracked_cache(void);
>  extern int git_config_get_split_index(void);
>  extern int git_config_get_max_percent_split_change(void);
>  extern int git_config_get_fsmonitor(void);
> +extern int git_config_get_index_threads(void);
>
>  /* This dies if the configured or default date is in the future */
>  extern int git_config_get_expiry(const char *key, const char **output);
> diff --git a/read-cache.c b/read-cache.c
> index 858935f123..b203eebb44 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -23,6 +23,10 @@
>  #include "split-index.h"
>  #include "utf8.h"
>  #include "fsmonitor.h"
> +#ifndef NO_PTHREADS
> +#include <pthread.h>
> +#include <thread-utils.h>
> +#endif

I don't think you're supposed to include system header files after
"cache.h". Including thread-utils.h should be enough (and it keeps the
exception of including pthread.h in just one place). Please use
"pthread-utils.h" instead of <pthread-utils.h> which is usually for
system header files. And include pthread-utils.h unconditionally.

>
>  /* Mask for the name length in ce_flags in the on-disk index */
>
> @@ -1898,6 +1902,46 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
>  #endif
>  static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
>
> +struct load_index_extensions
> +{
> +#ifndef NO_PTHREADS
> +       pthread_t pthread;
> +#endif
> +       struct index_state *istate;
> +       void *mmap;
> +       size_t mmap_size;
> +       unsigned long src_offset;
> +};
> +
> +static void *load_index_extensions(void *_data)
> +{
> +       struct load_index_extensions *p = _data;
> +       unsigned long src_offset = p->src_offset;
> +
> +       while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> +               /* After an array of active_nr index entries,
> +                * there can be arbitrary number of extended
> +                * sections, each of which is prefixed with
> +                * extension name (4-byte) and section length
> +                * in 4-byte network byte order.
> +                */
> +               uint32_t extsize;
> +               memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
> +               extsize = ntohl(extsize);
> +               if (read_index_extension(p->istate,
> +                       (const char *)p->mmap + src_offset,
> +                       (char *)p->mmap + src_offset + 8,
> +                       extsize) < 0) {
> +                       munmap(p->mmap, p->mmap_size);
> +                       die("index file corrupt");

_()

> +               }
> +               src_offset += 8;
> +               src_offset += extsize;
> +       }
> +
> +       return NULL;
> +}
> +
>  /* remember to discard_cache() before reading a different cache! */
>  int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  {
> @@ -1908,6 +1952,11 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>         void *mmap;
>         size_t mmap_size;
>         struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +       struct load_index_extensions p = { 0 };
> +       unsigned long extension_offset = 0;
> +#ifndef NO_PTHREADS
> +       int nr_threads;
> +#endif
>
>         if (istate->initialized)
>                 return istate->cache_nr;
> @@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>         istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
>         istate->initialized = 1;
>
> +       p.istate = istate;
> +       p.mmap = mmap;
> +       p.mmap_size = mmap_size;
> +
> +#ifndef NO_PTHREADS
> +       nr_threads = git_config_get_index_threads();
> +       if (!nr_threads)
> +               nr_threads = online_cpus();
> +
> +       if (nr_threads >= 2) {
> +               extension_offset = read_eoie_extension(mmap, mmap_size);
> +               if (extension_offset) {
> +                       /* create a thread to load the index extensions */

Pointless comment. It's pretty clear from the pthread_create() below
thanks to good function naming. Please remove.

> +                       p.src_offset = extension_offset;
> +                       if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
> +                               die(_("unable to create load_index_extensions_thread"));
> +               }
> +       }
> +#endif
> +
>         if (istate->version == 4) {
>                 previous_name = &previous_name_buf;
>                 mem_pool_init(&istate->ce_mem_pool,
> @@ -1970,23 +2039,14 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>         istate->timestamp.sec = st.st_mtime;
>         istate->timestamp.nsec = ST_MTIME_NSEC(st);
>
> -       while (src_offset <= mmap_size - the_hash_algo->rawsz - 8) {
> -               /* After an array of active_nr index entries,
> -                * there can be arbitrary number of extended
> -                * sections, each of which is prefixed with
> -                * extension name (4-byte) and section length
> -                * in 4-byte network byte order.
> -                */
> -               uint32_t extsize;
> -               memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
> -               extsize = ntohl(extsize);
> -               if (read_index_extension(istate,
> -                                        (const char *) mmap + src_offset,
> -                                        (char *) mmap + src_offset + 8,
> -                                        extsize) < 0)
> -                       goto unmap;
> -               src_offset += 8;
> -               src_offset += extsize;
> +       /* if we created a thread, join it otherwise load the extensions on the primary thread */
> +#ifndef NO_PTHREADS
> +       if (extension_offset && pthread_join(p.pthread, NULL))
> +               die(_("unable to join load_index_extensions_thread"));

I guess the last _ is a typo and you wanted "unable to join
load_index_extensions thread". Please use die_errno() instead.

> +#endif
> +       if (!extension_offset) {
> +               p.src_offset = src_offset;
> +               load_index_extensions(&p);
>         }
>         munmap(mmap, mmap_size);
>         return istate->cache_nr;
> --
> 2.18.0.windows.1
>


-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:22     ` Duy Nguyen
@ 2018-09-15 10:24       ` Duy Nguyen
  2018-09-17 16:38         ` Ben Peart
  2018-09-15 16:23       ` Duy Nguyen
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-15 10:24 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Sat, Sep 15, 2018 at 12:22 PM Duy Nguyen <pclouds@gmail.com> wrote:
> > @@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
> >         istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
> >         istate->initialized = 1;
> >
> > +       p.istate = istate;
> > +       p.mmap = mmap;
> > +       p.mmap_size = mmap_size;
> > +
> > +#ifndef NO_PTHREADS
> > +       nr_threads = git_config_get_index_threads();
> > +       if (!nr_threads)
> > +               nr_threads = online_cpus();
> > +
> > +       if (nr_threads >= 2) {
> > +               extension_offset = read_eoie_extension(mmap, mmap_size);
> > +               if (extension_offset) {

One more thing I forgot. If the extension area is small enough, then
we should not need to create a thread to parse extensions in parallel.
We should know roughly how much work we need because we know the total
size of all extensions.

> > +                       /* create a thread to load the index extensions */
>
> Pointless comment. It's pretty clear from the pthread_create() below
> thanks to good function naming. Please remove.
>
> > +                       p.src_offset = extension_offset;
> > +                       if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
> > +                               die(_("unable to create load_index_extensions_thread"));
> > +               }
> > +       }
> > +#endif
> > +
> >         if (istate->version == 4) {
> >                 previous_name = &previous_name_buf;
> >                 mem_pool_init(&istate->ce_mem_pool,
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
@ 2018-09-15 10:31     ` Duy Nguyen
  2018-09-17 17:25       ` Ben Peart
  2018-09-15 11:07     ` Duy Nguyen
  2018-09-15 11:29     ` Duy Nguyen
  2 siblings, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-15 10:31 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by creating
> multiple threads to divide the work of loading and converting the cache
> entries across all available CPU cores.
>
> It accomplishes this by having the primary thread loop across the index file
> tracking the offset and (for V4 indexes) expanding the name. It creates a
> thread to process each block of entries as it comes to them.
>
> I used p0002-read-cache.sh to generate some performance data:
>
> Test w/100,000 files                Baseline         Parallel entries
> ---------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%
>
> Test w/1,000,000 files              Baseline         Parallel entries
> ------------------------------------------------------------------------------
> read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%

The numbers here and the previous patch to load extensions in parallel
are exactly the same. What do these numbers mean? With both changes?
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
  2018-09-15 10:31     ` Duy Nguyen
@ 2018-09-15 11:07     ` Duy Nguyen
  2018-09-15 11:09       ` Duy Nguyen
  2018-09-15 11:29     ` Duy Nguyen
  2 siblings, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-15 11:07 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> This patch helps address the CPU cost of loading the index by creating
> multiple threads to divide the work of loading and converting the cache
> entries across all available CPU cores.
>
> It accomplishes this by having the primary thread loop across the index file
> tracking the offset and (for V4 indexes) expanding the name. It creates a
> thread to process each block of entries as it comes to them.

I added a couple trace_printf() to see how time is spent. This is with
a 1m entry index (basically my webkit.git index repeated 4 times)

12:50:00.084237 read-cache.c:1721       start loading index
12:50:00.119941 read-cache.c:1943       performance: 0.034778758 s:
loaded all extensions (1667075 bytes)
12:50:00.185352 read-cache.c:2029       performance: 0.100152079 s:
loaded 367110 entries
12:50:00.189683 read-cache.c:2126       performance: 0.104566615 s:
finished scanning all entries
12:50:00.217900 read-cache.c:2029       performance: 0.082309193 s:
loaded 367110 entries
12:50:00.259969 read-cache.c:2029       performance: 0.070257130 s:
loaded 367108 entries
12:50:00.263662 read-cache.c:2278       performance: 0.179344458 s:
read cache .git/index

Two observations:

- the extension thread finishes up quickly (this is with TREE
extension alone). We could use that spare core to parse some more
entries.

- the main "scanning and allocating" thread does hold up the two
remaining threads. You can see the first index entry thread is
finished even before the scanning thread. And this scanning thread
takes a lot of cpu.

If all index entry threads start at the same time, based on these
numbers we would be finished around 12:50:00.185352 mark, cutting
loading time by half.

Could you go back to your original solution? If you don't want to
spend more time on this, I offer to rewrite this patch.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-15 11:07     ` Duy Nguyen
@ 2018-09-15 11:09       ` Duy Nguyen
  2018-09-17 18:52         ` Ben Peart
  0 siblings, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-15 11:09 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Sat, Sep 15, 2018 at 01:07:46PM +0200, Duy Nguyen wrote:
> 12:50:00.084237 read-cache.c:1721       start loading index
> 12:50:00.119941 read-cache.c:1943       performance: 0.034778758 s: loaded all extensions (1667075 bytes)
> 12:50:00.185352 read-cache.c:2029       performance: 0.100152079 s: loaded 367110 entries
> 12:50:00.189683 read-cache.c:2126       performance: 0.104566615 s: finished scanning all entries
> 12:50:00.217900 read-cache.c:2029       performance: 0.082309193 s: loaded 367110 entries
> 12:50:00.259969 read-cache.c:2029       performance: 0.070257130 s: loaded 367108 entries
> 12:50:00.263662 read-cache.c:2278       performance: 0.179344458 s: read cache .git/index

The previous mail wraps these lines and makes it a bit hard to read. Corrected now.

--
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
  2018-09-15 10:31     ` Duy Nguyen
  2018-09-15 11:07     ` Duy Nguyen
@ 2018-09-15 11:29     ` Duy Nguyen
  2 siblings, 0 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-09-15 11:29 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>  #ifndef NO_PTHREADS
>         nr_threads = git_config_get_index_threads();
> -       if (!nr_threads)
> -               nr_threads = online_cpus();
> +       if (!nr_threads) {
> +               cpus = online_cpus();
> +               nr_threads = istate->cache_nr / THREAD_COST;
> +               if (nr_threads > cpus)
> +                       nr_threads = cpus;

It seems like overcommitting cpu does reduce time. With this patch
(and a 4 core system), I got

$ test-tool read-cache 100
real    0m36.270s
user    0m54.193s
sys     0m17.346s

if I force nr_threads to 9 (even though cpus is 4)

$ test-tool read-cache 100
real    0m33.592s
user    1m4.230s
sys     0m18.380s

Even though we use more cpus, real time is shorter. I guess these
threads still sleep a bit due to I/O and having more threads than
cores will utilize those idle cycles.
--
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:22     ` Duy Nguyen
  2018-09-15 10:24       ` Duy Nguyen
@ 2018-09-15 16:23       ` Duy Nguyen
  2018-09-17 17:19         ` Junio C Hamano
  2018-09-17 16:26       ` Ben Peart
  2018-09-17 21:32       ` Junio C Hamano
  3 siblings, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-15 16:23 UTC (permalink / raw)
  To: Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart

On Sat, Sep 15, 2018 at 12:22 PM Duy Nguyen <pclouds@gmail.com> wrote:
> Wait there's no way to disable this parallel reading? Does not sound
> right. And  if ordinary numbers mean the number of threads then 0
> should mean no threading. Auto detection could have a new keyword,
> like 'auto'.

My bad. Disabling threading means _1_ thread. What was I thinking...
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-15 10:02     ` Duy Nguyen
@ 2018-09-17 14:54       ` Ben Peart
  2018-09-17 16:05         ` Duy Nguyen
  0 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-09-17 14:54 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 6:02 AM, Duy Nguyen wrote:

>>          default:
>>                  if (*ext < 'A' || 'Z' < *ext)
>>                          return error("index uses %.4s extension, which we do not understand",
>> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>>          return ondisk_size + entries * per_entry;
>>   }
>>
>> +#ifndef NO_PTHREADS
>> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
>> +#endif
> 
> Keep functions unconditionally built as much as possible. I don't see
> why this read_eoie_extension() must be built only on multithread
> platforms.
> 

This is conditional to avoid generating a warning on single threaded 
platforms where the function is currently unused.  That seemed like a 
better choice than calling it and ignoring it on single threaded 
platforms just to avoid a compiler warning.

>> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
>> +
>>   /* remember to discard_cache() before reading a different cache! */
>>   int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>   {
>> @@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
>>          return 0;
>>   }
>>
>> -static int write_index_ext_header(git_hash_ctx *context, int fd,
>> -                                 unsigned int ext, unsigned int sz)
>> +static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
>> +                                 int fd, unsigned int ext, unsigned int sz)
>>   {
>>          ext = htonl(ext);
>>          sz = htonl(sz);
>> +       if (eoie_context) {
>> +               the_hash_algo->update_fn(eoie_context, &ext, 4);
>> +               the_hash_algo->update_fn(eoie_context, &sz, 4);
>> +       }
>>          return ((ce_write(context, fd, &ext, 4) < 0) ||
>>                  (ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
>>   }
>> @@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>   {
>>          uint64_t start = getnanotime();
>>          int newfd = tempfile->fd;
>> -       git_hash_ctx c;
>> +       git_hash_ctx c, eoie_c;
>>          struct cache_header hdr;
>>          int i, err = 0, removed, extended, hdr_version;
>>          struct cache_entry **cache = istate->cache;
>> @@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>          struct ondisk_cache_entry_extended ondisk;
>>          struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>>          int drop_cache_tree = istate->drop_cache_tree;
>> +       unsigned long offset;
>>
>>          for (i = removed = extended = 0; i < entries; i++) {
>>                  if (cache[i]->ce_flags & CE_REMOVE)
>> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>                  return err;
>>
>>          /* Write extension data here */
>> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
>> +       the_hash_algo->init_fn(&eoie_c);
> 
> Don't write (or even calculate to write it) unless it's needed. Which
> means only do this when parallel reading is enabled and the index size
> large enough, or when a test variable is set so you can force writing
> this extension.

I made the logic always write the extension based on the earlier 
discussion [1] where it was suggested this should have been part of the 
original index format for extensions from the beginning.  This helps 
ensure it is available for current and future uses we haven't even 
discovered yet.

[1] 
https://public-inbox.org/git/xmqqwp2s1h1x.fsf@gitster.mtv.corp.google.com/


>> +
>> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
> 
> We normally just put function implementations before it's used to
> avoid static forward declaration. Any special reason why it's not done
> here?
> 

This was done to promote readability of the (already large) read-cache.c 
file.  I first considered moving the EOIE read/write functions into a 
separate file entirely but they need access to information only 
available within read-cache.c so I compromised and moved them to the end 
of the file instead.

>> +{
>> +       uint32_t buffer;
>> +       unsigned char hash[GIT_MAX_RAWSZ];
>> +
>> +       /* offset */
>> +       put_be32(&buffer, offset);
>> +       strbuf_add(sb, &buffer, sizeof(uint32_t));
>> +
>> +       /* hash */
>> +       the_hash_algo->final_fn(hash, eoie_context);
>> +       strbuf_add(sb, hash, the_hash_algo->rawsz);
>> +}
>> diff --git a/t/README b/t/README
>> index 9028b47d92..d8754dd23a 100644
>> --- a/t/README
>> +++ b/t/README
>> @@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
>>   path where deltas larger than this limit require extra memory
>>   allocation for bookkeeping.
>>
>> +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
>> +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
> 
> I have a feeling that you won't have problems if you don't write eoie
> extension by default in the first place. Then this could be switched
> to GIT_TEST_ENABLE_EOIE instead. We may still have problem if both
> eoie and split index are forced on when running through the test
> suite, but that should be an easy fix.
> 
>> +as they currently hard code SHA values for the index which are no longer
>> +valid due to the addition of the EOIE extension.
>> +
>>   Naming Tests
>>   ------------
>>
>> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
>> index 39133bcbc8..f613dd72e3 100755
>> --- a/t/t1700-split-index.sh
>> +++ b/t/t1700-split-index.sh
>> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>>   # We need total control of index splitting here
>>   sane_unset GIT_TEST_SPLIT_INDEX
>>   sane_unset GIT_FSMONITOR_TEST
>> +export GIT_TEST_DISABLE_EOIE=true
>>
>>   test_expect_success 'enable split index' '
>>          git config splitIndex.maxPercentChange 100 &&
>> --
>> 2.18.0.windows.1
>>
> 
> 

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-17 14:54       ` Ben Peart
@ 2018-09-17 16:05         ` Duy Nguyen
  2018-09-17 17:31           ` Junio C Hamano
  0 siblings, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-17 16:05 UTC (permalink / raw)
  To: Ben Peart; +Cc: Ben Peart, Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Sep 17, 2018 at 4:55 PM Ben Peart <peartben@gmail.com> wrote:
> On 9/15/2018 6:02 AM, Duy Nguyen wrote:
>
> >>          default:
> >>                  if (*ext < 'A' || 'Z' < *ext)
> >>                          return error("index uses %.4s extension, which we do not understand",
> >> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
> >>          return ondisk_size + entries * per_entry;
> >>   }
> >>
> >> +#ifndef NO_PTHREADS
> >> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
> >> +#endif
> >
> > Keep functions unconditionally built as much as possible. I don't see
> > why this read_eoie_extension() must be built only on multithread
> > platforms.
> >
>
> This is conditional to avoid generating a warning on single threaded
> platforms where the function is currently unused.  That seemed like a
> better choice than calling it and ignoring it on single threaded
> platforms just to avoid a compiler warning.

The third option is ignore the compiler. I consider that warning a
helpful suggestion, not a strict rule.

Most devs don't run single thread builds (I think) so if this function
is updated in a way that breaks single thread mode, it can only be
found out when this function is used in single thread mode. At that
point the function may have changed a lot. If it's built
unconditionally, at least single thread users will yell up much sooner
and we could fix it much earlier.

> >> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
> >>                  return err;
> >>
> >>          /* Write extension data here */
> >> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> >> +       the_hash_algo->init_fn(&eoie_c);
> >
> > Don't write (or even calculate to write it) unless it's needed. Which
> > means only do this when parallel reading is enabled and the index size
> > large enough, or when a test variable is set so you can force writing
> > this extension.
>
> I made the logic always write the extension based on the earlier
> discussion [1] where it was suggested this should have been part of the
> original index format for extensions from the beginning.  This helps
> ensure it is available for current and future uses we haven't even
> discovered yet.

But it _is_ available now. If you need it, you write the extension
out. If we make this part of index version 5 (and make it not an
extension anymore) then I buy that argument. As it is, it's an
optional extension.

> [1] https://public-inbox.org/git/xmqqwp2s1h1x.fsf@gitster.mtv.corp.google.com/
>
>
> >> +
> >> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
> >
> > We normally just put function implementations before it's used to
> > avoid static forward declaration. Any special reason why it's not done
> > here?
> >
>
> This was done to promote readability of the (already large) read-cache.c
> file.  I first considered moving the EOIE read/write functions into a
> separate file entirely but they need access to information only
> available within read-cache.c so I compromised and moved them to the end
> of the file instead.

I consider grouping extension related functions closer to
read_index_extension gives better readability, or at least better than
just putting new functions at the end in no particular order. But I
guess this is personal view.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:22     ` Duy Nguyen
  2018-09-15 10:24       ` Duy Nguyen
  2018-09-15 16:23       ` Duy Nguyen
@ 2018-09-17 16:26       ` Ben Peart
  2018-09-17 16:45         ` Duy Nguyen
  2018-09-17 21:32       ` Junio C Hamano
  3 siblings, 1 reply; 87+ messages in thread
From: Ben Peart @ 2018-09-17 16:26 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 6:22 AM, Duy Nguyen wrote:
>> +index.threads::
>> +       Specifies the number of threads to spawn when loading the index.
>> +       This is meant to reduce index load time on multiprocessor machines.
>> +       Specifying 0 or 'true' will cause Git to auto-detect the number of
>> +       CPU's and set the number of threads accordingly. Defaults to 'true'.
> 
> I'd rather this variable defaults to 0. Spawning threads have
> associated cost and most projects out there are small enough that this
> multi threading could just add more cost than gain. It only makes
> sense to enable this on huge repos.
> 
> Wait there's no way to disable this parallel reading? Does not sound
> right. And  if ordinary numbers mean the number of threads then 0
> should mean no threading. Auto detection could have a new keyword,
> like 'auto'.
> 

The index.threads setting is patterned after the pack.threads setting 
for consistency.  Specifying 1 (or 'false') will disable multithreading 
but I will call that out explicitly in the documentation to make it more 
obvious.

The THREAD_COST logic is designed to ensure small repos don't incur more 
cost than gain.  If you have data on that logic that shows it isn't 
working properly, I'm happy to change the logic as necessary.

>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -23,6 +23,10 @@
>>   #include "split-index.h"
>>   #include "utf8.h"
>>   #include "fsmonitor.h"
>> +#ifndef NO_PTHREADS
>> +#include <pthread.h>
>> +#include <thread-utils.h>
>> +#endif
> 
> I don't think you're supposed to include system header files after
> "cache.h". Including thread-utils.h should be enough (and it keeps the
> exception of inclduing pthread.h in just one place). Please use
> "pthread-utils.h" instead of <pthread-utils.h> which is usually for
system header files. And include pthread-utils.h unconditionally.
> 

Thanks, I'll fix that.

>>
>>   /* Mask for the name length in ce_flags in the on-disk index */
>>
>> @@ -1898,6 +1902,46 @@ static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
>>   #endif
>>   static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
>>
>> +struct load_index_extensions
>> +{
>> +#ifndef NO_PTHREADS
>> +       pthread_t pthread;
>> +#endif
>> +       struct index_state *istate;
>> +       void *mmap;
>> +       size_t mmap_size;
>> +       unsigned long src_offset;
>> +};
>> +
>> +static void *load_index_extensions(void *_data)
>> +{
>> +       struct load_index_extensions *p = _data;
>> +       unsigned long src_offset = p->src_offset;
>> +
>> +       while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
>> +               /* After an array of active_nr index entries,
>> +                * there can be arbitrary number of extended
>> +                * sections, each of which is prefixed with
>> +                * extension name (4-byte) and section length
>> +                * in 4-byte network byte order.
>> +                */
>> +               uint32_t extsize;
>> +               memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
>> +               extsize = ntohl(extsize);
>> +               if (read_index_extension(p->istate,
>> +                       (const char *)p->mmap + src_offset,
>> +                       (char *)p->mmap + src_offset + 8,
>> +                       extsize) < 0) {
>> +                       munmap(p->mmap, p->mmap_size);
>> +                       die("index file corrupt");
> 
> _()
> 

Your feedback style can be a bit abrupt and terse.  I _think_ what you 
are trying to say here is that the "die" call should use the _() macro 
around the string.

This is an edit of the previous code that loaded index extensions and 
doesn't change the use of _(). I don't know the rules for when _() 
should be used and didn't have any luck finding where it was documented 
so left it unchanged.

FWIW, in this file alone there are 20 existing instances of die() or 
die_errno() and only two that use the _() macro.  A quick grep through 
the source code shows thousands of die() calls the vast majority of 
which do not use the _() macro.  This appears to be an area that is 
unclear and inconsistent and could use some attention in a separate patch.


>> +       /* if we created a thread, join it otherwise load the extensions on the primary thread */
>> +#ifndef NO_PTHREADS
>> +       if (extension_offset && pthread_join(p.pthread, NULL))
>> +               die(_("unable to join load_index_extensions_thread"));
> 
> I guess the last _ is a typo and you wanted "unable to join
> load_index_extensions thread". Please use die_errno() instead.
> 

Why should this be die_errno() here?  All other instances of 
pthread_join() failing in a fatal way use die(), not die_errno().

>> +#endif
>> +       if (!extension_offset) {
>> +               p.src_offset = src_offset;
>> +               load_index_extensions(&p);
>>          }
>>          munmap(mmap, mmap_size);
>>          return istate->cache_nr;
>> --
>> 2.18.0.windows.1
>>
> 
> 

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:24       ` Duy Nguyen
@ 2018-09-17 16:38         ` Ben Peart
  0 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-17 16:38 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 6:24 AM, Duy Nguyen wrote:
> On Sat, Sep 15, 2018 at 12:22 PM Duy Nguyen <pclouds@gmail.com> wrote:
>>> @@ -1944,6 +1993,26 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>>          istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
>>>          istate->initialized = 1;
>>>
>>> +       p.istate = istate;
>>> +       p.mmap = mmap;
>>> +       p.mmap_size = mmap_size;
>>> +
>>> +#ifndef NO_PTHREADS
>>> +       nr_threads = git_config_get_index_threads();
>>> +       if (!nr_threads)
>>> +               nr_threads = online_cpus();
>>> +
>>> +       if (nr_threads >= 2) {
>>> +               extension_offset = read_eoie_extension(mmap, mmap_size);
>>> +               if (extension_offset) {
> 
> One more thing I forgot. If the extension area is small enough, then
> we should not need to create a thread to parse extensions in parallel.
> We should know roughly how much work we need because we know the total
> size of all extensions.
> 

The only extension I found to be significant enough to be helped by a 
separate thread was the cache tree.  Since the size of the cache tree is 
driven by the number of files in the repo, I think the existing 
THREAD_COST logic (that comes in the next patch of the series) is a 
sufficient proxy.  Basically, if you have enough cache entries to be 
benefited by threading, your extensions (driven by the cache tree) are 
probably also big enough to warrant a thread.

>>> +                       /* create a thread to load the index extensions */
>>
>> Pointless comment. It's pretty clear from the pthread_create() below
>> thanks to good function naming. Please remove.
>>
>>> +                       p.src_offset = extension_offset;
>>> +                       if (pthread_create(&p.pthread, NULL, load_index_extensions, &p))
>>> +                               die(_("unable to create load_index_extensions_thread"));
>>> +               }
>>> +       }
>>> +#endif
>>> +
>>>          if (istate->version == 4) {
>>>                  previous_name = &previous_name_buf;
>>>                  mem_pool_init(&istate->ce_mem_pool,

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-17 16:26       ` Ben Peart
@ 2018-09-17 16:45         ` Duy Nguyen
  0 siblings, 0 replies; 87+ messages in thread
From: Duy Nguyen @ 2018-09-17 16:45 UTC (permalink / raw)
  To: Ben Peart; +Cc: Ben Peart, Git Mailing List, Junio C Hamano, Ben Peart

On Mon, Sep 17, 2018 at 6:26 PM Ben Peart <peartben@gmail.com> wrote:
>
>
>
> On 9/15/2018 6:22 AM, Duy Nguyen wrote:
> >> +index.threads::
> >> +       Specifies the number of threads to spawn when loading the index.
> >> +       This is meant to reduce index load time on multiprocessor machines.
> >> +       Specifying 0 or 'true' will cause Git to auto-detect the number of
> >> +       CPU's and set the number of threads accordingly. Defaults to 'true'.
> >
> > I'd rather this variable defaults to 0. Spawning threads have
> > associated cost and most projects out there are small enough that this
> > multi threading could just add more cost than gain. It only makes
> > sense to enable this on huge repos.
> >
> > Wait there's no way to disable this parallel reading? Does not sound
> > right. And  if ordinary numbers mean the number of threads then 0
> > should mean no threading. Auto detection could have a new keyword,
> > like 'auto'.
> >
>
> The index.threads setting is patterned after the pack.threads setting
> for consistency.  Specifying 1 (or 'false') will disable multithreading
> but I will call that out explicitly in the documentation to make it more
> obvious.
>
> The THREAD_COST logic is designed to ensure small repos don't incur more
> cost than gain.  If you have data on that logic that shows it isn't
> working properly, I'm happy to change the logic as necessary.

THREAD_COST does not apply to this extension thread if I remember correctly.

> >> +static void *load_index_extensions(void *_data)
> >> +{
> >> +       struct load_index_extensions *p = _data;
> >> +       unsigned long src_offset = p->src_offset;
> >> +
> >> +       while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
> >> +               /* After an array of active_nr index entries,
> >> +                * there can be arbitrary number of extended
> >> +                * sections, each of which is prefixed with
> >> +                * extension name (4-byte) and section length
> >> +                * in 4-byte network byte order.
> >> +                */
> >> +               uint32_t extsize;
> >> +               memcpy(&extsize, (char *)p->mmap + src_offset + 4, 4);
> >> +               extsize = ntohl(extsize);
> >> +               if (read_index_extension(p->istate,
> >> +                       (const char *)p->mmap + src_offset,
> >> +                       (char *)p->mmap + src_offset + 8,
> >> +                       extsize) < 0) {
> >> +                       munmap(p->mmap, p->mmap_size);
> >> +                       die("index file corrupt");
> >
> > _()
> >
>
> Your feedback style can be a bit abrupt and terse.  I _think_ what you
> are trying to say here is that the "die" call should use the _() macro
> around the string.

Yes. Sorry I should have explained a bit better.

> This is an edit of the previous code that loaded index extensions and
> doesn't change the use of _(). I don't know the rules for when _()
> should be used and didn't have any luck finding where it was documented
> so left it unchanged.
>
> FWIW, in this file alone there are 20 existing instances of die() or
> die_errorno() and only two that use the _() macro.  A quick grep through
> the source code shows thousands of die() calls the vast majority of
> which do not use the _() macro.  This appears to be an area that is
> unclear and inconsistent and could use some attention in a separate patch.

This is one of the gray areas where we have to determine if the
message should be translated or not. And it should be translated
unless it's part of the plumbing output, to be consumed by scripts.

I know there's lots of messages still untranslated. I'm trying to do
something about that. But I cannot just go fix up all strings when you
all keep adding more strings for me to go fix. When you add a new
string, please consider if it should be translated or not. In this
case since it already receives reviewer attention we should be able to
determine it now, instead of delaying it for later.

> >> +       /* if we created a thread, join it otherwise load the extensions on the primary thread */
> >> +#ifndef NO_PTHREADS
> >> +       if (extension_offset && pthread_join(p.pthread, NULL))
> >> +               die(_("unable to join load_index_extensions_thread"));
> >
> > I guess the last _ is a typo and you wanted "unable to join
> > load_index_extensions thread". Please use die_errno() instead.
> >
>
> Why should this be die_errno() here?  All other instances of
> pthread_join() failing in a fatal way use die(), not die_errno().

That argument does not fly well in my opinion. I read the man page and
it listed the error codes, which made me think that we need to use
die_errno() to show the error. My mistake though is the error is
returned as the return value, not in errno, so die_errno() would not
catch it. But we could still do something like

    int ret = pthread_join();
    die(_("blah blah: %s"), strerror(ret));

Other code can also be improved, but that's a separate issue.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 16:23       ` Duy Nguyen
@ 2018-09-17 17:19         ` Junio C Hamano
  0 siblings, 0 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-09-17 17:19 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> On Sat, Sep 15, 2018 at 12:22 PM Duy Nguyen <pclouds@gmail.com> wrote:
>> Wait there's no way to disable this parallel reading? Does not sound
>> right. And  if ordinary numbers mean the number of threads then 0
>> should mean no threading. Auto detection could have a new keyword,
>> like 'auto'.
>
> My bad. Disabling threading means _1_ thread. What was I thinking...

I did the same during my earlier review.  It seems that it somehow
is unintuitive to us that we do not specify how many _extra_ threads
of control we dedicate to ;-).

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-15 10:31     ` Duy Nguyen
@ 2018-09-17 17:25       ` Ben Peart
  0 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-17 17:25 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 6:31 AM, Duy Nguyen wrote:
> On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>>
>> This patch helps address the CPU cost of loading the index by creating
>> multiple threads to divide the work of loading and converting the cache
>> entries across all available CPU cores.
>>
>> It accomplishes this by having the primary thread loop across the index file
>> tracking the offset and (for V4 indexes) expanding the name. It creates a
>> thread to process each block of entries as it comes to them.
>>
>> I used p0002-read-cache.sh to generate some performance data:
>>
>> Test w/100,000 files                Baseline         Parallel entries
>> ---------------------------------------------------------------------------
>> read_cache/discard_cache 1000 times 14.08(0.01+0.10) 9.72(0.03+0.06) -31.0%
>>
>> Test w/1,000,000 files              Baseline         Parallel entries
>> ------------------------------------------------------------------------------
>> read_cache/discard_cache 1000 times 202.95(0.01+0.07) 154.14(0.03+0.06) -24.1%
> 
> The numbers here and the previous patch to load extensions in parallel
> are exactly the same. What do these numbers mean? With both changes?
> 

It means I messed up when creating my commit message for the extension 
patch and copy/pasted the wrong numbers.  Yes, these numbers are with 
both changes (the correct numbers for the extension only are not as good).

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-17 16:05         ` Duy Nguyen
@ 2018-09-17 17:31           ` Junio C Hamano
  2018-09-17 17:38             ` Duy Nguyen
  0 siblings, 1 reply; 87+ messages in thread
From: Junio C Hamano @ 2018-09-17 17:31 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> But it _is_ available now. If you need it, you write the extension
> out.

Are you arguing for making it omitted when it is not needed (e.g.
small enough index file)?  IOW, did you mean "If you do not need it,
you do not write it out" by the above?

I do not think overhead of writing (or preparing to write) the
extension for a small index file is by definition small enough ;-).

I do not think the configuration that decides if the reader side
uses parallel reading should have any say in the decision to write
(or omit) the extension, by the way.



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-17 17:31           ` Junio C Hamano
@ 2018-09-17 17:38             ` Duy Nguyen
  2018-09-17 19:08               ` Junio C Hamano
  0 siblings, 1 reply; 87+ messages in thread
From: Duy Nguyen @ 2018-09-17 17:38 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Ben Peart, Ben Peart, Git Mailing List, Ben Peart

On Mon, Sep 17, 2018 at 7:31 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Duy Nguyen <pclouds@gmail.com> writes:
>
> > But it _is_ available now. If you need it, you write the extension
> > out.
>
> Are you arguing for making it omitted when it is not needed (e.g.
> small enough index file)?  IOW, did you mean "If you do not need it,
> you do not write it out" by the above?

Yes I did.

> I do not think overhead of writing (or preparing to write) the
> extension for a small index file is by definition small enough ;-).

Good point.

I get annoyed by the "ignoring unknown extension xxx" messages while
testing though (not just this extension) and I think it will be the
same for other git implementations. But perhaps other implementations
just silently drop the extension. Most of the extensions we have added
so far (except the ancient 'TREE') are optional and are probably not
present 99% of time when a different git impl reads an index created
by C Git. This 'EOIE' may be a good test then to see if they follow
the "ignore optional extensions" rule since it will always appear in
new C Git releases.
-- 
Duy

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 3/5] read-cache: load cache entries on worker threads
  2018-09-15 11:09       ` Duy Nguyen
@ 2018-09-17 18:52         ` Ben Peart
  0 siblings, 0 replies; 87+ messages in thread
From: Ben Peart @ 2018-09-17 18:52 UTC (permalink / raw)
  To: Duy Nguyen, Ben Peart; +Cc: Git Mailing List, Junio C Hamano, Ben Peart



On 9/15/2018 7:09 AM, Duy Nguyen wrote:
> On Sat, Sep 15, 2018 at 01:07:46PM +0200, Duy Nguyen wrote:
>> 12:50:00.084237 read-cache.c:1721       start loading index
>> 12:50:00.119941 read-cache.c:1943       performance: 0.034778758 s: loaded all extensions (1667075 bytes)
>> 12:50:00.185352 read-cache.c:2029       performance: 0.100152079 s: loaded 367110 entries
>> 12:50:00.189683 read-cache.c:2126       performance: 0.104566615 s: finished scanning all entries
>> 12:50:00.217900 read-cache.c:2029       performance: 0.082309193 s: loaded 367110 entries
>> 12:50:00.259969 read-cache.c:2029       performance: 0.070257130 s: loaded 367108 entries
>> 12:50:00.263662 read-cache.c:2278       performance: 0.179344458 s: read cache .git/index
> 
> The previous mail wraps these lines and make it a bit hard to read. Corrected now.
> 
> --
> Duy
> 

Interesting!  Clearly the data shape makes a big difference here as I 
had run a similar test but in my case, the extensions thread actually 
finished last (and its cost is what drove me to move that onto a 
separate thread that starts first).

Purpose	    			First	Last	Duration
load_index_extensions_thread	719.40	968.50	249.10
load_cache_entries_thread	718.89	738.65	19.76
load_cache_entries_thread	730.39	753.83	23.43
load_cache_entries_thread	741.23	751.23	10.00
load_cache_entries_thread	751.93	780.88	28.95
load_cache_entries_thread	763.60	791.31	27.72
load_cache_entries_thread	773.46	783.46	10.00
load_cache_entries_thread	783.96	794.28	10.32
load_cache_entries_thread	795.61	805.52	9.91
load_cache_entries_thread	805.99	827.21	21.22
load_cache_entries_thread	816.85	826.85	10.00
load_cache_entries_thread	827.03	837.96	10.93

In my tests, the scanning thread clearly delayed the later ce threads 
but given the extension was so slow, it didn't impact the overall time 
nearly as much as your case.

I completely agree that the optimal solution would be to go back to my 
original patch/design.  It eliminates the overhead of the scanning 
thread entirely and allows all threads to start at the same time. This 
would ensure the best performance whether the extensions were the 
longest thread or the cache entry threads took the longest.

I ran out of time and energy last year so dropped it to work on other 
tasks.  I appreciate your offer of help. Perhaps between the two of us 
we could successfully get it through the mailing list this time. :-) 
Let me go back and see what it would take to combine the current EOIE 
patch with the older IEOT patch.

I'm also intrigued with your observation that over committing the cpu 
actually results in time savings.  I hadn't tested that.  It looks like 
that could have a positive impact on the overall time and warrant a 
change to the default nr_threads logic.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension
  2018-09-17 17:38             ` Duy Nguyen
@ 2018-09-17 19:08               ` Junio C Hamano
  0 siblings, 0 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-09-17 19:08 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

> I get annoyed by the "ignoring unknown extension xxx" messages while
> testing though (not just this extension) and I think it will be the
> same for other git implementations. But perhaps other implementations
> just silently drop the extension. Most of the extensions we have added
> so far (except the ancient 'TREE') are optional and are probably not

Most of the index extensions are optional, including TREE.  I think
"link" is the only one that the readers that do not understand it
are told to abort without causing damage.

> present 99% of time when a different git impl reads an index created
> by C Git. This 'EOIE' may be a good test then to see if they follow
> the "ignore optional extensions" rule since it will always appear in
> new C Git releases.

I think we probably should squelch "ignoring unknown" unless some
sort of GIT_TRACE/DEBUG switch is set.

Patches welcome ;-)

Thanks.


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [PATCH v5 2/5] read-cache: load cache extensions on a worker thread
  2018-09-15 10:22     ` Duy Nguyen
                         ` (2 preceding siblings ...)
  2018-09-17 16:26       ` Ben Peart
@ 2018-09-17 21:32       ` Junio C Hamano
  3 siblings, 0 replies; 87+ messages in thread
From: Junio C Hamano @ 2018-09-17 21:32 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Ben Peart, Git Mailing List, Ben Peart

Duy Nguyen <pclouds@gmail.com> writes:

>> diff --git a/read-cache.c b/read-cache.c
>> index 858935f123..b203eebb44 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -23,6 +23,10 @@
>>  #include "split-index.h"
>>  #include "utf8.h"
>>  #include "fsmonitor.h"
>> +#ifndef NO_PTHREADS
>> +#include <pthread.h>
>> +#include <thread-utils.h>
>> +#endif
>
> I don't think you're supposed to include system header files after
> "cache.h". Including thread-utils.h should be enough (and it keeps the
> exception of including pthread.h in just one place). Please use
> "pthread-utils.h" instead of <pthread-utils.h> which is usually for
> system header files. And include pthread-utils.h unconditionally.

All correct except for s/p\(thread-utils\)/\1/g;
Sorry for missing this during my earlier review.

Thanks.



^ permalink raw reply	[flat|nested] 87+ messages in thread

end of thread, back to index

Thread overview: 87+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-23 15:41 [PATCH v1] read-cache: speed up index load through parallelization Ben Peart
2018-08-23 17:31 ` Stefan Beller
2018-08-23 19:44   ` Ben Peart
2018-08-24 18:40   ` Duy Nguyen
2018-08-28 14:53     ` Ben Peart
2018-08-23 18:06 ` Junio C Hamano
2018-08-23 20:33   ` Ben Peart
2018-08-24 15:37     ` Duy Nguyen
2018-08-24 15:57       ` Duy Nguyen
2018-08-24 17:28         ` Ben Peart
2018-08-25  6:44         ` [PATCH] read-cache.c: optimize reading index format v4 Nguyễn Thái Ngọc Duy
2018-08-27 19:36           ` Junio C Hamano
2018-08-28 19:25             ` Duy Nguyen
2018-08-28 23:54               ` Ben Peart
2018-08-29 17:14               ` Junio C Hamano
2018-09-04 16:08             ` Duy Nguyen
2018-09-02 13:19           ` [PATCH v2 0/1] " Nguyễn Thái Ngọc Duy
2018-09-02 13:19             ` [PATCH v2 1/1] read-cache.c: " Nguyễn Thái Ngọc Duy
2018-09-04 18:58               ` Junio C Hamano
2018-09-04 19:31               ` Junio C Hamano
2018-08-24 18:20       ` [PATCH v1] read-cache: speed up index load through parallelization Duy Nguyen
2018-08-24 18:40         ` Ben Peart
2018-08-24 19:00           ` Duy Nguyen
2018-08-24 19:57             ` Ben Peart
2018-08-29 15:25 ` [PATCH v2 0/3] " Ben Peart
2018-08-29 15:25   ` [PATCH v2 1/3] " Ben Peart
2018-08-29 17:14     ` Junio C Hamano
2018-08-29 21:35       ` Ben Peart
2018-09-03 19:16     ` Duy Nguyen
2018-08-29 15:25   ` [PATCH v2 2/3] read-cache: load cache extensions on worker thread Ben Peart
2018-08-29 17:12     ` Junio C Hamano
2018-08-29 21:42       ` Ben Peart
2018-08-29 22:19         ` Junio C Hamano
2018-09-03 19:21     ` Duy Nguyen
2018-09-03 19:27       ` Duy Nguyen
2018-08-29 15:25   ` [PATCH v2 3/3] read-cache: micro-optimize expand_name_field() to speed up V4 index parsing Ben Peart
2018-09-06 21:03 ` [PATCH v3 0/4] read-cache: speed up index load through parallelization Ben Peart
2018-09-06 21:03   ` [PATCH v3 1/4] read-cache: optimize expand_name_field() to speed up V4 index parsing Ben Peart
2018-09-06 21:03   ` [PATCH v3 2/4] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-09-07 17:55     ` Junio C Hamano
2018-09-07 20:23       ` Ben Peart
2018-09-08  6:29         ` Martin Ågren
2018-09-08 14:03           ` Ben Peart
2018-09-08 17:08             ` Martin Ågren
2018-09-06 21:03   ` [PATCH v3 3/4] read-cache: load cache extensions on a worker thread Ben Peart
2018-09-07 21:10     ` Junio C Hamano
2018-09-08 14:56       ` Ben Peart
2018-09-06 21:03   ` [PATCH v3 4/4] read-cache: speed up index load through parallelization Ben Peart
2018-09-07  4:16     ` Torsten Bögershausen
2018-09-07 13:43       ` Ben Peart
2018-09-07 17:21   ` [PATCH v3 0/4] " Junio C Hamano
2018-09-07 18:31     ` Ben Peart
2018-09-08 13:18     ` Duy Nguyen
2018-09-11 23:26 ` [PATCH v4 0/5] " Ben Peart
2018-09-11 23:26   ` [PATCH v4 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-09-11 23:26   ` [PATCH v4 2/5] read-cache: load cache extensions on a worker thread Ben Peart
2018-09-11 23:26   ` [PATCH v4 3/5] read-cache: speed up index load through parallelization Ben Peart
2018-09-11 23:26   ` [PATCH v4 4/5] read-cache.c: optimize reading index format v4 Ben Peart
2018-09-11 23:26   ` [PATCH v4 5/5] read-cache: clean up casting and byte decoding Ben Peart
2018-09-12 14:34   ` [PATCH v4 0/5] read-cache: speed up index load through parallelization Ben Peart
2018-09-12 16:18 ` [PATCH v5 " Ben Peart
2018-09-12 16:18   ` [PATCH v5 1/5] eoie: add End of Index Entry (EOIE) extension Ben Peart
2018-09-13 22:44     ` Junio C Hamano
2018-09-15 10:02     ` Duy Nguyen
2018-09-17 14:54       ` Ben Peart
2018-09-17 16:05         ` Duy Nguyen
2018-09-17 17:31           ` Junio C Hamano
2018-09-17 17:38             ` Duy Nguyen
2018-09-17 19:08               ` Junio C Hamano
2018-09-12 16:18   ` [PATCH v5 2/5] read-cache: load cache extensions on a worker thread Ben Peart
2018-09-15 10:22     ` Duy Nguyen
2018-09-15 10:24       ` Duy Nguyen
2018-09-17 16:38         ` Ben Peart
2018-09-15 16:23       ` Duy Nguyen
2018-09-17 17:19         ` Junio C Hamano
2018-09-17 16:26       ` Ben Peart
2018-09-17 16:45         ` Duy Nguyen
2018-09-17 21:32       ` Junio C Hamano
2018-09-12 16:18   ` [PATCH v5 3/5] read-cache: load cache entries on worker threads Ben Peart
2018-09-15 10:31     ` Duy Nguyen
2018-09-17 17:25       ` Ben Peart
2018-09-15 11:07     ` Duy Nguyen
2018-09-15 11:09       ` Duy Nguyen
2018-09-17 18:52         ` Ben Peart
2018-09-15 11:29     ` Duy Nguyen
2018-09-12 16:18   ` [PATCH v5 4/5] read-cache.c: optimize reading index format v4 Ben Peart
2018-09-12 16:18   ` [PATCH v5 5/5] read-cache: clean up casting and byte decoding Ben Peart

git@vger.kernel.org mailing list mirror (one of many)

Archives are clonable:
	git clone --mirror https://public-inbox.org/git
	git clone --mirror http://ou63pmih66umazou.onion/git
	git clone --mirror http://czquwvybam4bgbro.onion/git
	git clone --mirror http://hjrcffqmbrq6wope.onion/git

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.version-control.git
	nntp://ou63pmih66umazou.onion/inbox.comp.version-control.git
	nntp://czquwvybam4bgbro.onion/inbox.comp.version-control.git
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.version-control.git
	nntp://news.gmane.org/gmane.comp.version-control.git

 note: .onion URLs require Tor: https://www.torproject.org/
       or Tor2web: https://www.tor2web.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox