git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Jameson Miller <jamill@microsoft.com>
To: "git@vger.kernel.org" <git@vger.kernel.org>
Cc: "gitster@pobox.com" <gitster@pobox.com>,
	"pclouds@gmail.com" <pclouds@gmail.com>,
	"jonathantanmy@google.com" <jonathantanmy@google.com>,
	Jameson Miller <jamill@microsoft.com>
Subject: [PATCH v1 4/5] Allocate cache entries from memory pools
Date: Tue, 17 Apr 2018 16:34:42 +0000	[thread overview]
Message-ID: <20180417163400.3875-6-jamill@microsoft.com> (raw)
In-Reply-To: <20180417163400.3875-1-jamill@microsoft.com>

Improve performance of reading a large index by reducing the number of
malloc() calls. When reading an index with a large number of entries,
a portion of the time is dominated in malloc() calls. This can be
mitigated by allocating a single large block of memory up front into a
memory pool and have git hand out chunks of time.

This change moves the cache entry allocation to be on top of memory
pools.

Design:

The index_state struct will gain a notion of an associated memory_pool
from which cache_entry structs will be allocated from. When reading in
the index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool and cache entries from the memory pool will be
discarded as well. This means the lifetime of cache_entry structs are
tied to the lifetime of the index_state they were allocated for.

In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:

Terminology:
  - 'the_index': represents the logical view of the index

  - 'split_index': represents the "base" cache entries. Read from the
    split index file.

'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is.  This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
---
 cache.h       |  2 ++
 read-cache.c  | 95 +++++++++++++++++++++++++++++++++++++++++++++--------------
 split-index.c | 23 +++++++++++++--
 3 files changed, 95 insertions(+), 25 deletions(-)

diff --git a/cache.h b/cache.h
index eedf154815..7c0d2343c3 100644
--- a/cache.h
+++ b/cache.h
@@ -15,6 +15,7 @@
 #include "path.h"
 #include "sha1-array.h"
 #include "repository.h"
+#include "mem-pool.h"
 
 #include <zlib.h>
 typedef struct git_zstream {
@@ -328,6 +329,7 @@ struct index_state {
 	struct untracked_cache *untracked;
 	uint64_t fsmonitor_last_update;
 	struct ewah_bitmap *fsmonitor_dirty;
+	struct mem_pool *ce_mem_pool;
 };
 
 extern struct index_state the_index;
diff --git a/read-cache.c b/read-cache.c
index 04fa7e1bd0..67438bf375 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -46,6 +46,42 @@
 		 CE_ENTRY_ADDED | CE_ENTRY_REMOVED | CE_ENTRY_CHANGED | \
 		 SPLIT_INDEX_ORDERED | UNTRACKED_CHANGED | FSMONITOR_CHANGED)
 
+
+/*
+ * This is an estimate of the average pathname length in the index.  We use
+ * this for V4 index files to guess the un-deltafied size of the index in
+ * memory because of pathname deltafication.  This is not required for V2/V3
+ * index formats because their pathnames are not compressed.  If the initial
+ * amount of memory set aside is not sufficient, the mem pool will allocate
+ * extra memory.
+ */
+#define CACHE_ENTRY_AVG_PATH_LENGTH_ESTIMATE 80
+
+static inline struct cache_entry *mem_pool__ce_alloc(struct mem_pool *mem_pool, size_t len)
+{
+	return mem_pool_alloc(mem_pool, cache_entry_size(len));
+}
+
+static inline struct cache_entry *mem_pool__ce_calloc(struct mem_pool *mem_pool, size_t len)
+{
+	return mem_pool_calloc(mem_pool, 1, cache_entry_size(len));
+}
+
+static struct mem_pool *find_mem_pool(struct index_state *istate)
+{
+	struct mem_pool **pool_ptr;
+
+	if (istate->split_index && istate->split_index->base)
+		pool_ptr = &istate->split_index->base->ce_mem_pool;
+	else
+		pool_ptr = &istate->ce_mem_pool;
+
+	if (!*pool_ptr)
+		mem_pool_init(pool_ptr, 0);
+
+	return *pool_ptr;
+}
+
 struct index_state the_index;
 static const char *alternate_index_output;
 
@@ -746,7 +782,7 @@ int add_file_to_index(struct index_state *istate, const char *path, int flags)
 
 struct cache_entry *make_empty_index_cache_entry(struct index_state *istate, size_t len)
 {
-	return xcalloc(1, cache_entry_size(len));
+	return mem_pool__ce_calloc(find_mem_pool(istate), len);
 }
 
 struct cache_entry *make_empty_transient_cache_entry(size_t len)
@@ -1641,13 +1677,13 @@ int read_index(struct index_state *istate)
 	return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct index_state *istate,
+static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
 						   struct ondisk_cache_entry *ondisk,
 						   unsigned int flags,
 						   const char *name,
 						   size_t len)
 {
-	struct cache_entry *ce = make_empty_index_cache_entry(istate, len);
+	struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
 
 	ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
 	ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
@@ -1689,7 +1725,7 @@ static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
 	return (const char *)ep + 1 - cp_;
 }
 
-static struct cache_entry *create_from_disk(struct index_state *istate,
+static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
 					    struct ondisk_cache_entry *ondisk,
 					    unsigned long *ent_size,
 					    struct strbuf *previous_name)
@@ -1721,13 +1757,13 @@ static struct cache_entry *create_from_disk(struct index_state *istate,
 		/* v3 and earlier */
 		if (len == CE_NAMEMASK)
 			len = strlen(name);
-		ce = cache_entry_from_ondisk(istate, ondisk, flags, name, len);
+		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, len);
 
 		*ent_size = ondisk_ce_size(ce);
 	} else {
 		unsigned long consumed;
 		consumed = expand_name_field(previous_name, name);
-		ce = cache_entry_from_ondisk(istate, ondisk, flags,
+		ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
 					     previous_name->buf,
 					     previous_name->len);
 
@@ -1801,6 +1837,22 @@ static void post_read_index_from(struct index_state *istate)
 	tweak_fsmonitor(istate);
 }
 
+static size_t estimate_cache_size_from_compressed(unsigned int entries)
+{
+	return entries * (sizeof(struct cache_entry) + CACHE_ENTRY_AVG_PATH_LENGTH_ESTIMATE);
+}
+
+static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
+{
+	long per_entry = sizeof(struct cache_entry) - sizeof(struct ondisk_cache_entry);
+
+	/*
+	 * Account for potential alignment differences.
+	 */
+	per_entry += align_padding_size(sizeof(struct cache_entry), -sizeof(struct ondisk_cache_entry));
+	return ondisk_size + entries * per_entry;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1847,10 +1899,15 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 	istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
 	istate->initialized = 1;
 
-	if (istate->version == 4)
+	if (istate->version == 4) {
 		previous_name = &previous_name_buf;
-	else
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size_from_compressed(istate->cache_nr));
+	} else {
 		previous_name = NULL;
+		mem_pool_init(&istate->ce_mem_pool,
+			      estimate_cache_size(mmap_size, istate->cache_nr));
+	}
 
 	src_offset = sizeof(*hdr);
 	for (i = 0; i < istate->cache_nr; i++) {
@@ -1859,7 +1916,7 @@ int do_read_index(struct index_state *istate, const char *path, int must_exist)
 		unsigned long consumed;
 
 		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
-		ce = create_from_disk(istate, disk_ce, &consumed, previous_name);
+		ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, previous_name);
 		set_index_entry(istate, i, ce);
 
 		src_offset += consumed;
@@ -1956,17 +2013,6 @@ int is_index_unborn(struct index_state *istate)
 
 int discard_index(struct index_state *istate)
 {
-	int i;
-
-	for (i = 0; i < istate->cache_nr; i++) {
-		if (istate->cache[i]->index &&
-		    istate->split_index &&
-		    istate->split_index->base &&
-		    istate->cache[i]->index <= istate->split_index->base->cache_nr &&
-		    istate->cache[i] == istate->split_index->base->cache[istate->cache[i]->index - 1])
-			continue;
-		index_cache_entry_discard(istate->cache[i]);
-	}
 	resolve_undo_clear_index(istate);
 	istate->cache_nr = 0;
 	istate->cache_changed = 0;
@@ -1980,6 +2026,12 @@ int discard_index(struct index_state *istate)
 	discard_split_index(istate);
 	free_untracked_cache(istate->untracked);
 	istate->untracked = NULL;
+
+	if (istate->ce_mem_pool) {
+		mem_pool_discard(istate->ce_mem_pool);
+		istate->ce_mem_pool = NULL;
+	}
+
 	return 0;
 }
 
@@ -2772,11 +2824,10 @@ void move_index_extensions(struct index_state *dst, struct index_state *src)
 }
 
 /*
- * Free cache entry allocated for an index.
+ * Indicate that a cache entry is no longer is use
  */
 void index_cache_entry_discard(struct cache_entry *ce)
 {
-	free(ce);
 }
 
 void transient_cache_entry_discard(struct cache_entry *ce)
diff --git a/split-index.c b/split-index.c
index 361c821096..a920c0ad07 100644
--- a/split-index.c
+++ b/split-index.c
@@ -73,16 +73,31 @@ void move_cache_to_base_index(struct index_state *istate)
 	int i;
 
 	/*
-	 * do not delete old si->base, its index entries may be shared
-	 * with istate->cache[]. Accept a bit of leaking here because
-	 * this code is only used by short-lived update-index.
+	 * If there was a previous base index, then transfer ownership of allocated
+	 * entries to the parent index.
 	 */
+	if (si->base &&
+		si->base->ce_mem_pool) {
+
+		if (!istate->ce_mem_pool)
+			mem_pool_init(&istate->ce_mem_pool, 0);
+
+		mem_pool_combine(istate->ce_mem_pool, istate->split_index->base->ce_mem_pool);
+	}
+
 	si->base = xcalloc(1, sizeof(*si->base));
 	si->base->version = istate->version;
 	/* zero timestamp disables racy test in ce_write_index() */
 	si->base->timestamp = istate->timestamp;
 	ALLOC_GROW(si->base->cache, istate->cache_nr, si->base->cache_alloc);
 	si->base->cache_nr = istate->cache_nr;
+
+	/*
+	 * The mem_pool needs to move with the allocated entries.
+	 */
+	si->base->ce_mem_pool = istate->ce_mem_pool;
+	istate->ce_mem_pool = NULL;
+
 	COPY_ARRAY(si->base->cache, istate->cache, istate->cache_nr);
 	mark_base_index_entries(si->base);
 	for (i = 0; i < si->base->cache_nr; i++)
@@ -330,6 +345,8 @@ void add_split_index(struct index_state *istate)
 void remove_split_index(struct index_state *istate)
 {
 	if (istate->split_index) {
+		mem_pool_combine(istate->ce_mem_pool, istate->split_index->base->ce_mem_pool);
+
 		/*
 		 * can't discard_split_index(&the_index); because that
 		 * will destroy split_index->base->cache[], which may
-- 
2.14.3


  parent reply	other threads:[~2018-04-17 16:35 UTC|newest]

Thread overview: 100+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-17 16:34 [PATCH v1 0/5] Allocate cache entries from memory pool Jameson Miller
2018-04-17 16:34 ` Jameson Miller
2018-04-17 16:34 ` [PATCH v1 1/5] read-cache: teach refresh_cache_entry to take istate Jameson Miller
2018-04-17 19:00   ` Ben Peart
2018-04-17 16:34 ` [PATCH v1 2/5] Add an API creating / discarding cache_entry structs Jameson Miller
2018-04-17 23:11   ` Ben Peart
2018-04-17 16:34 ` Jameson Miller [this message]
2018-04-17 16:34 ` [PATCH v1 3/5] mem-pool: fill out functionality Jameson Miller
2018-04-20 23:21   ` Jonathan Tan
2018-04-23 17:27     ` Jameson Miller
2018-04-23 17:49       ` Jonathan Tan
2018-04-23 18:20         ` Jameson Miller
2018-04-17 16:34 ` [PATCH v1 5/5] Add optional memory validations around cache_entry lifecyle Jameson Miller
2018-04-17 18:39 ` [PATCH v1 0/5] Allocate cache entries from memory pool Ben Peart
2018-04-23 14:09   ` Jameson Miller
2018-04-18  4:49 ` Junio C Hamano
2018-04-20 17:49   ` Stefan Beller
2018-04-23 16:44     ` Jameson Miller
2018-04-23 17:18       ` Stefan Beller
2018-04-23 16:19   ` Jameson Miller
2018-04-20 23:34 ` Jonathan Tan
2018-04-23 17:14   ` Jameson Miller
2018-04-30 15:31 ` [PATCH v2 " Jameson Miller
2018-04-30 15:31   ` [PATCH v2 1/5] read-cache: teach refresh_cache_entry() to take istate Jameson Miller
2018-04-30 15:31   ` [PATCH v2 2/5] block alloc: add lifecycle APIs for cache_entry structs Jameson Miller
2018-04-30 15:31   ` [PATCH v2 3/5] mem-pool: fill out functionality Jameson Miller
2018-04-30 21:42     ` Stefan Beller
2018-05-01 15:43       ` Jameson Miller
2018-05-03 16:18     ` Duy Nguyen
2018-04-30 15:31   ` [PATCH v2 4/5] block alloc: allocate cache entries from mem_pool Jameson Miller
2018-04-30 15:31   ` [PATCH v2 5/5] block alloc: add validations around cache_entry lifecyle Jameson Miller
2018-05-03 16:28     ` Duy Nguyen
2018-05-03 16:35   ` [PATCH v2 0/5] Allocate cache entries from memory pool Duy Nguyen
2018-05-03 17:21     ` Stefan Beller
2018-05-03 19:17       ` Duy Nguyen
2018-05-03 20:58         ` Stefan Beller
2018-05-03 21:13           ` Jameson Miller
2018-05-03 22:18             ` [PATCH] alloc.c: replace alloc by mempool Stefan Beller
2018-05-04 16:33               ` Duy Nguyen
2018-05-08  0:37                 ` Junio C Hamano
2018-05-08  0:44                   ` Stefan Beller
2018-05-08  1:07                     ` Junio C Hamano
2018-05-23 14:47 ` [PATCH v3 0/7] allocate cache entries from memory pool Jameson Miller
2018-05-23 14:47   ` [PATCH v3 1/7] read-cache: teach refresh_cache_entry() to take istate Jameson Miller
2018-05-25 22:54     ` Stefan Beller
2018-05-23 14:47   ` [PATCH v3 2/7] block alloc: add lifecycle APIs for cache_entry structs Jameson Miller
2018-05-24  4:52     ` Junio C Hamano
2018-05-24 14:47       ` Jameson Miller
2018-05-23 14:47   ` [PATCH v3 3/7] mem-pool: only search head block for available space Jameson Miller
2018-05-23 14:47   ` [PATCH v3 4/7] mem-pool: add lifecycle management functions Jameson Miller
2018-05-23 14:47   ` [PATCH v3 5/7] mem-pool: fill out functionality Jameson Miller
2018-06-01 19:28     ` Stefan Beller
2018-05-23 14:47   ` [PATCH v3 6/7] block alloc: allocate cache entries from mem_pool Jameson Miller
2018-05-23 14:47   ` [PATCH v3 7/7] block alloc: add validations around cache_entry lifecyle Jameson Miller
2018-05-24  4:55   ` [PATCH v3 0/7] allocate cache entries from memory pool Junio C Hamano
2018-05-24 14:44     ` Jameson Miller
2018-05-25 22:53       ` Stefan Beller
2018-06-20 20:41         ` Jameson Miller
2018-05-25 22:41   ` Stefan Beller
2018-06-20 20:17 ` [PATCH v4 0/8] Allocate cache entries from mem_pool Jameson Miller
2018-06-20 20:17   ` [PATCH v4 1/8] read-cache: teach refresh_cache_entry() to take istate Jameson Miller
2018-06-20 20:17   ` [PATCH v4 2/8] block alloc: add lifecycle APIs for cache_entry structs Jameson Miller
2018-06-21 21:14     ` Stefan Beller
2018-06-28 14:07       ` Jameson Miller
2018-06-20 20:17   ` [PATCH v4 3/8] mem-pool: only search head block for available space Jameson Miller
2018-06-21 21:33     ` Stefan Beller
2018-06-28 14:12       ` Jameson Miller
2018-06-20 20:17   ` [PATCH v4 4/8] mem-pool: tweak math on mp_block allocation size Jameson Miller
2018-06-20 20:17   ` [PATCH v4 5/8] mem-pool: add lifecycle management functions Jameson Miller
2018-06-20 20:17   ` [PATCH v4 6/8] mem-pool: fill out functionality Jameson Miller
2018-06-20 20:17   ` [PATCH v4 7/8] block alloc: allocate cache entries from mem_pool Jameson Miller
2018-06-20 20:17   ` [PATCH v4 8/8] block alloc: add validations around cache_entry lifecyle Jameson Miller
2018-06-28 14:00   ` [PATCH v5 0/8] Allocate cache entries from mem_pool Jameson Miller
2018-06-28 14:00     ` [PATCH v5 1/8] read-cache: teach refresh_cache_entry() to take istate Jameson Miller
2018-06-28 14:00     ` [PATCH v5 2/8] read-cache: make_cache_entry should take object_id struct Jameson Miller
2018-06-28 17:14       ` Junio C Hamano
2018-06-28 22:27       ` SZEDER Gábor
2018-06-28 14:00     ` [PATCH v5 3/8] block alloc: add lifecycle APIs for cache_entry structs Jameson Miller
2018-06-28 18:43       ` Junio C Hamano
2018-06-28 22:28       ` SZEDER Gábor
2018-06-28 14:00     ` [PATCH v5 4/8] mem-pool: only search head block for available space Jameson Miller
2018-06-28 14:00     ` [PATCH v5 5/8] mem-pool: add life cycle management functions Jameson Miller
2018-06-28 17:15       ` Junio C Hamano
2018-06-28 14:00     ` [PATCH v5 6/8] mem-pool: fill out functionality Jameson Miller
2018-06-28 19:09       ` Junio C Hamano
2018-07-02 18:28         ` Jameson Miller
2018-06-28 14:00     ` [PATCH v5 7/8] block alloc: allocate cache entries from mem-pool Jameson Miller
2018-06-28 14:00     ` [PATCH v5 8/8] block alloc: add validations around cache_entry lifecyle Jameson Miller
2018-07-02 19:49     ` [PATCH v6 0/8] Allocate cache entries from mem_pool Jameson Miller
2018-07-02 19:49       ` [PATCH v6 1/8] read-cache: teach refresh_cache_entry to take istate Jameson Miller
2018-07-02 19:49       ` [PATCH v6 2/8] read-cache: teach make_cache_entry to take object_id Jameson Miller
2018-07-02 21:23         ` Stefan Beller
2018-07-05 15:20           ` Jameson Miller
2018-07-02 19:49       ` [PATCH v6 3/8] block alloc: add lifecycle APIs for cache_entry structs Jameson Miller
2018-07-22  9:23         ` Duy Nguyen
2018-07-02 19:49       ` [PATCH v6 4/8] mem-pool: only search head block for available space Jameson Miller
2018-07-02 19:49       ` [PATCH v6 5/8] mem-pool: add life cycle management functions Jameson Miller
2018-07-02 19:49       ` [PATCH v6 6/8] mem-pool: fill out functionality Jameson Miller
2018-07-02 19:49       ` [PATCH v6 7/8] block alloc: allocate cache entries from mem_pool Jameson Miller
2018-07-02 19:49       ` [PATCH v6 8/8] block alloc: add validations around cache_entry lifecyle Jameson Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180417163400.3875-6-jamill@microsoft.com \
    --to=jamill@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=jonathantanmy@google.com \
    --cc=pclouds@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).