git@vger.kernel.org list mirror (unofficial, one of many)
 help / color / mirror / Atom feed
From: "SZEDER Gábor" <szeder.dev@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Garima Singh" <garima.singh@microsoft.com>,
	"Derrick Stolee" <stolee@gmail.com>,
	"Jakub Narebski" <jnareb@gmail.com>, "Jeff King" <peff@peff.net>,
	"Taylor Blau" <me@ttaylorr.com>,
	"SZEDER Gábor" <szeder.dev@gmail.com>
Subject: [PATCH 26/34] commit-graph: deduplicate modified path Bloom filters
Date: Fri, 29 May 2020 10:50:30 +0200
Message-ID: <20200529085038.26008-27-szeder.dev@gmail.com> (raw)
In-Reply-To: <20200529085038.26008-1-szeder.dev@gmail.com>

Some commits modify the same set of paths, and, consequently, have
identical modified path Bloom filters.

Use a hashmap to find these identical Bloom filters, and omit all
duplicates from the Modified Path Bloom Filters chunk, reducing its
size.

                   MPBF chunk size
                 without       with             Time spent
                    deduplication                deduping
  --------------------------------------------------------
  android-base   8620198     2618764    -69.6%    0.089s
  cmssw         10347018     9094468    -12.1%    0.056s
  cpython         526381      371629    -29.4%    0.013s
  elasticsearch  4733274     3607106    -23.8%    0.029s
  gcc            3724544     3394521     -8.9%    0.072s
  gecko-dev     20591010    17042732    -17.2%    0.235s
  git             160718      139546    -13.2%    0.004s
  glibc           759132      699623     -7.8%    0.009s
  go              458375      421394     -8.1%    0.008s
  jdk           13213208    12158533     -8.0%    0.034s
  linux         26575278    25029700     -5.8%    0.169s
  llvm-project   3988133     3269438    -18.0%    0.085s
  rails           958691      614528    -35.9%    0.015s
  rust           1985782     1682919    -15.3%    0.023s
  tensorflow     4173570     3789768     -9.2%    0.022s
  webkit         5891751     5394507     -8.4%    0.096s

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
---
 commit-graph.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 71 insertions(+), 5 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 56b8f33d10..178bbf0113 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -57,6 +57,7 @@
 
 struct modified_path_bloom_filter_info {
 	struct bloom_filter filter;
+	struct modified_path_bloom_filter_info *duplicate_of;
 	/*
 	 * The offset relative to the start of the Modified Path Bloom
 	 * Filters chunk where this Bloom filter has been written,
@@ -1099,6 +1100,9 @@ struct write_commit_graph_context {
 
 		/* Excluding embedded modified path Bloom filters */
 		uint64_t total_filter_size;
+
+		/* Used to find identical modified path Bloom filters */
+		struct hashmap dedup_hashmap;
 	} mpbfctx;
 };
 
@@ -1315,7 +1319,11 @@ static int write_graph_chunk_modified_path_bloom_filters(struct hashfile *f,
 		bfi = modified_path_bloom_filters_peek(
 				&modified_path_bloom_filters, commit);
 
-		if (!bfi || !bfi->filter.nr_bits)
+		if (!bfi)
+			continue;
+		if (bfi->duplicate_of)
+			continue;
+		if (!bfi->filter.nr_bits)
 			continue;
 		if (bfi->filter.nr_bits == GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS)
 			continue;
@@ -1364,6 +1372,9 @@ static int write_graph_chunk_modified_path_bloom_index(struct hashfile *f,
 			 */
 			filterdata[0] |= 1 << 7;
 			hashwrite(f, filterdata, sizeof(filterdata));
+		} else if (bfi->duplicate_of) {
+			uint64_t offset = htonll(bfi->duplicate_of->offset);
+			hashwrite(f, &offset, sizeof(offset));
 		} else if (bfi->offset != -1) {
 			uint64_t offset = htonll(bfi->offset);
 			hashwrite(f, &offset, sizeof(offset));
@@ -1515,6 +1526,53 @@ static void file_change_cb(struct diff_options *options,
 	handle_modified_file(options->change_fn_data, fullpath);
 }
 
+struct modified_path_bloom_filter_dedup_entry {
+	struct hashmap_entry entry;
+	struct modified_path_bloom_filter_info *bfi;
+};
+
+static int modified_path_bloom_filter_dedup_cmp(const void *unused_cmp_data,
+		const struct hashmap_entry *he1,
+		const struct hashmap_entry *he2,
+		const void *unused_keydata)
+{
+	const struct modified_path_bloom_filter_dedup_entry *a, *b;
+
+	a = container_of(he1, const struct modified_path_bloom_filter_dedup_entry, entry);
+	b = container_of(he2, const struct modified_path_bloom_filter_dedup_entry, entry);
+
+	if (a->bfi->filter.nr_bits != b->bfi->filter.nr_bits)
+		return 1;
+
+	return memcmp(a->bfi->filter.bits, b->bfi->filter.bits,
+		      bloom_filter_bytes(&a->bfi->filter));
+}
+
+static int handle_duplicate_modified_path_bloom_filter(
+		struct modified_path_bloom_filter_context *mpbfctx,
+		struct modified_path_bloom_filter_info *bfi)
+{
+	struct modified_path_bloom_filter_dedup_entry *e, stack_entry;
+	unsigned int h;
+
+	h = memhash(bfi->filter.bits, bloom_filter_bytes(&bfi->filter));
+	hashmap_entry_init(&stack_entry.entry, h);
+	stack_entry.bfi = bfi;
+
+	e = hashmap_get_entry(&mpbfctx->dedup_hashmap, &stack_entry, entry,
+			      NULL);
+	if (e) {
+		bfi->duplicate_of = e->bfi;
+		return 1;
+	} else {
+		e = xmalloc(sizeof(*e));
+		hashmap_entry_init(&e->entry, h);
+		e->bfi = bfi;
+		hashmap_add(&mpbfctx->dedup_hashmap, &e->entry);
+		return 0;
+	}
+}
+
 static void create_modified_path_bloom_filter(
 		struct modified_path_bloom_filter_context *mpbfctx,
 		struct commit *commit)
@@ -1547,17 +1605,20 @@ static void create_modified_path_bloom_filter(
 	bfi = modified_path_bloom_filters_at(&modified_path_bloom_filters,
 					     commit);
 	bfi->offset = -1;
-	if (path_component_count > mpbfctx->embedded_limit) {
+	if (path_component_count > mpbfctx->embedded_limit)
 		bloom_filter_init(&bfi->filter, mpbfctx->num_hashes,
 				  path_component_count);
-		mpbfctx->total_filter_size += sizeof(uint32_t) +
-					      bloom_filter_bytes(&bfi->filter);
-	} else
+	else
 		bloom_filter_init_with_size(&bfi->filter,
 				GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS);
 
 	bloom_filter_set_bits(&bfi->filter, mpbfctx->hashes,
 			      mpbfctx->hashes_nr);
+
+	if (path_component_count > mpbfctx->embedded_limit &&
+	    !handle_duplicate_modified_path_bloom_filter(mpbfctx, bfi))
+		mpbfctx->total_filter_size += sizeof(uint32_t) +
+					      bloom_filter_bytes(&bfi->filter);
 }
 
 static void add_missing_parents(struct write_commit_graph_context *ctx, struct commit *commit)
@@ -2396,6 +2457,8 @@ int write_commit_graph(const char *obj_dir,
 		diff_setup_done(&ctx->mpbfctx.diffopt);
 
 		strbuf_init(&ctx->mpbfctx.prev_path, PATH_MAX);
+		hashmap_init(&ctx->mpbfctx.dedup_hashmap,
+			     modified_path_bloom_filter_dedup_cmp, NULL, 0);
 	}
 
 	if (ctx->split) {
@@ -2525,6 +2588,9 @@ int write_commit_graph(const char *obj_dir,
 		strbuf_release(&ctx->mpbfctx.prev_path);
 		free(ctx->mpbfctx.hashed_prefix_lengths);
 		free(ctx->mpbfctx.hashes);
+		hashmap_free_entries(&ctx->mpbfctx.dedup_hashmap,
+				struct modified_path_bloom_filter_dedup_entry,
+				entry);
 	}
 
 	free(ctx);
-- 
2.27.0.rc1.431.g5c813f95dc


  parent reply	other threads:[~2020-05-29  8:52 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-29  8:50 [PoC PATCH 00/34] An alternative modified path Bloom filters implementation SZEDER Gábor
2020-05-28 22:09 ` Johannes Schindelin
2020-05-29  8:50 ` [PATCH 01/34] tree-walk.c: don't match submodule entries for 'submod/anything' SZEDER Gábor
2020-05-29  8:50 ` [PATCH 02/34] commit-graph: fix parsing the Chunk Lookup table SZEDER Gábor
2020-05-29  8:50 ` [PATCH 03/34] commit-graph-format.txt: all multi-byte numbers are in network byte order SZEDER Gábor
2020-05-29  8:50 ` [PATCH 04/34] commit-slab: add a function to deep free entries on the slab SZEDER Gábor
2020-06-04 16:43   ` Derrick Stolee
2020-06-27 15:56     ` SZEDER Gábor
2020-06-29 11:30       ` Derrick Stolee
2020-05-29  8:50 ` [PATCH 05/34] diff.h: drop diff_tree_oid() & friends' return value SZEDER Gábor
2020-05-29  8:50 ` [PATCH 06/34] commit-graph: clean up #includes SZEDER Gábor
2020-05-29  8:50 ` [PATCH 07/34] commit-graph: simplify parse_commit_graph() #1 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 08/34] commit-graph: simplify parse_commit_graph() #2 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 09/34] commit-graph: simplify write_commit_graph_file() #1 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 10/34] commit-graph: simplify write_commit_graph_file() #2 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 11/34] commit-graph: allocate the 'struct chunk_info' array dinamically SZEDER Gábor
2020-05-29  8:50 ` [PATCH 12/34] commit-graph: unify the signatures of all write_graph_chunk_*() functions SZEDER Gábor
2020-05-29  8:50 ` [PATCH 13/34] commit-graph: simplify write_commit_graph_file() #3 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 14/34] commit-graph: check chunk sizes after writing SZEDER Gábor
2020-05-29  8:50 ` [PATCH 15/34] commit-graph-format.txt: document the modified path Bloom filter chunks SZEDER Gábor
2020-06-02 17:59   ` SZEDER Gábor
2020-05-29  8:50 ` [PATCH 16/34] Add a generic and minimal Bloom filter implementation SZEDER Gábor
2020-05-29  8:50 ` [PATCH 17/34] Import a streaming-capable Murmur3 hash function implementation SZEDER Gábor
2020-05-29  8:50 ` [PATCH 18/34] commit-graph: write "empty" Modified Path Bloom Filter Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 19/34] commit-graph: add commit slab for modified path Bloom filters SZEDER Gábor
2020-05-29  8:50 ` [PATCH 20/34] commit-graph: fill the Modified Path Bloom Filter Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 21/34] commit-graph: load and use " SZEDER Gábor
2020-05-29  8:50 ` [PATCH 22/34] commit-graph: write the Modified Path Bloom Filters chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 23/34] commit-graph: load and use " SZEDER Gábor
2020-05-29  8:50 ` [PATCH 24/34] commit-graph: check all leading directories in modified path Bloom filters SZEDER Gábor
2020-05-29  8:50 ` [PATCH 25/34] commit-graph: check embedded modified path Bloom filters with a mask SZEDER Gábor
2020-05-29  8:50 ` SZEDER Gábor [this message]
2020-05-29  8:50 ` [PATCH 27/34] commit-graph: load modified path Bloom filters for merge commits SZEDER Gábor
2020-05-29  8:50 ` [PATCH 28/34] commit-graph: write Modified Path Bloom Filter Merge Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 29/34] commit-graph: extract init and free write_commit_graph_context SZEDER Gábor
2020-05-29  8:50 ` [PATCH 30/34] commit-graph: move write_commit_graph_reachable below write_commit_graph SZEDER Gábor
2020-05-29  8:50 ` [PATCH 31/34] t7007-show: make the first test compatible with the next patch SZEDER Gábor
2020-05-29  8:50 ` [PATCH 32/34] PoC commit-graph: use revision walk machinery for '--reachable' SZEDER Gábor
2020-05-29  8:50 ` [PATCH 33/34] commit-graph: write modified path Bloom filters in "history order" SZEDER Gábor
2020-05-29  8:50 ` [PATCH 34/34] commit-graph: use modified path Bloom filters with wildcards, if possible SZEDER Gábor
2020-05-29 13:59 ` [PoC PATCH 00/34] An alternative modified path Bloom filters implementation Derrick Stolee
2020-06-01 23:25 ` Taylor Blau
2020-06-02 17:08   ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200529085038.26008-27-szeder.dev@gmail.com \
    --to=szeder.dev@gmail.com \
    --cc=garima.singh@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=jnareb@gmail.com \
    --cc=me@ttaylorr.com \
    --cc=peff@peff.net \
    --cc=stolee@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

git@vger.kernel.org list mirror (unofficial, one of many)

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/git
	git clone --mirror http://ou63pmih66umazou.onion/git
	git clone --mirror http://czquwvybam4bgbro.onion/git
	git clone --mirror http://hjrcffqmbrq6wope.onion/git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 git git/ https://public-inbox.org/git \
		git@vger.kernel.org
	public-inbox-index git

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.version-control.git
	nntp://ou63pmih66umazou.onion/inbox.comp.version-control.git
	nntp://czquwvybam4bgbro.onion/inbox.comp.version-control.git
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.version-control.git
	nntp://news.gmane.io/gmane.comp.version-control.git
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for the project(s) associated with this inbox:

	https://80x24.org/mirrors/git.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git