git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: "SZEDER Gábor" <szeder.dev@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Garima Singh" <garima.singh@microsoft.com>,
	"Derrick Stolee" <stolee@gmail.com>,
	"Jakub Narebski" <jnareb@gmail.com>, "Jeff King" <peff@peff.net>,
	"Taylor Blau" <me@ttaylorr.com>,
	"SZEDER Gábor" <szeder.dev@gmail.com>
Subject: [PATCH 16/34] Add a generic and minimal Bloom filter implementation
Date: Fri, 29 May 2020 10:50:20 +0200	[thread overview]
Message-ID: <20200529085038.26008-17-szeder.dev@gmail.com> (raw)
In-Reply-To: <20200529085038.26008-1-szeder.dev@gmail.com>

We'll use Bloom filters to store the paths modified between each
commit and one of its parents.  This means many small Bloom filters,
which puts some constraints on the hashing scheme, as discussed in the
previous commit.  However, Bloom filters might turn out to be useful
in other parts of Git as well, where they might be free from those
constraints.  So add a Bloom filter implementation that is fairly
generic, and copes only with allocation, initialization and release of
the filter and with manipulating the bits in its bit array, but leaves
all the hashing to the user of the filter.

Unfortunately, it's still somewhat specific, though, in how it maps
the hashes to specific bits in memory, i.e. how it maps the hashes to
array indices and how the bit array is indexed.  We'll see how that
will pan out...
---
 Makefile       |  1 +
 bloom-filter.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++
 bloom-filter.h | 47 ++++++++++++++++++++++++++
 3 files changed, 139 insertions(+)
 create mode 100644 bloom-filter.c
 create mode 100644 bloom-filter.h

diff --git a/Makefile b/Makefile
index 09f98b777c..a9db1e8d9b 100644
--- a/Makefile
+++ b/Makefile
@@ -839,6 +839,7 @@ LIB_OBJS += base85.o
 LIB_OBJS += bisect.o
 LIB_OBJS += blame.o
 LIB_OBJS += blob.o
+LIB_OBJS += bloom-filter.o
 LIB_OBJS += branch.o
 LIB_OBJS += bulk-checkin.o
 LIB_OBJS += bundle.o
diff --git a/bloom-filter.c b/bloom-filter.c
new file mode 100644
index 0000000000..919da26462
--- /dev/null
+++ b/bloom-filter.c
@@ -0,0 +1,91 @@
+#include "bloom-filter.h"
+
+void bloom_filter_init(struct bloom_filter *bf, unsigned int nr_hashes,
+		       uint32_t nr_elements)
+{
+	/* n * k / ln(2) ≈ n * k / 0.69315 ≈ n * k * 10 / 7 */
+	uint32_t nr_bits = st_mult(st_mult(nr_elements, nr_hashes), 10) / 7;
+	uint32_t nr_bytes = st_add(nr_bits, 7) / 8;
+	/*
+	 * But we round up to fully utilize all bytes, thus lowering the
+	 * probability of false positives a bit.
+	 */
+	bf->nr_bits = nr_bytes * 8;
+	bf->bits = xcalloc(nr_bytes, sizeof(*(bf->bits)));
+}
+
+void bloom_filter_init_with_size(struct bloom_filter *bf, uint32_t nr_bits)
+{
+	uint32_t nr_bytes = st_add(nr_bits, 7) / 8;
+	bf->nr_bits = nr_bits;
+	bf->bits = xcalloc(nr_bytes, sizeof(*(bf->bits)));
+}
+
+void bloom_filter_free(struct bloom_filter *bf)
+{
+	FREE_AND_NULL(bf->bits);
+	bf->nr_bits = 0;
+}
+
+uint32_t bloom_filter_bytes(struct bloom_filter *bf)
+{
+	return (bf->nr_bits + 7) / 8;
+}
+
+void bloom_filter_clear_all_bits(struct bloom_filter *bf)
+{
+	memset(bf->bits, 0, bloom_filter_bytes(bf));
+}
+
+void bloom_filter_set_all_bits(struct bloom_filter *bf)
+{
+	memset(bf->bits, 0xff, bloom_filter_bytes(bf));
+}
+
+static inline uint32_t bit_offset(uint32_t nr_bits, uint32_t hash)
+{
+	return hash % nr_bits;
+}
+
+static inline uint32_t byte_offset(uint32_t nr_bits, uint32_t bit_offset)
+{
+	return (nr_bits - 1) / 8 - bit_offset / 8;
+}
+
+static inline uint8_t byte_mask(uint32_t bit_offset)
+{
+	return 1 << (bit_offset % 8);
+}
+
+static inline void bloom_filter_set_one_bit(struct bloom_filter *bf,
+					    uint32_t hash)
+{
+	uint32_t offset = bit_offset(bf->nr_bits, hash);
+	bf->bits[byte_offset(bf->nr_bits, offset)] |= byte_mask(offset);
+}
+
+void bloom_filter_set_bits(struct bloom_filter *bf, const uint32_t *hashes,
+			   unsigned int nr_hashes)
+{
+	unsigned int i;
+	for (i = 0; i < nr_hashes; i++)
+		bloom_filter_set_one_bit(bf, hashes[i]);
+}
+
+static inline int bloom_filter_check_one_bit(struct bloom_filter *bf,
+					     uint32_t hash)
+{
+	uint32_t offset = bit_offset(bf->nr_bits, hash);
+	return bf->bits[byte_offset(bf->nr_bits, offset)] & byte_mask(offset);
+}
+
+enum bloom_result bloom_filter_check_bits(struct bloom_filter *bf,
+					  const uint32_t *hashes,
+					  unsigned int nr_hashes)
+{
+	unsigned int i;
+	for (i = 0; i < nr_hashes; i++)
+		if (!bloom_filter_check_one_bit(bf, hashes[i]))
+			return BLOOM_DEFINITELY_NOT;
+	return BLOOM_POSSIBLY_YES;
+}
diff --git a/bloom-filter.h b/bloom-filter.h
new file mode 100644
index 0000000000..2fcd4a2c19
--- /dev/null
+++ b/bloom-filter.h
@@ -0,0 +1,47 @@
+#ifndef BLOOM_FILTER_H
+#define BLOOM_FILTER_H
+
+#include "git-compat-util.h"
+
+enum bloom_result {
+	/*
+	 * BLOOM_CANT_TELL is a value that a caller can use to report that
+	 * a Bloom filter is not available; bloom_filter_check_bits() will
+	 * never return it.
+	 */
+	BLOOM_CANT_TELL = -1,
+	BLOOM_DEFINITELY_NOT = 0,
+	BLOOM_POSSIBLY_YES = 1
+};
+
+struct bloom_filter {
+	uint32_t nr_bits;
+	uint8_t *bits;
+};
+
+/*
+ * Initialize a Bloom filter with the number of bits that is (close to)
+ * optimal to hold the given number of elements using the given number
+ * of hashes per element.
+ */
+void bloom_filter_init(struct bloom_filter *bf, uint32_t nr_hashes,
+		       uint32_t nr_elements);
+
+/* Initialize a Bloom filter with the given number of bits */
+void bloom_filter_init_with_size(struct bloom_filter *bf, uint32_t nr_bits);
+
+void bloom_filter_free(struct bloom_filter *bf);
+
+/* Return the size of the Bloom filter's bit array in bytes */
+uint32_t bloom_filter_bytes(struct bloom_filter *bf);
+
+void bloom_filter_clear_all_bits(struct bloom_filter *bf);
+void bloom_filter_set_all_bits(struct bloom_filter *bf);
+
+void bloom_filter_set_bits(struct bloom_filter *bf, const uint32_t *hashes,
+			   unsigned int nr_hashes);
+enum bloom_result bloom_filter_check_bits(struct bloom_filter *bf,
+					  const uint32_t *hashes,
+					  unsigned int nr_hashes);
+
+#endif
-- 
2.27.0.rc1.431.g5c813f95dc


  parent reply	other threads:[~2020-05-29  8:52 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-29  8:50 [PoC PATCH 00/34] An alternative modified path Bloom filters implementation SZEDER Gábor
2020-05-28 22:09 ` Johannes Schindelin
2020-05-29  8:50 ` [PATCH 01/34] tree-walk.c: don't match submodule entries for 'submod/anything' SZEDER Gábor
2020-05-29  8:50 ` [PATCH 02/34] commit-graph: fix parsing the Chunk Lookup table SZEDER Gábor
2020-05-29  8:50 ` [PATCH 03/34] commit-graph-format.txt: all multi-byte numbers are in network byte order SZEDER Gábor
2020-05-29  8:50 ` [PATCH 04/34] commit-slab: add a function to deep free entries on the slab SZEDER Gábor
2020-06-04 16:43   ` Derrick Stolee
2020-06-27 15:56     ` SZEDER Gábor
2020-06-29 11:30       ` Derrick Stolee
2020-05-29  8:50 ` [PATCH 05/34] diff.h: drop diff_tree_oid() & friends' return value SZEDER Gábor
2020-05-29  8:50 ` [PATCH 06/34] commit-graph: clean up #includes SZEDER Gábor
2020-05-29  8:50 ` [PATCH 07/34] commit-graph: simplify parse_commit_graph() #1 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 08/34] commit-graph: simplify parse_commit_graph() #2 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 09/34] commit-graph: simplify write_commit_graph_file() #1 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 10/34] commit-graph: simplify write_commit_graph_file() #2 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 11/34] commit-graph: allocate the 'struct chunk_info' array dinamically SZEDER Gábor
2020-05-29  8:50 ` [PATCH 12/34] commit-graph: unify the signatures of all write_graph_chunk_*() functions SZEDER Gábor
2020-05-29  8:50 ` [PATCH 13/34] commit-graph: simplify write_commit_graph_file() #3 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 14/34] commit-graph: check chunk sizes after writing SZEDER Gábor
2020-05-29  8:50 ` [PATCH 15/34] commit-graph-format.txt: document the modified path Bloom filter chunks SZEDER Gábor
2020-06-02 17:59   ` SZEDER Gábor
2020-05-29  8:50 ` SZEDER Gábor [this message]
2020-05-29  8:50 ` [PATCH 17/34] Import a streaming-capable Murmur3 hash function implementation SZEDER Gábor
2020-05-29  8:50 ` [PATCH 18/34] commit-graph: write "empty" Modified Path Bloom Filter Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 19/34] commit-graph: add commit slab for modified path Bloom filters SZEDER Gábor
2020-05-29  8:50 ` [PATCH 20/34] commit-graph: fill the Modified Path Bloom Filter Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 21/34] commit-graph: load and use " SZEDER Gábor
2020-05-29  8:50 ` [PATCH 22/34] commit-graph: write the Modified Path Bloom Filters chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 23/34] commit-graph: load and use " SZEDER Gábor
2020-05-29  8:50 ` [PATCH 24/34] commit-graph: check all leading directories in modified path Bloom filters SZEDER Gábor
2020-05-29  8:50 ` [PATCH 25/34] commit-graph: check embedded modified path Bloom filters with a mask SZEDER Gábor
2020-05-29  8:50 ` [PATCH 26/34] commit-graph: deduplicate modified path Bloom filters SZEDER Gábor
2020-05-29  8:50 ` [PATCH 27/34] commit-graph: load modified path Bloom filters for merge commits SZEDER Gábor
2020-05-29  8:50 ` [PATCH 28/34] commit-graph: write Modified Path Bloom Filter Merge Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 29/34] commit-graph: extract init and free write_commit_graph_context SZEDER Gábor
2020-05-29  8:50 ` [PATCH 30/34] commit-graph: move write_commit_graph_reachable below write_commit_graph SZEDER Gábor
2020-05-29  8:50 ` [PATCH 31/34] t7007-show: make the first test compatible with the next patch SZEDER Gábor
2020-05-29  8:50 ` [PATCH 32/34] PoC commit-graph: use revision walk machinery for '--reachable' SZEDER Gábor
2020-05-29  8:50 ` [PATCH 33/34] commit-graph: write modified path Bloom filters in "history order" SZEDER Gábor
2020-05-29  8:50 ` [PATCH 34/34] commit-graph: use modified path Bloom filters with wildcards, if possible SZEDER Gábor
2020-05-29 13:59 ` [PoC PATCH 00/34] An alternative modified path Bloom filters implementation Derrick Stolee
2020-06-01 23:25 ` Taylor Blau
2020-06-02 17:08   ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200529085038.26008-17-szeder.dev@gmail.com \
    --to=szeder.dev@gmail.com \
    --cc=garima.singh@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=jnareb@gmail.com \
    --cc=me@ttaylorr.com \
    --cc=peff@peff.net \
    --cc=stolee@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).