From: "Đoàn Trần Công Danh" <congdanhqx@gmail.com>
To: Taylor Blau <me@ttaylorr.com>
Cc: git@vger.kernel.org
Subject: Re: test-tool: bloom: generate_filter for multiple string?
Date: Wed, 13 Jan 2021 18:59:52 +0700 [thread overview]
Message-ID: <X/7guF05a/Bb/VNp@danh.dev> (raw)
In-Reply-To: <X/3+PG2hi71tj/UA@nand.local>
[-- Attachment #1: Type: text/plain, Size: 855 bytes --]
On 2021-01-12 14:53:32-0500, Taylor Blau <me@ttaylorr.com> wrote:
> On Thu, Dec 31, 2020 at 10:54:38AM +0700, Đoàn Trần Công Danh wrote:
> > I'm reading the code for Bloom Filter to see if arXiv:2012.00472
> > could be an improvement.
>
> I'm late to the party, but I'm curious to hear which part of this
> article you think would help out the Bloom filter implementation.
Uhm, no. The article doesn't help the Bloom filter implementation.
The article was suggesting using Bloom filter to speed-up the
negotiation in fetch-pack and upload-pack. Which, in my own quick
experience, doesn't help much. Maybe it's me not understand the
article idea or I have made a naive implementation. However, I'm not
convinced to pursued further.
If you are curious, I'm attaching 2 quick-and-low-quality patches with
this email for your consideration.
--
Danh
[-- Attachment #2: 0002-fetch-pack-implent-bloom-filter-in-client-side.patch --]
[-- Type: text/x-diff, Size: 4579 bytes --]
From 287d1b94606bbc5475909e5298560cbd28ee998e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=90o=C3=A0n=20Tr=E1=BA=A7n=20C=C3=B4ng=20Danh?=
<congdanhqx@gmail.com>
Date: Thu, 31 Dec 2020 10:58:22 +0700
Subject: [PATCH 2/3] fetch-pack: implent bloom filter in client side
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Đoàn Trần Công Danh <congdanhqx@gmail.com>
---
fetch-pack.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 81 insertions(+), 4 deletions(-)
diff --git a/fetch-pack.c b/fetch-pack.c
index 876f90c759..f423ccd816 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -23,6 +23,9 @@
#include "fetch-negotiator.h"
#include "fsck.h"
#include "shallow.h"
+#include "bloom.h"
+#include "list-objects.h"
+#include "revision.h"
static int transfer_unpack_limit = -1;
static int fetch_unpack_limit = -1;
@@ -1184,6 +1187,71 @@ static int add_haves(struct fetch_negotiator *negotiator,
return ret;
}
+static int rev_info_insert_ref_oid(const char *refname,
+ const struct object_id *oid,
+ int flag, void *cbdata)
+{
+ struct rev_info *rev_info = cbdata;
+ add_pending_oid(rev_info, refname, oid, 0);
+ return 1;
+}
+
+static void rev_info_insert_to_oidset(struct commit *commit, void *cbdata)
+{
+ struct oidset *set = cbdata;
+ oidset_insert(set, &commit->object.oid);
+}
+
+static void add_bloom(struct strbuf *req_buf, struct oidset *common)
+{
+ struct oidset_iter iter;
+ struct oidset haves = OIDSET_INIT;
+ struct rev_info rev_info;
+ struct bloom_filter filter;
+ struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+ struct strbuf buf;
+ const struct object_id *oid;
+ int i;
+
+ if (oidset_size(common) == 0)
+ return;
+
+ repo_init_revisions(the_repository, &rev_info, NULL);
+
+ oidset_iter_init(common, &iter);
+ while ((oid = oidset_iter_next(&iter))) {
+ add_pending_oid(&rev_info, oid_to_hex(oid), oid,
+ UNINTERESTING | BOTTOM);
+ }
+
+ for_each_rawref(rev_info_insert_ref_oid, &rev_info);
+ if (prepare_revision_walk(&rev_info)) {
+ warning("couldn't prepare revision walk for fetch");
+ return;
+ }
+ traverse_commit_list(&rev_info, rev_info_insert_to_oidset, NULL, &haves);
+ if (oidset_size(&haves) == 0)
+ return;
+ filter.len = (oidset_size(&haves) * settings.bits_per_entry + BITS_PER_WORD - 1)
+ / BITS_PER_WORD;
+ free(filter.data);
+ filter.data = xcalloc(filter.len, sizeof(unsigned char));
+ oidset_iter_init(&haves, &iter);
+ while ((oid = oidset_iter_next(&iter))) {
+ struct bloom_key key;
+ fill_bloom_key((const char *)oid->hash, the_hash_algo->rawsz,
+ &key, &settings);
+ add_key_to_filter(&key, &filter, &settings);
+ }
+
+ strbuf_init(&buf, filter.len * 2 + strlen("bloom \n"));
+ strbuf_addstr(&buf, "bloom ");
+ for (i = 0; i < filter.len; i++)
+ strbuf_addf(&buf, "%02x", filter.data[i]);
+ strbuf_addch(&buf, '\n');
+ packet_buf_write_len(req_buf, buf.buf, buf.len);
+}
+
static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
struct fetch_pack_args *args,
const struct ref *wants, struct oidset *common,
@@ -1191,6 +1259,7 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
int sideband_all, int seen_ack)
{
int ret = 0;
+ int transferbloom = 0;
const char *hash_name;
struct strbuf req_buf = STRBUF_INIT;
@@ -1278,6 +1347,11 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
ret = add_haves(negotiator, seen_ack, &req_buf,
haves_to_send, in_vain);
+ /* Add bloom filter represent commits we have from known common */
+ git_config_get_maybe_bool("transfer.bloom", &transferbloom);
+ if (transferbloom && server_supports_v2("bloom", 0))
+ add_bloom(&req_buf, common);
+
/* Send request */
packet_buf_flush(&req_buf);
if (write_in_full(fd_out, req_buf.buf, req_buf.len) < 0)
@@ -1352,11 +1426,14 @@ static enum common_found process_acks(struct fetch_negotiator *negotiator,
struct object_id oid;
received_ack = 1;
if (!get_oid_hex(arg, &oid)) {
+ struct object *obj;
struct commit *commit;
- oidset_insert(common, &oid);
- commit = lookup_commit(the_repository, &oid);
- if (negotiator)
- negotiator->ack(negotiator, commit);
+ obj = lookup_object(the_repository, &oid);
+ if (obj && (commit = object_as_type(obj, OBJ_COMMIT, 0))) {
+ oidset_insert(common, &oid);
+ if (negotiator)
+ negotiator->ack(negotiator, commit);
+ }
}
continue;
}
--
2.30.0.4.gbe3229325d
[-- Attachment #3: 0003-upload-pack-support-bloom-filter.patch --]
[-- Type: text/x-diff, Size: 6093 bytes --]
From 2c93a1a6c37d40ba37eb083a139b6f1312a547e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=90o=C3=A0n=20Tr=E1=BA=A7n=20C=C3=B4ng=20Danh?=
<congdanhqx@gmail.com>
Date: Mon, 4 Jan 2021 20:23:29 +0700
Subject: [PATCH 3/3] upload-pack: support bloom filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Đoàn Trần Công Danh <congdanhqx@gmail.com>
---
serve.c | 1 +
t/t5701-git-serve.sh | 1 +
upload-pack.c | 86 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 88 insertions(+)
diff --git a/serve.c b/serve.c
index eec2fe6f29..daea53bfb7 100644
--- a/serve.c
+++ b/serve.c
@@ -78,6 +78,7 @@ static struct protocol_capability capabilities[] = {
{ "server-option", always_advertise, NULL },
{ "object-format", object_format_advertise, NULL },
{ "session-id", session_id_advertise, NULL },
+ { "bloom", always_advertise, NULL },
};
static void advertise_capabilities(void)
diff --git a/t/t5701-git-serve.sh b/t/t5701-git-serve.sh
index a1f5fdc9fd..9fabdba6b1 100755
--- a/t/t5701-git-serve.sh
+++ b/t/t5701-git-serve.sh
@@ -16,6 +16,7 @@ test_expect_success 'test capability advertisement' '
fetch=shallow
server-option
object-format=$(test_oid algo)
+ bloom
0000
EOF
diff --git a/upload-pack.c b/upload-pack.c
index 3b66bf92ba..0eff165865 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -27,6 +27,7 @@
#include "commit-graph.h"
#include "commit-reach.h"
#include "shallow.h"
+#include "bloom.h"
/* Remember to update object flag allocation in object.h */
#define THEY_HAVE (1u << 11)
@@ -62,6 +63,8 @@ struct upload_pack_data {
struct object_array have_obj;
struct oid_array haves; /* v2 only */
struct string_list wanted_refs; /* v2 only */
+ struct bloom_filter bloom_filter; /* v2 only */
+ struct bloom_filter_settings bloom_filter_settings;
struct object_array shallows;
struct string_list deepen_not;
@@ -113,6 +116,13 @@ struct upload_pack_data {
unsigned advertise_sid : 1;
};
+struct upload_pack_bloom_args
+{
+ struct bloom_filter *bloom_filter;
+ struct bloom_filter_settings *bloom_filter_settings;
+ struct oid_array *acks;
+};
+
static void upload_pack_data_init(struct upload_pack_data *data)
{
struct string_list symref = STRING_LIST_INIT_DUP;
@@ -125,6 +135,7 @@ static void upload_pack_data_init(struct upload_pack_data *data)
struct string_list uri_protocols = STRING_LIST_INIT_DUP;
struct object_array extra_edge_obj = OBJECT_ARRAY_INIT;
struct string_list allowed_filters = STRING_LIST_INIT_DUP;
+ struct bloom_filter_settings bloom_filter_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
memset(data, 0, sizeof(*data));
data->symref = symref;
@@ -132,6 +143,7 @@ static void upload_pack_data_init(struct upload_pack_data *data)
data->want_obj = want_obj;
data->have_obj = have_obj;
data->haves = haves;
+ data->bloom_filter_settings = bloom_filter_settings;
data->shallows = shallows;
data->deepen_not = deepen_not;
data->uri_protocols = uri_protocols;
@@ -1464,6 +1476,34 @@ static int parse_have(const char *line, struct oid_array *haves)
return 0;
}
+static int parse_bloom(const char *line, struct bloom_filter *filter,
+ struct bloom_filter_settings *settings)
+{
+ const char *arg;
+ unsigned char *p;
+ size_t len;
+ if (!skip_prefix(line, "bloom ", &arg))
+ return 0;
+
+ len = strlen(arg);
+ if (len % 2)
+ die("git upload-pack: corrupted bloom data with len: %zu", len);
+ filter->len = len / 2;
+ filter->data = p = xmalloc(filter->len);
+ while (*arg) {
+ int val = hex2chr(arg);
+ if (val < 0)
+ goto errout;
+ *p++ = val;
+ arg += 2;
+ }
+ return 1;
+errout:
+ filter->len = 0;
+ FREE_AND_NULL(filter->data);
+ return 0;
+}
+
static void process_args(struct packet_reader *request,
struct upload_pack_data *data)
{
@@ -1482,6 +1522,9 @@ static void process_args(struct packet_reader *request,
if (parse_have(arg, &data->haves))
continue;
+ if (parse_bloom(arg, &data->bloom_filter, &data->bloom_filter_settings))
+ continue;
+
/* process args like thin-pack */
if (!strcmp(arg, "thin-pack")) {
data->use_thin_pack = 1;
@@ -1570,6 +1613,48 @@ static int process_haves(struct upload_pack_data *data, struct oid_array *common
return 0;
}
+static void rev_info_bloom_ack(struct commit *commit, void *cbdata)
+{
+ struct upload_pack_bloom_args *args = cbdata;
+ struct bloom_key key;
+ struct object_id *oid = &commit->object.oid;
+ fill_bloom_key((const char *)oid->hash, the_hash_algo->rawsz,
+ &key, args->bloom_filter_settings);
+ if (bloom_filter_contains(args->bloom_filter, &key,
+ args->bloom_filter_settings))
+ oid_array_append(args->acks, oid);
+}
+
+/* Walking from wanted_obj until haves, add all MAYBE objects to acks */
+static int process_bloom(struct upload_pack_data *data, struct oid_array *acks)
+{
+ struct upload_pack_bloom_args args;
+ struct rev_info rev_info;
+ const struct object_id *oid;
+ int i;
+
+ /* unknown bloom data and/or no known common objects */
+ if (!data->bloom_filter.len || !acks->nr)
+ return 0;
+ repo_init_revisions(the_repository, &rev_info, NULL);
+ for (i = 0; i < acks->nr; i++) {
+ oid = &acks->oid[i];
+ add_pending_oid(&rev_info, oid_to_hex(oid), oid,
+ UNINTERESTING | BOTTOM);
+ }
+ for (i = 0; i < data->want_obj.nr; i++) {
+ oid = &data->want_obj.objects[i].item->oid;
+ add_pending_oid(&rev_info, oid_to_hex(oid), oid, 0);
+ }
+ if (prepare_revision_walk(&rev_info))
+ return 0;
+ args.bloom_filter = &data->bloom_filter;
+ args.bloom_filter_settings = &data->bloom_filter_settings;
+ args.acks = acks;
+ traverse_commit_list(&rev_info, rev_info_bloom_ack, NULL, &args);
+ return 0;
+}
+
static int send_acks(struct upload_pack_data *data, struct oid_array *acks)
{
int i;
@@ -1600,6 +1685,7 @@ static int process_haves_and_send_acks(struct upload_pack_data *data)
int ret = 0;
process_haves(data, &common);
+ process_bloom(data, &common);
if (data->done) {
ret = 1;
} else if (send_acks(data, &common)) {
--
2.30.0.4.gbe3229325d
next prev parent reply other threads:[~2021-01-13 12:04 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-12-31 3:54 test-tool: bloom: generate_filter for multiple string? Đoàn Trần Công Danh
2020-12-31 11:31 ` Derrick Stolee
2021-01-05 13:34 ` Đoàn Trần Công Danh
2021-01-12 19:53 ` Taylor Blau
2021-01-13 11:59 ` Đoàn Trần Công Danh [this message]
2021-01-13 12:06 ` Derrick Stolee
2021-01-13 12:13 ` Đoàn Trần Công Danh
2021-01-13 15:13 ` Taylor Blau
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=X/7guF05a/Bb/VNp@danh.dev \
--to=congdanhqx@gmail.com \
--cc=git@vger.kernel.org \
--cc=me@ttaylorr.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).