* [PATCH 4/7] gcf2: transparently retry on missing OID
2020-09-19 9:37 7% [PATCH 0/7] gcf2: libgit2-based cat-file alternative Eric Wong
@ 2020-09-19 9:37 4% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-09-19 9:37 UTC (permalink / raw)
To: meta
Since we only get OIDs from trusted local data sources
(over.sqlite3), we can safely retry within the -gcf2 process
without worrying about clients spamming us with requests for
invalid OIDs and triggering reopens.
---
lib/PublicInbox/Gcf2Client.pm | 11 +++++--
lib/PublicInbox/Git.pm | 5 ++--
lib/PublicInbox/gcf2_libgit2.h | 17 ++++++-----
script/public-inbox-gcf2 | 25 ++++++++++++++--
t/gcf2.t | 34 ++++++++++-----------
t/gcf2_client.t | 54 ++++++++++++++++++++++++++++++----
6 files changed, 109 insertions(+), 37 deletions(-)
diff --git a/lib/PublicInbox/Gcf2Client.pm b/lib/PublicInbox/Gcf2Client.pm
index 71fbb1d1..5120698f 100644
--- a/lib/PublicInbox/Gcf2Client.pm
+++ b/lib/PublicInbox/Gcf2Client.pm
@@ -7,11 +7,13 @@ use PublicInbox::Spawn qw(popen_rd);
use IO::Handle ();
sub new {
- my $self = shift->SUPER::new('/nonexistent');
+ my ($rdr) = @_;
+ my $self = bless {}, __PACKAGE__;
my ($out_r, $out_w);
pipe($out_r, $out_w) or $self->fail("pipe failed: $!");
- my $cmd = [ 'public-inbox-gcf2' ];
- @$self{qw(in pid)} = popen_rd($cmd, undef, { 0 => $out_r });
+ $rdr //= {};
+ $rdr->{0} = $out_r;
+ @$self{qw(in pid)} = popen_rd(['public-inbox-gcf2'], undef, $rdr);
$self->{inflight} = [];
$self->{out} = $out_w;
fcntl($out_w, 1031, 4096) if $^O eq 'linux'; # 1031: F_SETPIPE_SZ
@@ -32,4 +34,7 @@ sub add_git_dir {
$self->fail("write error: $!");
}
+# always false, since -gcf2 retries internally
+sub alternates_changed {}
+
1;
diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index a7ba57f9..b49b5bd3 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -192,7 +192,8 @@ sub cat_async_step ($$) {
chop($$bref) eq "\n" or fail($self, 'LF missing after blob');
} elsif ($head =~ / missing$/) {
# ref($req) indicates it's already been retried
- if (!ref($req) && !$in_cleanup && alternates_changed($self)) {
+ # -gcf2 retries internally, so it never hits this path:
+ if (!ref($req) && !$in_cleanup && $self->alternates_changed) {
return cat_async_retry($self, $inflight,
$req, $cb, $arg);
}
@@ -394,7 +395,7 @@ sub pub_urls {
sub cat_async_begin {
my ($self) = @_;
- cleanup($self) if alternates_changed($self);
+ cleanup($self) if $self->alternates_changed;
batch_prepare($self);
die 'BUG: already in async' if $self->{inflight};
$self->{inflight} = [];
diff --git a/lib/PublicInbox/gcf2_libgit2.h b/lib/PublicInbox/gcf2_libgit2.h
index d9c79cf9..800c6bad 100644
--- a/lib/PublicInbox/gcf2_libgit2.h
+++ b/lib/PublicInbox/gcf2_libgit2.h
@@ -52,9 +52,13 @@ void add_alternate(SV *self, const char *objects_path)
croak_if_err(rc, "git_odb_add_disk_alternate");
}
-/* this requires an unabbreviated git OID */
#define CAPA(v) (sizeof(v) / sizeof((v)[0]))
-void cat_oid(SV *self, int fd, SV *oidsv)
+
+/*
+ * returns true on success, false on failure
+ * this requires an unabbreviated git OID
+ */
+int cat_oid(SV *self, int fd, SV *oidsv)
{
/*
* adjust when libgit2 gets SHA-256 support, we return the
@@ -89,11 +93,8 @@ void cat_oid(SV *self, int fd, SV *oidsv)
git_object_type2string(
git_odb_object_type(object)),
vec[1].iov_len);
- } else {
- vec[0].iov_base = oidptr;
- vec[0].iov_len = oidlen;
- vec[1].iov_base = " missing";
- vec[1].iov_len = strlen(vec[1].iov_base);
+ } else { /* caller retries */
+ nvec = 0;
}
while (nvec && !err) {
ssize_t w = writev(fd, vec + CAPA(vec) - nvec, nvec);
@@ -136,4 +137,6 @@ void cat_oid(SV *self, int fd, SV *oidsv)
git_odb_object_free(object);
if (err)
croak("writev error: %s", strerror(err));
+
+ return rc == GIT_OK;
}
diff --git a/script/public-inbox-gcf2 b/script/public-inbox-gcf2
index 51811698..d2d2ac8b 100755
--- a/script/public-inbox-gcf2
+++ b/script/public-inbox-gcf2
@@ -3,12 +3,33 @@
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
eval { require PublicInbox::Gcf2 };
die "libgit2 development package or Inline::C missing for $0: $@\n" if $@;
+my @dirs; # may get big (30K-100K)
my $gcf2 = PublicInbox::Gcf2::new();
+use IO::Handle; # autoflush
+STDERR->autoflush(1);
+STDOUT->autoflush(1);
+
while (<STDIN>) {
chomp;
if (m!\A/!) { # +/path/to/git-dir
+ push @dirs, $_;
$gcf2->add_alternate("$_/objects");
- } else {
- $gcf2->cat_oid(1, $_);
+ } elsif (!$gcf2->cat_oid(1, $_)) {
+ # retry once if missing. We only get unabbreviated OIDs
+ # from SQLite or Xapian DBs, here, so malicious clients
+ # can't trigger excessive retries:
+ my $oid = $_;
+ warn "I: $$ $oid missing, retrying...\n";
+
+ # clients may need to wait a bit for this:
+ $gcf2 = PublicInbox::Gcf2::new();
+ $gcf2->add_alternate("$_/objects") for @dirs;
+
+ if ($gcf2->cat_oid(1, $oid)) {
+ warn "I: $$ $oid found after retry\n";
+ } else {
+ warn "W: $$ $oid missing after retry\n";
+ print "$oid missing\n"; # mimic git-cat-file
+ }
}
}
diff --git a/t/gcf2.t b/t/gcf2.t
index 9056b340..35b2f113 100644
--- a/t/gcf2.t
+++ b/t/gcf2.t
@@ -76,43 +76,41 @@ SKIP: {
}
open my $fh, '+>', undef or BAIL_OUT "open: $!";
- my $fd = fileno($fh);
$fh->autoflush(1);
- $gcf2->cat_oid($fd, 'invalid');
+ ok(!$gcf2->cat_oid(fileno($fh), 'invalid'), 'invalid fails');
seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
- is(do { local $/; <$fh> }, "invalid missing\n", 'got missing message');
+ is(do { local $/; <$fh> }, '', 'nothing written');
+ open $fh, '+>', undef or BAIL_OUT "open: $!";
+ ok(!$gcf2->cat_oid(fileno($fh), '0'x40), 'z40 fails');
seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
- $gcf2->cat_oid($fd, '0'x40);
- seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
- is(do { local $/; <$fh> }, ('0'x40)." missing\n",
- 'got missing message for 0x40');
+ is(do { local $/; <$fh> }, '', 'nothing written for z40');
- seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
- $gcf2->cat_oid($fd, $COPYING);
- my $buf;
+ open $fh, '+>', undef or BAIL_OUT "open: $!";
my $ck_copying = sub {
my ($desc) = @_;
seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
- is(<$fh>, "$COPYING blob 34520\n", 'got expected header');
- $buf = do { local $/; <$fh> };
+ is(<$fh>, "$COPYING blob 34520\n", "got expected header $desc");
+ my $buf = do { local $/; <$fh> };
is(chop($buf), "\n", 'got trailing \\n');
is($buf, $agpl, "AGPL matches ($desc)");
};
+ ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid normal');
$ck_copying->('regular file');
$gcf2 = PublicInbox::Gcf2::new();
$gcf2->add_alternate("$tmpdir/objects");
- $ck_copying->('alternates respected');
+ open $fh, '+>', undef or BAIL_OUT "open: $!";
+ ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid alternate');
+ $ck_copying->('alternates after reopen');
- $^O eq 'linux' or skip('pipe tests are Linux-only', 12);
- my $size = -s $fh;
+ $^O eq 'linux' or skip('pipe tests are Linux-only', 14);
for my $blk (1, 0) {
my ($r, $w);
pipe($r, $w) or BAIL_OUT $!;
fcntl($w, 1031, 4096) or
- skip('Linux too old for F_SETPIPE_SZ', 12);
+ skip('Linux too old for F_SETPIPE_SZ', 14);
$w->blocking($blk);
seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
truncate($fh, 0) or BAIL_OUT "truncate: $!";
@@ -120,11 +118,11 @@ SKIP: {
if ($pid == 0) {
close $w;
tick; # wait for parent to block on writev
- $buf = do { local $/; <$r> };
+ my $buf = do { local $/; <$r> };
print $fh $buf or _exit(1);
_exit(0);
}
- $gcf2->cat_oid(fileno($w), $COPYING);
+ ok($gcf2->cat_oid(fileno($w), $COPYING), "cat blocking=$blk");
close $w or BAIL_OUT "close: $!";
is(waitpid($pid, 0), $pid, 'child exited');
is($?, 0, 'no error in child');
diff --git a/t/gcf2_client.t b/t/gcf2_client.t
index 39f9f296..0f7e7203 100644
--- a/t/gcf2_client.t
+++ b/t/gcf2_client.t
@@ -10,19 +10,25 @@ use PublicInbox::Import;
require_mods('PublicInbox::Gcf2');
use_ok 'PublicInbox::Gcf2Client';
my ($tmpdir, $for_destroy) = tmpdir();
-PublicInbox::Import::init_bare($tmpdir);
+my $git_a = "$tmpdir/a.git";
+my $git_b = "$tmpdir/b.git";
+PublicInbox::Import::init_bare($git_a);
+PublicInbox::Import::init_bare($git_b);
my $fi_data = './t/git.fast-import-data';
my $rdr = {};
open $rdr->{0}, '<', $fi_data or BAIL_OUT $!;
-xsys([qw(git fast-import --quiet)], { GIT_DIR => $tmpdir }, $rdr);
+xsys([qw(git fast-import --quiet)], { GIT_DIR => $git_a }, $rdr);
is($?, 0, 'fast-import succeeded');
my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c';
my $called = 0;
+my $err_f = "$tmpdir/err";
{
local $ENV{PATH} = getcwd()."/blib/script:$ENV{PATH}";
- my $gcf2c = PublicInbox::Gcf2Client->new;
- $gcf2c->add_git_dir($tmpdir);
+ open my $err, '>', $err_f or BAIL_OUT $!;
+ my $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+ $gcf2c->add_git_dir($git_a);
+
$gcf2c->cat_async($tree, sub {
my ($bref, $oid, $type, $size, $arg) = @_;
is($oid, $tree, 'got expected OID');
@@ -32,6 +38,12 @@ my $called = 0;
is($arg, 'hi', 'arg passed');
$called++;
}, 'hi');
+ $gcf2c->cat_async_wait;
+
+ open $err, '<', $err_f or BAIL_OUT $!;
+ my $estr = do { local $/; <$err> };
+ is($estr, '', 'nothing in stderr');
+
my $trunc = substr($tree, 0, 39);
$gcf2c->cat_async($trunc, sub {
my ($bref, $oid, $type, $size, $arg) = @_;
@@ -42,6 +54,38 @@ my $called = 0;
is($arg, 'bye', 'arg passed when missing');
$called++;
}, 'bye');
+ $gcf2c->cat_async_wait;
+
+ open $err, '<', $err_f or BAIL_OUT $!;
+ $estr = do { local $/; <$err> };
+ like($estr, qr/retrying/, 'warned about retry');
+
+ # try failed alternates lookup
+ open $err, '>', $err_f or BAIL_OUT $!;
+ $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+ $gcf2c->add_git_dir($git_b);
+ $gcf2c->cat_async($tree, sub {
+ my ($bref, $oid, $type, $size, $arg) = @_;
+ is(undef, $bref, 'missing bref from alt is undef');
+ $called++;
+ });
+ $gcf2c->cat_async_wait;
+ open $err, '<', $err_f or BAIL_OUT $!;
+ $estr = do { local $/; <$err> };
+ like($estr, qr/retrying/, 'warned about retry before alt update');
+
+ # now try successful alternates lookup
+ open my $alt, '>>', "$git_b/objects/info/alternates" or BAIL_OUT $!;
+ print $alt "$git_a/objects\n" or BAIL_OUT $!;
+ close $alt or BAIL_OUT;
+ my $expect = xqx(['git', "--git-dir=$git_a", qw(cat-file tree), $tree]);
+ $gcf2c->cat_async($tree, sub {
+ my ($bref, $oid, $type, $size, $arg) = @_;
+ is($oid, $tree, 'oid match on alternates retry');
+ is($$bref, $expect, 'tree content matched');
+ $called++;
+ });
+ $gcf2c->cat_async_wait;
}
-is($called, 2, 'cat_async callbacks hit');
+is($called, 4, 'cat_async callbacks hit');
done_testing;
^ permalink raw reply related [relevance 4%]
* [PATCH 0/7] gcf2: libgit2-based cat-file alternative
@ 2020-09-19 9:37 7% Eric Wong
2020-09-19 9:37 4% ` [PATCH 4/7] gcf2: transparently retry on missing OID Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-09-19 9:37 UTC (permalink / raw)
To: meta
This allows a single cat-file-like process to handle multiple
inboxes, instead of having a "git cat-file --batch" process for
every inbox, saving pipes and process table space.
The preliminary code was done months ago, but I struggled to put
all the pieces together in a coherent way. My brain has been
scattered :x
I finally decided to make the gcf2 process a global singleton
(per-worker) to avoid complexity elsewhere in the config...
It doesn't detect or release unlinked packs + indices, yet,
so "git gc" may not free disk space until restarted.
Otherwise, it does detect new epochs and seems to be mostly
working...
Eric Wong (7):
gcf2: libgit2-based git cat-file alternative
t/gcf2: test changes to alternates
add gcf2 client and executable script
gcf2: transparently retry on missing OID
gcf2*: more descriptive package descriptions
gcf2: require git dir with OID
gcf2: wire up read-only daemons and rm -gcf2 script
MANIFEST | 5 +
lib/PublicInbox/Daemon.pm | 11 +++
lib/PublicInbox/Gcf2.pm | 89 ++++++++++++++++++
lib/PublicInbox/Gcf2Client.pm | 62 +++++++++++++
lib/PublicInbox/Git.pm | 41 ++++++---
lib/PublicInbox/GitAsyncCat.pm | 73 +++++++++++++--
lib/PublicInbox/IMAP.pm | 2 +-
lib/PublicInbox/gcf2_libgit2.h | 142 +++++++++++++++++++++++++++++
script/public-inbox-httpd | 1 +
t/gcf2.t | 162 +++++++++++++++++++++++++++++++++
t/gcf2_client.t | 90 ++++++++++++++++++
11 files changed, 652 insertions(+), 26 deletions(-)
create mode 100644 lib/PublicInbox/Gcf2.pm
create mode 100644 lib/PublicInbox/Gcf2Client.pm
create mode 100644 lib/PublicInbox/gcf2_libgit2.h
create mode 100644 t/gcf2.t
create mode 100644 t/gcf2_client.t
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-09-19 9:37 7% [PATCH 0/7] gcf2: libgit2-based cat-file alternative Eric Wong
2020-09-19 9:37 4% ` [PATCH 4/7] gcf2: transparently retry on missing OID Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).