From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E64B51FA12 for ; Sat, 19 Sep 2020 09:37:14 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 4/7] gcf2: transparently retry on missing OID Date: Sat, 19 Sep 2020 09:37:11 +0000 Message-Id: <20200919093714.21776-5-e@80x24.org> In-Reply-To: <20200919093714.21776-1-e@80x24.org> References: <20200919093714.21776-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Since we only get OIDs from trusted local data sources (over.sqlite3), we can safely retry within the -gcf2 process without worry about clients spamming us with requests for invalid OIDs and triggering reopens. --- lib/PublicInbox/Gcf2Client.pm | 11 +++++-- lib/PublicInbox/Git.pm | 5 ++-- lib/PublicInbox/gcf2_libgit2.h | 17 ++++++----- script/public-inbox-gcf2 | 25 ++++++++++++++-- t/gcf2.t | 34 ++++++++++----------- t/gcf2_client.t | 54 ++++++++++++++++++++++++++++++---- 6 files changed, 109 insertions(+), 37 deletions(-) diff --git a/lib/PublicInbox/Gcf2Client.pm b/lib/PublicInbox/Gcf2Client.pm index 71fbb1d1..5120698f 100644 --- a/lib/PublicInbox/Gcf2Client.pm +++ b/lib/PublicInbox/Gcf2Client.pm @@ -7,11 +7,13 @@ use PublicInbox::Spawn qw(popen_rd); use IO::Handle (); sub new { - my $self = shift->SUPER::new('/nonexistent'); + my ($rdr) = @_; + my $self = bless {}, __PACKAGE__; my ($out_r, $out_w); pipe($out_r, $out_w) or $self->fail("pipe failed: $!"); - my $cmd = [ 'public-inbox-gcf2' ]; - @$self{qw(in pid)} = popen_rd($cmd, undef, { 0 => $out_r }); + $rdr //= {}; + $rdr->{0} = $out_r; + @$self{qw(in pid)} = popen_rd(['public-inbox-gcf2'], undef, $rdr); $self->{inflight} = []; $self->{out} = $out_w; fcntl($out_w, 1031, 4096) if $^O eq 'linux'; # 1031: F_SETPIPE_SZ @@ -32,4 +34,7 @@ sub add_git_dir { $self->fail("write error: $!"); } +# always false, since -gcf2 retries internally +sub alternates_changed {} + 1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index a7ba57f9..b49b5bd3 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -192,7 +192,8 @@ sub cat_async_step ($$) { chop($$bref) eq "\n" or fail($self, 'LF missing after blob'); } elsif ($head =~ / missing$/) { # ref($req) indicates it's already been retried - if (!ref($req) && !$in_cleanup && alternates_changed($self)) { + # -gcf2 retries internally, so it never hits this path: + if (!ref($req) && !$in_cleanup && $self->alternates_changed) { return cat_async_retry($self, $inflight, $req, $cb, $arg); } @@ -394,7 +395,7 @@ sub pub_urls { sub cat_async_begin { my ($self) = @_; - cleanup($self) if alternates_changed($self); + cleanup($self) if $self->alternates_changed; batch_prepare($self); die 'BUG: already in async' if $self->{inflight}; $self->{inflight} = []; diff --git a/lib/PublicInbox/gcf2_libgit2.h b/lib/PublicInbox/gcf2_libgit2.h index d9c79cf9..800c6bad 100644 --- a/lib/PublicInbox/gcf2_libgit2.h +++ b/lib/PublicInbox/gcf2_libgit2.h @@ -52,9 +52,13 @@ void add_alternate(SV *self, const char *objects_path) croak_if_err(rc, "git_odb_add_disk_alternate"); } -/* this requires an unabbreviated git OID */ #define CAPA(v) (sizeof(v) / sizeof((v)[0])) -void cat_oid(SV *self, int fd, SV *oidsv) + +/* + * returns true on success, false on failure + * this requires an unabbreviated git OID + */ +int cat_oid(SV *self, int fd, SV *oidsv) { /* * adjust when libgit2 gets SHA-256 support, we return the @@ -89,11 +93,8 @@ void cat_oid(SV *self, int fd, SV *oidsv) git_object_type2string( git_odb_object_type(object)), vec[1].iov_len); - } else { - vec[0].iov_base = oidptr; - vec[0].iov_len = oidlen; - vec[1].iov_base = " missing"; - vec[1].iov_len = strlen(vec[1].iov_base); + } else { /* caller retries */ + nvec = 0; } while (nvec && !err) { ssize_t w = writev(fd, vec + CAPA(vec) - nvec, nvec); @@ -136,4 +137,6 @@ void cat_oid(SV *self, int fd, SV *oidsv) git_odb_object_free(object); if (err) croak("writev error: %s", strerror(err)); + + return rc == GIT_OK; } diff --git a/script/public-inbox-gcf2 b/script/public-inbox-gcf2 index 51811698..d2d2ac8b 100755 --- a/script/public-inbox-gcf2 +++ b/script/public-inbox-gcf2 @@ -3,12 +3,33 @@ # License: AGPL-3.0+ eval { require PublicInbox::Gcf2 }; die "libgit2 development package or Inline::C missing for $0: $@\n" if $@; +my @dirs; # may get big (30K-100K) my $gcf2 = PublicInbox::Gcf2::new(); +use IO::Handle; # autoflush +STDERR->autoflush(1); +STDOUT->autoflush(1); + while () { chomp; if (m!\A/!) { # +/path/to/git-dir + push @dirs, $_; $gcf2->add_alternate("$_/objects"); - } else { - $gcf2->cat_oid(1, $_); + } elsif (!$gcf2->cat_oid(1, $_)) { + # retry once if missing. We only get unabbreviated OIDs + # from SQLite or Xapian DBs, here, so malicious clients + # can't trigger excessive retries: + my $oid = $_; + warn "I: $$ $oid missing, retrying...\n"; + + # clients may need to wait a bit for this: + $gcf2 = PublicInbox::Gcf2::new(); + $gcf2->add_alternate("$_/objects") for @dirs; + + if ($gcf2->cat_oid(1, $oid)) { + warn "I: $$ $oid found after retry\n"; + } else { + warn "W: $$ $oid missing after retry\n"; + print "$oid missing\n"; # mimic git-cat-file + } } } diff --git a/t/gcf2.t b/t/gcf2.t index 9056b340..35b2f113 100644 --- a/t/gcf2.t +++ b/t/gcf2.t @@ -76,43 +76,41 @@ SKIP: { } open my $fh, '+>', undef or BAIL_OUT "open: $!"; - my $fd = fileno($fh); $fh->autoflush(1); - $gcf2->cat_oid($fd, 'invalid'); + ok(!$gcf2->cat_oid(fileno($fh), 'invalid'), 'invalid fails'); seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!"; - is(do { local $/; <$fh> }, "invalid missing\n", 'got missing message'); + is(do { local $/; <$fh> }, '', 'nothing written'); + open $fh, '+>', undef or BAIL_OUT "open: $!"; + ok(!$gcf2->cat_oid(fileno($fh), '0'x40), 'z40 fails'); seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!"; - $gcf2->cat_oid($fd, '0'x40); - seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!"; - is(do { local $/; <$fh> }, ('0'x40)." missing\n", - 'got missing message for 0x40'); + is(do { local $/; <$fh> }, '', 'nothing written for z40'); - seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!"; - $gcf2->cat_oid($fd, $COPYING); - my $buf; + open $fh, '+>', undef or BAIL_OUT "open: $!"; my $ck_copying = sub { my ($desc) = @_; seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!"; - is(<$fh>, "$COPYING blob 34520\n", 'got expected header'); - $buf = do { local $/; <$fh> }; + is(<$fh>, "$COPYING blob 34520\n", "got expected header $desc"); + my $buf = do { local $/; <$fh> }; is(chop($buf), "\n", 'got trailing \\n'); is($buf, $agpl, "AGPL matches ($desc)"); }; + ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid normal'); $ck_copying->('regular file'); $gcf2 = PublicInbox::Gcf2::new(); $gcf2->add_alternate("$tmpdir/objects"); - $ck_copying->('alternates respected'); + open $fh, '+>', undef or BAIL_OUT "open: $!"; + ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid alternate'); + $ck_copying->('alternates after reopen'); - $^O eq 'linux' or skip('pipe tests are Linux-only', 12); - my $size = -s $fh; + $^O eq 'linux' or skip('pipe tests are Linux-only', 14); for my $blk (1, 0) { my ($r, $w); pipe($r, $w) or BAIL_OUT $!; fcntl($w, 1031, 4096) or - skip('Linux too old for F_SETPIPE_SZ', 12); + skip('Linux too old for F_SETPIPE_SZ', 14); $w->blocking($blk); seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!"; truncate($fh, 0) or BAIL_OUT "truncate: $!"; @@ -120,11 +118,11 @@ SKIP: { if ($pid == 0) { close $w; tick; # wait for parent to block on writev - $buf = do { local $/; <$r> }; + my $buf = do { local $/; <$r> }; print $fh $buf or _exit(1); _exit(0); } - $gcf2->cat_oid(fileno($w), $COPYING); + ok($gcf2->cat_oid(fileno($w), $COPYING), "cat blocking=$blk"); close $w or BAIL_OUT "close: $!"; is(waitpid($pid, 0), $pid, 'child exited'); is($?, 0, 'no error in child'); diff --git a/t/gcf2_client.t b/t/gcf2_client.t index 39f9f296..0f7e7203 100644 --- a/t/gcf2_client.t +++ b/t/gcf2_client.t @@ -10,19 +10,25 @@ use PublicInbox::Import; require_mods('PublicInbox::Gcf2'); use_ok 'PublicInbox::Gcf2Client'; my ($tmpdir, $for_destroy) = tmpdir(); -PublicInbox::Import::init_bare($tmpdir); +my $git_a = "$tmpdir/a.git"; +my $git_b = "$tmpdir/b.git"; +PublicInbox::Import::init_bare($git_a); +PublicInbox::Import::init_bare($git_b); my $fi_data = './t/git.fast-import-data'; my $rdr = {}; open $rdr->{0}, '<', $fi_data or BAIL_OUT $!; -xsys([qw(git fast-import --quiet)], { GIT_DIR => $tmpdir }, $rdr); +xsys([qw(git fast-import --quiet)], { GIT_DIR => $git_a }, $rdr); is($?, 0, 'fast-import succeeded'); my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c'; my $called = 0; +my $err_f = "$tmpdir/err"; { local $ENV{PATH} = getcwd()."/blib/script:$ENV{PATH}"; - my $gcf2c = PublicInbox::Gcf2Client->new; - $gcf2c->add_git_dir($tmpdir); + open my $err, '>', $err_f or BAIL_OUT $!; + my $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err }); + $gcf2c->add_git_dir($git_a); + $gcf2c->cat_async($tree, sub { my ($bref, $oid, $type, $size, $arg) = @_; is($oid, $tree, 'got expected OID'); @@ -32,6 +38,12 @@ my $called = 0; is($arg, 'hi', 'arg passed'); $called++; }, 'hi'); + $gcf2c->cat_async_wait; + + open $err, '<', $err_f or BAIL_OUT $!; + my $estr = do { local $/; <$err> }; + is($estr, '', 'nothing in stderr'); + my $trunc = substr($tree, 0, 39); $gcf2c->cat_async($trunc, sub { my ($bref, $oid, $type, $size, $arg) = @_; @@ -42,6 +54,38 @@ my $called = 0; is($arg, 'bye', 'arg passed when missing'); $called++; }, 'bye'); + $gcf2c->cat_async_wait; + + open $err, '<', $err_f or BAIL_OUT $!; + $estr = do { local $/; <$err> }; + like($estr, qr/retrying/, 'warned about retry'); + + # try failed alternates lookup + open $err, '>', $err_f or BAIL_OUT $!; + $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err }); + $gcf2c->add_git_dir($git_b); + $gcf2c->cat_async($tree, sub { + my ($bref, $oid, $type, $size, $arg) = @_; + is(undef, $bref, 'missing bref from alt is undef'); + $called++; + }); + $gcf2c->cat_async_wait; + open $err, '<', $err_f or BAIL_OUT $!; + $estr = do { local $/; <$err> }; + like($estr, qr/retrying/, 'warned about retry before alt update'); + + # now try successful alternates lookup + open my $alt, '>>', "$git_b/objects/info/alternates" or BAIL_OUT $!; + print $alt "$git_a/objects\n" or BAIL_OUT $!; + close $alt or BAIL_OUT; + my $expect = xqx(['git', "--git-dir=$git_a", qw(cat-file tree), $tree]); + $gcf2c->cat_async($tree, sub { + my ($bref, $oid, $type, $size, $arg) = @_; + is($oid, $tree, 'oid match on alternates retry'); + is($$bref, $expect, 'tree content matched'); + $called++; + }); + $gcf2c->cat_async_wait; } -is($called, 2, 'cat_async callbacks hit'); +is($called, 4, 'cat_async callbacks hit'); done_testing;