From: Eric Wong <e@80x24.org> To: meta@public-inbox.org Subject: [PATCH 3/5] clone|--mirror: fix and test against pre-manifest WWW Date: Fri, 24 Sep 2021 10:56:43 +0000 [thread overview] Message-ID: <20210924105645.8627-4-e@80x24.org> (raw) In-Reply-To: <20210924105645.8627-1-e@80x24.org> There may still be pre-manifest.js.gz versions of PublicInbox::WWW running and serving v2 inboxes. Since $INBOX_URL/manifest.js.gz was not understood, it was assumed to be a Message-ID and 301-ed to "$INBOX_URL/manifest.js.gz/" with a trailing slash, so our 404 checks were invalid. Update our fallbacks to deal with 301 by catching JSON decoding errors to trigger HTML scraping. For HTML parsing, be sure to not be fooled by potential user-generated content and only scan the part after the last <hr>. We also need to avoid propagating $? from curl unnecessarily when we can continue safely. Finally, update v2mirror.t with tests to use PublicInbox::WWW from our "v1.1.0-pre1" tag to ensure these code paths get tested. --- lib/PublicInbox/LeiMirror.pm | 13 ++++-- lib/PublicInbox/TestCommon.pm | 1 + t/v2mirror.t | 78 +++++++++++++++++++++++++++++++---- 3 files changed, 82 insertions(+), 10 deletions(-) diff --git a/lib/PublicInbox/LeiMirror.pm b/lib/PublicInbox/LeiMirror.pm index 53f7dd31..fe81b967 100644 --- a/lib/PublicInbox/LeiMirror.pm +++ b/lib/PublicInbox/LeiMirror.pm @@ -42,7 +42,8 @@ sub try_scrape { # we grep with URL below, we don't want Subject/From headers # making us clone random URLs - my @urls = ($html =~ m!\bgit clone --mirror ([a-z\+]+://\S+)!g); + my @html = split(/<hr>/, $html); + my @urls = ($html[-1] =~ m!\bgit clone --mirror ([a-z\+]+://\S+)!g); my $url = $uri->as_string; chop($url) eq '/' or die "BUG: $uri not canonicalized"; @@ -184,7 +185,9 @@ sub run_reap { my $reap = PublicInbox::OnDestroy->new($lei->can('sigint_reap'), $pid); waitpid($pid, 0) == $pid or die "waitpid @$cmd: $!"; @$reap = (); # cancel reap - $? + my $ret = $?; + $? 
= 0; # don't let it influence normal exit + $ret; } sub clone_v1 { @@ -358,7 +361,11 @@ sub try_manifest { return try_scrape($self) if ($cerr >> 8) == 22; # 404 missing return $lei->child_error($cerr, "@$cmd failed"); } - my $m = decode_manifest($ft, $fn, $uri); + my $m = eval { decode_manifest($ft, $fn, $uri) }; + if ($@) { + warn $@; + return try_scrape($self); + } my ($path_pfx, $v1_path, @v2_epochs) = deduce_epochs($m, $path); if (@v2_epochs) { # It may be possible to have v1 + v2 in parallel someday: diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index aff34853..cd706e0e 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -469,6 +469,7 @@ sub start_script { $ENV{LISTEN_PID} = $$; $ENV{LISTEN_FDS} = $fds; } + if ($opt->{-C}) { chdir($opt->{-C}) or die "chdir: $!" } $0 = join(' ', @$cmd); if ($sub) { eval { PublicInbox::DS->Reset }; diff --git a/t/v2mirror.t b/t/v2mirror.t index 20a8daaa..1231b72d 100644 --- a/t/v2mirror.t +++ b/t/v2mirror.t @@ -5,6 +5,7 @@ use v5.10.1; use PublicInbox::TestCommon; use File::Path qw(remove_tree make_path); use Cwd qw(abs_path); +use PublicInbox::Spawn qw(which); require_git(2.6); require_cmd('curl'); local $ENV{HOME} = abs_path('t'); @@ -23,7 +24,8 @@ my $pi_config = "$tmpdir/config"; open my $fh, '>', $pi_config or die "open($pi_config): $!"; print $fh <<"" or die "print $pi_config: $!"; [publicinbox "v2"] - inboxdir = $tmpdir/in +; using "mainrepo" rather than "inboxdir" for v1.1.0-pre1 WWW compat below + mainrepo = $tmpdir/in address = test\@example.com close $fh or die "close($pi_config): $!"; @@ -62,11 +64,11 @@ $v2w->done; } $ibx->cleanup; -my $sock = tcp_server(); +local $ENV{TEST_IPV4_ONLY} = 1; # plackup (below) doesn't do IPv6 +my $rdr = { 3 => tcp_server() }; my @cmd = ('-httpd', '-W0', "--stdout=$tmpdir/out", "--stderr=$tmpdir/err"); -my $td = start_script(\@cmd, undef, { 3 => $sock }); -my ($host, $port) = tcp_host_port($sock); -$sock = undef; +my $td = 
start_script(\@cmd, undef, $rdr); +my ($host, $port) = tcp_host_port(delete $rdr->{3}); @cmd = (qw(-clone -q), "http://$host:$port/v2/", "$tmpdir/m"); run_script(\@cmd) or xbail '-clone'; @@ -288,7 +290,69 @@ if ('test read-only epoch dirs') { 'got one more cloned epoch'); } -ok($td->kill, 'killed httpd'); -$td->join; +my $err = ''; +my $v110 = xqx([qw(git rev-parse v1.1.0-pre1)], undef, { 2 => \$err }); +SKIP: { + skip("no detected public-inbox GIT_DIR ($err)", 1) if $?; + # using plackup to test old PublicInbox::WWW since -httpd from + # back then relied on some packages we no longer depend on + my $plackup = which('plackup') or skip('no plackup in path', 1); + require PublicInbox::Lock; + chomp $v110; + my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); + my $wt = "t/data-gen/$base.pre-manifest"; + my $lk = bless { lock_path => __FILE__ }, 'PublicInbox::Lock'; + $lk->lock_acquire; + my $psgi = "$wt/app.psgi"; + if (!-f $psgi) { # checkout a pre-manifest.js.gz version + my $t = File::Temp->new(TEMPLATE => 'g-XXXX', TMPDIR => 1); + my $env = { GIT_INDEX_FILE => $t->filename }; + xsys([qw(git read-tree), $v110], $env) and xbail 'read-tree'; + xsys([qw(git checkout-index -a), "--prefix=$wt/"], $env) + and xbail 'checkout-index'; + my $f = "$wt/app.psgi.tmp.$$"; + open my $fh, '>', $f or xbail $!; + print $fh <<'EOM' or xbail $!; +use Plack::Builder; +use PublicInbox::WWW; +my $www = PublicInbox::WWW->new; +builder { enable 'Head'; sub { $www->call(@_) } } +EOM + close $fh or xbail $!; + rename($f, $psgi) or xbail $!; + } + $lk->lock_release; + + $rdr->{run_mode} = 0; + $rdr->{-C} = $wt; + my $cmd = [$plackup, qw(-Enone -Ilib), "--host=$host", "--port=$port"]; + $td->join('TERM'); + open $rdr->{2}, '>>', "$tmpdir/plackup.err.log" or xbail "open: $!"; + open $rdr->{1}, '>>&', $rdr->{2} or xbail "open: $!"; + $td = start_script($cmd, { PERL5LIB => 'lib' }, $rdr); + # wait for plackup socket()+bind()+listen() + my %opt = ( Proto => 'tcp', Type => Socket::SOCK_STREAM(), + 
PeerAddr => "$host:$port" ); + for (0..50) { + tick(); + last if IO::Socket::INET->new(%opt); + } + my $dst = "$tmpdir/scrape"; + @cmd = (qw(-clone -q), "http://$host:$port/v2", $dst); + run_script(\@cmd, undef, { 2 => \(my $err = '') }); + is($?, 0, 'scraping clone on old PublicInbox::WWW') + or diag $err; + my @g_all = glob("$dst/git/*.git"); + ok(scalar(@g_all) > 1, 'cloned multiple epochs'); + + remove_tree($dst); + @cmd = (qw(-clone -q --epoch=~0), "http://$host:$port/v2", $dst); + run_script(\@cmd, undef, { 2 => \($err = '') }); + is($?, 0, 'partial scraping clone on old PublicInbox::WWW'); + my @g_last = grep { -w $_ } glob("$dst/git/*.git"); + is_deeply(\@g_last, [ $g_all[-1] ], 'partial clone of ~0 worked'); + + $td->join('TERM'); +} done_testing;
next prev parent reply other threads:[~2021-09-24 10:56 UTC|newest] Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-09-24 10:56 [PATCH 0/5] clone|fetch: flesh out partial mirror support Eric Wong 2021-09-24 10:56 ` [PATCH 1/5] clone|--mirror: support --epoch=RANGE for partial clones Eric Wong 2021-09-24 10:56 ` [PATCH 2/5] fetch: fix skipping with multi-epoch inboxes Eric Wong 2021-09-24 10:56 ` Eric Wong [this message] 2021-09-24 10:56 ` [PATCH 4/5] clone|fetch|--mirror: cull manifest in partial mirrors Eric Wong 2021-09-24 10:56 ` [PATCH 5/5] fetch: support v2 w/o manifest on old WWW Eric Wong 2021-09-25 3:21 ` [PATCH 6/5] t/v2mirror: check dependencies for legacy test Eric Wong
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style List information: https://public-inbox.org/README * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20210924105645.8627-4-e@80x24.org \ --to=e@80x24.org \ --cc=meta@public-inbox.org \ --subject='Re: [PATCH 3/5] clone|--mirror: fix and test against pre-manifest WWW' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Code repositories for project(s) associated with this inbox: https://80x24.org/public-inbox.git This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).