about summary refs log tree commit homepage
path: root/lib/PublicInbox
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2021-09-24 10:56:43 +0000
committerEric Wong <e@80x24.org>2021-09-24 23:22:07 +0000
commit3596019278ef489f27e0659c752977f60f847903 (patch)
treedc3c938370ca612e66c755fa9f90566090b4fec8 /lib/PublicInbox
parent3f7ba918e134e9f86c1f2bc90a89ae94f0c2dbf6 (diff)
downloadpublic-inbox-3596019278ef489f27e0659c752977f60f847903.tar.gz
There may still be pre-manifest.js.gz versions of PublicInbox::WWW
running and serving v2 inboxes.

Since $INBOX_URL/manifest.js.gz was not understood, it was
assumed to be a Message-ID and 301-ed to
"$INBOX_URL/manifest.js.gz/" with a trailing slash, so our 404
checks were invalid.  Update our fallbacks to deal with 301
by catching JSON decoding errors to trigger HTML scraping.

For HTML parsing, be sure to not be fooled by potential
user-generated content and only scan the part after the last
<hr>.

We also need to avoid propagating $? from curl unnecessarily
when we can continue safely.

Finally, update v2mirror.t with tests to use PublicInbox::WWW
from our "v1.1.0-pre1" tag to ensure these code paths get tested.
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r--lib/PublicInbox/LeiMirror.pm13
-rw-r--r--lib/PublicInbox/TestCommon.pm1
2 files changed, 11 insertions, 3 deletions
diff --git a/lib/PublicInbox/LeiMirror.pm b/lib/PublicInbox/LeiMirror.pm
index 53f7dd31..fe81b967 100644
--- a/lib/PublicInbox/LeiMirror.pm
+++ b/lib/PublicInbox/LeiMirror.pm
@@ -42,7 +42,8 @@ sub try_scrape {
 
         # we grep with URL below, we don't want Subject/From headers
         # making us clone random URLs
-        my @urls = ($html =~ m!\bgit clone --mirror ([a-z\+]+://\S+)!g);
+        my @html = split(/<hr>/, $html);
+        my @urls = ($html[-1] =~ m!\bgit clone --mirror ([a-z\+]+://\S+)!g);
         my $url = $uri->as_string;
         chop($url) eq '/' or die "BUG: $uri not canonicalized";
 
@@ -184,7 +185,9 @@ sub run_reap {
         my $reap = PublicInbox::OnDestroy->new($lei->can('sigint_reap'), $pid);
         waitpid($pid, 0) == $pid or die "waitpid @$cmd: $!";
         @$reap = (); # cancel reap
-        $?
+        my $ret = $?;
+        $? = 0; # don't let it influence normal exit
+        $ret;
 }
 
 sub clone_v1 {
@@ -358,7 +361,11 @@ sub try_manifest {
                 return try_scrape($self) if ($cerr >> 8) == 22; # 404 missing
                 return $lei->child_error($cerr, "@$cmd failed");
         }
-        my $m = decode_manifest($ft, $fn, $uri);
+        my $m = eval { decode_manifest($ft, $fn, $uri) };
+        if ($@) {
+                warn $@;
+                return try_scrape($self);
+        }
         my ($path_pfx, $v1_path, @v2_epochs) = deduce_epochs($m, $path);
         if (@v2_epochs) {
                 # It may be possible to have v1 + v2 in parallel someday:
diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm
index aff34853..cd706e0e 100644
--- a/lib/PublicInbox/TestCommon.pm
+++ b/lib/PublicInbox/TestCommon.pm
@@ -469,6 +469,7 @@ sub start_script {
                         $ENV{LISTEN_PID} = $$;
                         $ENV{LISTEN_FDS} = $fds;
                 }
+                if ($opt->{-C}) { chdir($opt->{-C}) or die "chdir: $!" }
                 $0 = join(' ', @$cmd);
                 if ($sub) {
                         eval { PublicInbox::DS->Reset };