about summary refs log tree commit homepage
diff options
author Eric Wong <e@yhbt.net> 2020-03-21 02:03:54 +0000
committer Eric Wong <e@yhbt.net> 2020-03-25 01:48:35 +0000
commit 6512b1245ebc6fe30bb32227c0ef8f912d4988ab (patch)
parent 7909c5f7439777e3b3643f14224b27f0a8f8fb07 (diff)
This ensures all our indexed data, including data from altid
searches (e.g. "gmane:$ARTNUM"), is retrievable.

The new endpoint uses a "POST" request to avoid wasting cycles when
invoked by crawlers, since a dump could potentially be several
megabytes of data not indexable by search engines.
5 files changed, 192 insertions, 2 deletions
diff --git a/MANIFEST b/MANIFEST
index d53af77c..baaf25ca 100644
@@ -169,6 +169,7 @@ lib/PublicInbox/ViewVCS.pm
@@ -302,6 +303,7 @@ t/view.t
diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm
index 3be6c73c..6d16242a 100644
--- a/lib/PublicInbox/AltId.pm
+++ b/lib/PublicInbox/AltId.pm
@@ -39,6 +39,7 @@ sub new {
         bless {
                 filename => $f,
                 writable => $writable,
+                prefix => $prefix,
                 xprefix => 'X'.uc($prefix),
         }, $class;
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 2434f2f5..5017f572 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -65,6 +65,8 @@ sub call {
                         my ($epoch, $path) = ($2, $3);
                         return invalid_inbox($ctx, $1) ||
                                 serve_git($ctx, $epoch, $path);
+                } elsif ($path_info =~ m!$INBOX_RE/(\w+)\.sql\.gz\z!o) {
+                        return get_altid_dump($ctx, $1, $2);
                 } elsif ($path_info =~ m!$INBOX_RE/!o) {
                         return invalid_inbox($ctx, $1) || mbox_results($ctx);
@@ -150,8 +152,8 @@ sub preload {
                 require PublicInbox::Search;
-        foreach (qw(PublicInbox::SearchView PublicInbox::MboxGz)) {
-                eval "require $_;";
+        for (qw(SearchView MboxGz WwwAltId)) {
+                eval "require PublicInbox::$_;";
         if (ref($self)) {
                 my $pi_config = $self->{pi_config};
@@ -301,6 +303,14 @@ sub get_vcs_object ($$$;$) {
         PublicInbox::ViewVCS::show($ctx, $oid, $filename);
+sub get_altid_dump {
+        my ($ctx, $inbox, $altid_pfx) =@_;
+        my $r404 = invalid_inbox($ctx, $inbox);
+        return $r404 if $r404;
+        eval { require PublicInbox::WwwAltId } or return need($ctx, 'sqlite3');
+        PublicInbox::WwwAltId::sqldump($ctx, $altid_pfx);
 sub need {
         my ($ctx, $extra) = @_;
         my $msg = <<EOF;
diff --git a/lib/PublicInbox/WwwAltId.pm b/lib/PublicInbox/WwwAltId.pm
new file mode 100644
index 00000000..34641a92
--- /dev/null
+++ b/lib/PublicInbox/WwwAltId.pm
@@ -0,0 +1,94 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+# dumps using the ".dump" command of sqlite3(1)
+package PublicInbox::WwwAltId;
+use strict;
+use PublicInbox::Qspawn;
+use PublicInbox::WwwStream;
+use PublicInbox::AltId;
+use PublicInbox::Spawn qw(which);
+our $sqlite3 = $ENV{SQLITE3};
+# returns prefix => pathname mapping
+# (pathname is NOT public, but prefix is used for Xapian queries)
+sub altid_map ($) {
+        my ($ibx) = @_;
+        my $altid = $ibx->{altid} or return {};
+        my %h = map {;
+                my $x = PublicInbox::AltId->new($ibx, $_);
+                "$x->{prefix}" => $x->{filename}
+        } @$altid;
+        \%h;
+sub sqlite3_missing ($) {
+        PublicInbox::WwwResponse::oneshot($_[0], 501, \<<EOF);
+<pre>sqlite3 not available
+The administrator needs to install the sqlite3(1) binary
+to support gzipped sqlite3 dumps.</pre>
+sub check_output {
+        my ($r, $bref, $ctx) = @_;
+        return PublicInbox::WwwResponse::oneshot($ctx, 500) if !defined($r);
+        if ($r == 0) {
+                my $err = eval { $ctx->{env}->{'psgi.errors'} } // \*STDERR;
+                $err->print("unexpected EOF from sqlite3\n");
+                return PublicInbox::WwwResponse::oneshot($ctx, 501);
+        }
+        [200, [ qw(Content-Type application/gzip), 'Content-Disposition',
+                "inline; filename=$ctx->{altid_pfx}.sql.gz" ] ]
+# POST $INBOX/$prefix.sql.gz
+# we use the sqlite3(1) binary here since that's where the ".dump"
+# command is implemented, not (AFAIK) in the libsqlite3 library
+# and thus not usable from DBD::SQLite.
+sub sqldump ($$) {
+        my ($ctx, $altid_pfx) = @_;
+        my $ibx = $ctx->{-inbox};
+        my $altid_map = $ibx->{-altid_map} //= altid_map($ibx);
+        my $fn = $altid_map->{$altid_pfx};
+        unless (defined $fn) {
+                return PublicInbox::WwwStream::oneshot($ctx, 404, \<<EOF);
+<pre>`$altid_pfx' is not a valid altid for this inbox</pre>
+        }
+        eval { require PublicInbox::GzipFilter } or
+                return PublicInbox::WwwStream::oneshot($ctx, 501, \<<EOF);
+<pre>gzip output not available
+The administrator needs to install the Compress::Raw::Zlib Perl module
+to support gzipped sqlite3 dumps.</pre>
+        $sqlite3 //= which('sqlite3');
+        if (!defined($sqlite3)) {
+                return PublicInbox::WwwStream::oneshot($ctx, 501, \<<EOF);
+<pre>sqlite3 not available
+The administrator needs to install the sqlite3(1) binary
+to support gzipped sqlite3 dumps.</pre>
+        }
+        # setup stdin, POSIX requires writes <= 512 bytes to succeed so
+        # we can close the pipe right away.
+        pipe(my ($r, $w)) or die "pipe: $!";
+        syswrite($w, ".dump\n") == 6 or die "write: $!";
+        close($w) or die "close: $!";
+        # TODO: use -readonly if available with newer sqlite3(1)
+        my $qsp = PublicInbox::Qspawn->new([$sqlite3, $fn], undef, { 0 => $r });
+        my $env = $ctx->{env};
+        $ctx->{altid_pfx} = $altid_pfx;
+        $env->{'qspawn.filter'} = PublicInbox::GzipFilter->new;
+        $qsp->psgi_return($env, undef, \&check_output, $ctx);
diff --git a/t/www_altid.t b/t/www_altid.t
new file mode 100644
index 00000000..a885c389
--- /dev/null
+++ b/t/www_altid.t
@@ -0,0 +1,83 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Inbox;
+use PublicInbox::InboxWritable;
+use PublicInbox::Config;
+use PublicInbox::Spawn qw(which spawn);
+which('sqlite3') or plan skip_all => 'sqlite3 binary missing';
+require_mods(qw(DBD::SQLite HTTP::Request::Common Plack::Test URI::Escape
+        Plack::Builder IO::Uncompress::Gunzip));
+use_ok($_) for qw(Plack::Test HTTP::Request::Common);
+require_ok 'PublicInbox::Msgmap';
+require_ok 'PublicInbox::AltId';
+require_ok 'PublicInbox::WWW';
+my ($inboxdir, $for_destroy) = tmpdir();
+my $aid = 'xyz';
+my $spec = "serial:$aid:file=blah.sqlite3";
+if ('setup') {
+        my $opts = {
+                inboxdir => $inboxdir,
+                name => 'test',
+                -primary_address => 'test@example.com',
+        };
+        my $ibx = PublicInbox::Inbox->new($opts);
+        $ibx = PublicInbox::InboxWritable->new($ibx, 1);
+        my $im = $ibx->importer(0);
+        my $mime = PublicInbox::MIME->new(<<'EOF');
+From: a@example.com
+Message-Id: <a@example.com>
+        $im->add($mime);
+        $im->done;
+        mkdir "$inboxdir/public-inbox" or die;
+        my $altid = PublicInbox::AltId->new($ibx, $spec, 1);
+        $altid->mm_alt->mid_set(1, 'a@example.com');
+my $cfgpath = "$inboxdir/cfg";
+open my $fh, '>', $cfgpath or die;
+print $fh <<EOF or die;
+[publicinbox "test"]
+        inboxdir = $inboxdir
+        address = test\@example.com
+        altid = $spec
+        url = http://example.com/test
+close $fh or die;
+my $cfg = PublicInbox::Config->new($cfgpath);
+my $www = PublicInbox::WWW->new($cfg);
+my $cmpfile = "$inboxdir/cmp.sqlite3";
+my $client = sub {
+        my ($cb) = @_;
+        my $res = $cb->(POST("/test/$aid.sql.gz"));
+        is($res->code, 200, 'retrieved gzipped dump');
+        IO::Uncompress::Gunzip::gunzip(\($res->content) => \(my $buf));
+        pipe(my ($r, $w)) or die;
+        my $cmd = ['sqlite3', $cmpfile];
+        my $pid = spawn($cmd, undef, { 0 => $r });
+        print $w $buf or die;
+        close $w or die;
+        is(waitpid($pid, 0), $pid, 'sqlite3 exited');
+        is($?, 0, 'sqlite3 loaded dump');
+        my $mm_cmp = PublicInbox::Msgmap->new_file($cmpfile);
+        is($mm_cmp->mid_for(1), 'a@example.com', 'sqlite3 dump valid');
+        $mm_cmp = undef;
+        unlink $cmpfile or die;
+test_psgi(sub { $www->call(@_) }, $client);
+SKIP: {
+        require_mods(qw(Plack::Test::ExternalServer), 4);
+        my $env = { PI_CONFIG => $cfgpath };
+        my $sock = tcp_server() or die;
+        my ($out, $err) = map { "$inboxdir/std$_.log" } qw(out err);
+        my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ];
+        my $td = start_script($cmd, $env, { 3 => $sock });
+        my ($h, $p) = ($sock->sockhost, $sock->sockport);
+        local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p";
+        Plack::Test::ExternalServer::test_psgi(client => $client);