From 6512b1245ebc6fe30bb32227c0ef8f912d4988ab Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 21 Mar 2020 02:03:54 +0000 Subject: www: add endpoint to retrieve altid dumps This ensures all our indexed data, including data from altid searches (e.g. "gmane:$ARTNUM") is retrievable. It uses a "POST" request to avoid wasting cycles when invoked by crawlers, since it could potentially be several megabytes of data not indexable by search engines. --- MANIFEST | 2 + lib/PublicInbox/AltId.pm | 1 + lib/PublicInbox/WWW.pm | 14 ++++++- lib/PublicInbox/WwwAltId.pm | 94 +++++++++++++++++++++++++++++++++++++++++++++ t/www_altid.t | 83 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 192 insertions(+), 2 deletions(-) create mode 100644 lib/PublicInbox/WwwAltId.pm create mode 100644 t/www_altid.t diff --git a/MANIFEST b/MANIFEST index d53af77c..baaf25ca 100644 --- a/MANIFEST +++ b/MANIFEST @@ -169,6 +169,7 @@ lib/PublicInbox/ViewVCS.pm lib/PublicInbox/WWW.pm lib/PublicInbox/WWW.pod lib/PublicInbox/WatchMaildir.pm +lib/PublicInbox/WwwAltId.pm lib/PublicInbox/WwwAtomStream.pm lib/PublicInbox/WwwAttach.pm lib/PublicInbox/WwwHighlight.pm @@ -302,6 +303,7 @@ t/view.t t/watch_filter_rubylang.t t/watch_maildir.t t/watch_maildir_v2.t +t/www_altid.t t/www_listing.t t/www_static.t t/x-unknown-alpine.eml diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm index 3be6c73c..6d16242a 100644 --- a/lib/PublicInbox/AltId.pm +++ b/lib/PublicInbox/AltId.pm @@ -39,6 +39,7 @@ sub new { bless { filename => $f, writable => $writable, + prefix => $prefix, xprefix => 'X'.uc($prefix), }, $class; } diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 2434f2f5..5017f572 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -65,6 +65,8 @@ sub call { my ($epoch, $path) = ($2, $3); return invalid_inbox($ctx, $1) || serve_git($ctx, $epoch, $path); + } elsif ($path_info =~ m!$INBOX_RE/(\w+)\.sql\.gz\z!o) { + return get_altid_dump($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/!o) { return invalid_inbox($ctx, $1) || mbox_results($ctx); } @@ -150,8 +152,8 @@ sub preload { require PublicInbox::Search; PublicInbox::Search::load_xapian(); }; - foreach (qw(PublicInbox::SearchView PublicInbox::MboxGz)) { - eval "require $_;"; + for (qw(SearchView MboxGz WwwAltId)) { + eval "require PublicInbox::$_;"; } if (ref($self)) { my $pi_config = $self->{pi_config}; @@ -301,6 +303,14 @@ sub get_vcs_object ($$$;$) { PublicInbox::ViewVCS::show($ctx, $oid, $filename); } +sub get_altid_dump { + my ($ctx, $inbox, $altid_pfx) =@_; + my $r404 = invalid_inbox($ctx, $inbox); + return $r404 if $r404; + eval { require PublicInbox::WwwAltId } or return need($ctx, 'sqlite3'); + PublicInbox::WwwAltId::sqldump($ctx, $altid_pfx); +} + sub need { my ($ctx, $extra) = @_; my $msg = < +# License: AGPL-3.0+ + +# dumps using the ".dump" command of sqlite3(1) +package PublicInbox::WwwAltId; +use strict; +use PublicInbox::Qspawn; +use PublicInbox::WwwStream; +use PublicInbox::AltId; +use PublicInbox::Spawn qw(which); +our $sqlite3 = $ENV{SQLITE3}; + +# returns prefix => pathname mapping +# (pathname is NOT public, but prefix is used for Xapian queries) +sub altid_map ($) { + my ($ibx) = @_; + my $altid = $ibx->{altid} or return {}; + my %h = map {; + my $x = PublicInbox::AltId->new($ibx, $_); + "$x->{prefix}" => $x->{filename} + } @$altid; + \%h; +} + +sub sqlite3_missing ($) { + PublicInbox::WwwResponse::oneshot($_[0], 501, \<sqlite3 not available + +The administrator needs to install the sqlite3(1) binary +to support gzipped sqlite3 dumps. + +EOF +} + +sub check_output { + my ($r, $bref, $ctx) = @_; + return PublicInbox::WwwResponse::oneshot($ctx, 500) if !defined($r); + if ($r == 0) { + my $err = eval { $ctx->{env}->{'psgi.errors'} } // \*STDERR; + $err->print("unexpected EOF from sqlite3\n"); + return PublicInbox::WwwResponse::oneshot($ctx, 501); + } + [200, [ qw(Content-Type application/gzip), 'Content-Disposition', + "inline; filename=$ctx->{altid_pfx}.sql.gz" ] ] +} + +# POST $INBOX/$prefix.sql.gz +# we use the sqlite3(1) binary here since that's where the ".dump" +# command is implemented, not (AFAIK) in the libsqlite3 library +# and thus not usable from DBD::SQLite. +sub sqldump ($$) { + my ($ctx, $altid_pfx) = @_; + my $ibx = $ctx->{-inbox}; + my $altid_map = $ibx->{-altid_map} //= altid_map($ibx); + my $fn = $altid_map->{$altid_pfx}; + unless (defined $fn) { + return PublicInbox::WwwStream::oneshot($ctx, 404, \<`$altid_pfx' is not a valid altid for this inbox +EOF + } + + eval { require PublicInbox::GzipFilter } or + return PublicInbox::WwwStream::oneshot($ctx, 501, \<gzip output not available + +The administrator needs to install the Compress::Raw::Zlib Perl module +to support gzipped sqlite3 dumps. +EOF + $sqlite3 //= which('sqlite3'); + if (!defined($sqlite3)) { + return PublicInbox::WwwStream::oneshot($ctx, 501, \<sqlite3 not available + +The administrator needs to install the sqlite3(1) binary +to support gzipped sqlite3 dumps. + +EOF + } + + # setup stdin, POSIX requires writes <= 512 bytes to succeed so + # we can close the pipe right away. + pipe(my ($r, $w)) or die "pipe: $!"; + syswrite($w, ".dump\n") == 6 or die "write: $!"; + close($w) or die "close: $!"; + + # TODO: use -readonly if available with newer sqlite3(1) + my $qsp = PublicInbox::Qspawn->new([$sqlite3, $fn], undef, { 0 => $r }); + my $env = $ctx->{env}; + $ctx->{altid_pfx} = $altid_pfx; + $env->{'qspawn.filter'} = PublicInbox::GzipFilter->new; + $qsp->psgi_return($env, undef, \&check_output, $ctx); +} + +1; diff --git a/t/www_altid.t b/t/www_altid.t new file mode 100644 index 00000000..a885c389 --- /dev/null +++ b/t/www_altid.t @@ -0,0 +1,83 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use Test::More; +use PublicInbox::TestCommon; +use PublicInbox::Inbox; +use PublicInbox::InboxWritable; +use PublicInbox::Config; +use PublicInbox::Spawn qw(which spawn); +which('sqlite3') or plan skip_all => 'sqlite3 binary missing'; +require_mods(qw(DBD::SQLite HTTP::Request::Common Plack::Test URI::Escape + Plack::Builder IO::Uncompress::Gunzip)); +use_ok($_) for qw(Plack::Test HTTP::Request::Common); +require_ok 'PublicInbox::Msgmap'; +require_ok 'PublicInbox::AltId'; +require_ok 'PublicInbox::WWW'; +my ($inboxdir, $for_destroy) = tmpdir(); +my $aid = 'xyz'; +my $spec = "serial:$aid:file=blah.sqlite3"; +if ('setup') { + my $opts = { + inboxdir => $inboxdir, + name => 'test', + -primary_address => 'test@example.com', + }; + my $ibx = PublicInbox::Inbox->new($opts); + $ibx = PublicInbox::InboxWritable->new($ibx, 1); + my $im = $ibx->importer(0); + my $mime = PublicInbox::MIME->new(<<'EOF'); +From: a@example.com +Message-Id: + +EOF + $im->add($mime); + $im->done; + mkdir "$inboxdir/public-inbox" or die; + my $altid = PublicInbox::AltId->new($ibx, $spec, 1); + $altid->mm_alt->mid_set(1, 'a@example.com'); +} + +my $cfgpath = "$inboxdir/cfg"; +open my $fh, '>', $cfgpath or die; +print $fh <new($cfgpath); +my $www = PublicInbox::WWW->new($cfg); +my $cmpfile = "$inboxdir/cmp.sqlite3"; +my $client = sub { + my ($cb) = @_; + my $res = $cb->(POST("/test/$aid.sql.gz")); + is($res->code, 200, 'retrieved gzipped dump'); + IO::Uncompress::Gunzip::gunzip(\($res->content) => \(my $buf)); + pipe(my ($r, $w)) or die; + my $cmd = ['sqlite3', $cmpfile]; + my $pid = spawn($cmd, undef, { 0 => $r }); + print $w $buf or die; + close $w or die; + is(waitpid($pid, 0), $pid, 'sqlite3 exited'); + is($?, 0, 'sqlite3 loaded dump'); + my $mm_cmp = PublicInbox::Msgmap->new_file($cmpfile); + is($mm_cmp->mid_for(1), 'a@example.com', 'sqlite3 dump valid'); + $mm_cmp = undef; + unlink $cmpfile or die; +}; +test_psgi(sub { $www->call(@_) }, $client); +SKIP: { + require_mods(qw(Plack::Test::ExternalServer), 4); + my $env = { PI_CONFIG => $cfgpath }; + my $sock = tcp_server() or die; + my ($out, $err) = map { "$inboxdir/std$_.log" } qw(out err); + my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ]; + my $td = start_script($cmd, $env, { 3 => $sock }); + my ($h, $p) = ($sock->sockhost, $sock->sockport); + local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p"; + Plack::Test::ExternalServer::test_psgi(client => $client); +} +done_testing; -- cgit v1.2.3-24-ge0c7