From 6512b1245ebc6fe30bb32227c0ef8f912d4988ab Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 21 Mar 2020 02:03:54 +0000 Subject: www: add endpoint to retrieve altid dumps This ensures all our indexed data, including data from altid searches (e.g. "gmane:$ARTNUM") is retrievable. It uses a "POST" request to avoid wasting cycles when invoked by crawlers, since it could potentially be several megabytes of data not indexable by search engines. --- lib/PublicInbox/AltId.pm | 1 + lib/PublicInbox/WWW.pm | 14 ++++++- lib/PublicInbox/WwwAltId.pm | 94 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 lib/PublicInbox/WwwAltId.pm (limited to 'lib') diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm index 3be6c73c..6d16242a 100644 --- a/lib/PublicInbox/AltId.pm +++ b/lib/PublicInbox/AltId.pm @@ -39,6 +39,7 @@ sub new { bless { filename => $f, writable => $writable, + prefix => $prefix, xprefix => 'X'.uc($prefix), }, $class; } diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 2434f2f5..5017f572 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -65,6 +65,8 @@ sub call { my ($epoch, $path) = ($2, $3); return invalid_inbox($ctx, $1) || serve_git($ctx, $epoch, $path); + } elsif ($path_info =~ m!$INBOX_RE/(\w+)\.sql\.gz\z!o) { + return get_altid_dump($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/!o) { return invalid_inbox($ctx, $1) || mbox_results($ctx); } @@ -150,8 +152,8 @@ sub preload { require PublicInbox::Search; PublicInbox::Search::load_xapian(); }; - foreach (qw(PublicInbox::SearchView PublicInbox::MboxGz)) { - eval "require $_;"; + for (qw(SearchView MboxGz WwwAltId)) { + eval "require PublicInbox::$_;"; } if (ref($self)) { my $pi_config = $self->{pi_config}; @@ -301,6 +303,14 @@ sub get_vcs_object ($$$;$) { PublicInbox::ViewVCS::show($ctx, $oid, $filename); } +sub get_altid_dump { + my ($ctx, $inbox, $altid_pfx) =@_; + my $r404 = invalid_inbox($ctx, 
# [patch residue, kept verbatim: tail of the lib/PublicInbox/WWW.pm hunk that starts on the previous line] $inbox); + return $r404 if $r404; + eval { require PublicInbox::WwwAltId } or return need($ctx, 'sqlite3'); + PublicInbox::WwwAltId::sqldump($ctx, $altid_pfx); +} + sub need { my ($ctx, $extra) = @_; my $msg = <
#
# --- lib/PublicInbox/WwwAltId.pm (the file this patch creates) ---
# License: AGPL-3.0+

# dumps using the ".dump" command of sqlite3(1)
package PublicInbox::WwwAltId;
use strict;
use warnings;
use PublicInbox::Qspawn;
use PublicInbox::WwwStream;
use PublicInbox::AltId;
use PublicInbox::Spawn qw(which);

# Path of the sqlite3(1) binary.  $ENV{SQLITE3} takes precedence; when it
# is unset, sqldump() looks the binary up via which() on first use and
# keeps the result here.
our $sqlite3 = $ENV{SQLITE3};

# NOTE(review): every heredoc opener ("<<EOF") and the <pre>...</pre>
# wrapper around each message body were missing from the copy of the
# patch this file was recovered from; they are restored below and should
# be confirmed against the upstream commit.

# returns prefix => pathname mapping
# (pathname is NOT public, but prefix is used for Xapian queries)
#
# $ibx is the inbox object; its {altid} field, when set, is an array ref
# of altid specs understood by PublicInbox::AltId->new.  Returns a hash
# ref, empty when the inbox has no altid configured.
sub altid_map ($) {
	my ($ibx) = @_;
	my $specs = $ibx->{altid} or return {};
	my %path_for;
	for my $spec (@$specs) {
		my $x = PublicInbox::AltId->new($ibx, $spec);
		$path_for{$x->{prefix}} = $x->{filename};
	}
	return \%path_for;
}

# 501 response for hosts without the sqlite3(1) binary.
# $_[0] is the request context ($ctx).
sub sqlite3_missing ($) {
	# was PublicInbox::WwwResponse::oneshot: this module never loads a
	# package of that name, only PublicInbox::WwwStream
	PublicInbox::WwwStream::oneshot($_[0], 501, \<<EOF);
<pre>sqlite3 not available

The administrator needs to install the sqlite3(1) binary
to support gzipped sqlite3 dumps.</pre>

EOF
}

# Callback handed to Qspawn::psgi_return by sqldump().
#   $r    - undef => 500 response; 0 => logged as an unexpected EOF from
#           sqlite3 and answered with 501; anything else => success.
#           Presumably the result of the first read from the sqlite3
#           pipe; confirm against PublicInbox::Qspawn.
#   $bref - unused here
#   $ctx  - request context; {altid_pfx} must have been set by sqldump()
# Returns either the oneshot() error response or [ 200, \@headers ].
sub check_output {
	my ($r, $bref, $ctx) = @_;
	return PublicInbox::WwwStream::oneshot($ctx, 500) if !defined($r);
	if ($r == 0) {
		# fall back to STDERR when the PSGI env has no error stream
		my $err = eval { $ctx->{env}->{'psgi.errors'} } // \*STDERR;
		$err->print("unexpected EOF from sqlite3\n");
		return PublicInbox::WwwStream::oneshot($ctx, 501);
	}
	return [ 200, [
		qw(Content-Type application/gzip),
		'Content-Disposition',
		"inline; filename=$ctx->{altid_pfx}.sql.gz",
	] ];
}

# POST $INBOX/$prefix.sql.gz
# we use the sqlite3(1) binary here since that's where the ".dump"
# command is implemented, not (AFAIK) in the libsqlite3 library
# and thus not usable from DBD::SQLite.
#
#   $ctx       - request context; reads {-inbox} and {env}, sets {altid_pfx}
#   $altid_pfx - altid prefix taken from the request path.  It is
#                interpolated into the 404 body and the Content-Disposition
#                header without escaping, so callers must restrict it to
#                word characters (the WWW.pm route in this patch matches
#                it with (\w+)).
# Returns a 404/501 oneshot() response, or whatever psgi_return() returns.
# Dies if the stdin pipe for sqlite3 cannot be set up.
sub sqldump ($$) {
	my ($ctx, $altid_pfx) = @_;
	my $ibx = $ctx->{-inbox};
	# the prefix => pathname map is computed once per inbox object
	my $altid_map = $ibx->{-altid_map} //= altid_map($ibx);
	my $fn = $altid_map->{$altid_pfx};
	unless (defined $fn) {
		return PublicInbox::WwwStream::oneshot($ctx, 404, \<<EOF);
<pre>`$altid_pfx' is not a valid altid for this inbox</pre>
EOF
	}

	eval { require PublicInbox::GzipFilter } or
		return PublicInbox::WwwStream::oneshot($ctx, 501, \<<EOF);
<pre>gzip output not available

The administrator needs to install the Compress::Raw::Zlib Perl module
to support gzipped sqlite3 dumps.</pre>
EOF
	$sqlite3 //= which('sqlite3');
	return sqlite3_missing($ctx) if !defined($sqlite3);

	# setup stdin, POSIX requires writes <= 512 bytes to succeed so
	# we can close the pipe right away.
	my $cmd = ".dump\n";
	pipe(my ($r, $w)) or die "pipe: $!";
	my $n = syswrite($w, $cmd);
	defined($n) or die "write: $!";
	# $! is not meaningful for a short write, so report the counts
	$n == length($cmd) or
		die "write: short write ($n of ".length($cmd)." bytes)";
	close($w) or die "close: $!";

	# TODO: use -readonly if available with newer sqlite3(1)
	my $qsp = PublicInbox::Qspawn->new([$sqlite3, $fn], undef,
					{ 0 => $r });
	my $env = $ctx->{env};
	$ctx->{altid_pfx} = $altid_pfx; # read back by check_output()
	$env->{'qspawn.filter'} = PublicInbox::GzipFilter->new;
	$qsp->psgi_return($env, undef, \&check_output, $ctx);
}

1;
# [patch trailer, kept verbatim] -- cgit v1.2.3-24-ge0c7