user/dev discussion of public-inbox itself
 help / color / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 11/11] www: add endpoint to retrieve altid dumps
Date: Sat, 21 Mar 2020 02:03:54 +0000
Message-ID: <20200321020354.9056-12-e@yhbt.net> (raw)
In-Reply-To: <20200321020354.9056-1-e@yhbt.net>

This ensures all our indexed data, including data from altid
searches (e.g. "gmane:$ARTNUM") is retrievable.

It uses a "POST" request to avoid wasting cycles when invoked by
crawlers, since it could potentially be several megabytes of
data not indexable by search engines.
---
 MANIFEST                    |  2 +
 lib/PublicInbox/AltId.pm    |  1 +
 lib/PublicInbox/WWW.pm      | 14 +++++-
 lib/PublicInbox/WwwAltId.pm | 94 +++++++++++++++++++++++++++++++++++++
 t/www_altid.t               | 83 ++++++++++++++++++++++++++++++++
 5 files changed, 192 insertions(+), 2 deletions(-)
 create mode 100644 lib/PublicInbox/WwwAltId.pm
 create mode 100644 t/www_altid.t

diff --git a/MANIFEST b/MANIFEST
index be1c4ab5..84872561 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -168,6 +168,7 @@ lib/PublicInbox/ViewVCS.pm
 lib/PublicInbox/WWW.pm
 lib/PublicInbox/WWW.pod
 lib/PublicInbox/WatchMaildir.pm
+lib/PublicInbox/WwwAltId.pm
 lib/PublicInbox/WwwAtomStream.pm
 lib/PublicInbox/WwwAttach.pm
 lib/PublicInbox/WwwHighlight.pm
@@ -300,6 +301,7 @@ t/view.t
 t/watch_filter_rubylang.t
 t/watch_maildir.t
 t/watch_maildir_v2.t
+t/www_altid.t
 t/www_listing.t
 t/www_static.t
 t/x-unknown-alpine.eml
diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm
index 3be6c73c..6d16242a 100644
--- a/lib/PublicInbox/AltId.pm
+++ b/lib/PublicInbox/AltId.pm
@@ -39,6 +39,7 @@ sub new {
 	bless {
 		filename => $f,
 		writable => $writable,
+		prefix => $prefix,
 		xprefix => 'X'.uc($prefix),
 	}, $class;
 }
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 2434f2f5..5017f572 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -65,6 +65,8 @@ sub call {
 			my ($epoch, $path) = ($2, $3);
 			return invalid_inbox($ctx, $1) ||
 				serve_git($ctx, $epoch, $path);
+		} elsif ($path_info =~ m!$INBOX_RE/(\w+)\.sql\.gz\z!o) {
+			return get_altid_dump($ctx, $1, $2);
 		} elsif ($path_info =~ m!$INBOX_RE/!o) {
 			return invalid_inbox($ctx, $1) || mbox_results($ctx);
 		}
@@ -150,8 +152,8 @@ sub preload {
 		require PublicInbox::Search;
 		PublicInbox::Search::load_xapian();
 	};
-	foreach (qw(PublicInbox::SearchView PublicInbox::MboxGz)) {
-		eval "require $_;";
+	for (qw(SearchView MboxGz WwwAltId)) {
+		eval "require PublicInbox::$_;";
 	}
 	if (ref($self)) {
 		my $pi_config = $self->{pi_config};
@@ -301,6 +303,14 @@ sub get_vcs_object ($$$;$) {
 	PublicInbox::ViewVCS::show($ctx, $oid, $filename);
 }
 
+sub get_altid_dump { # POST $INBOX/$ALTID_PFX.sql.gz, routed from call()
+	my ($ctx, $ibx_name, $pfx) = @_;
+	if (my $r404 = invalid_inbox($ctx, $ibx_name)) { return $r404 }
+	my $loaded = eval { require PublicInbox::WwwAltId };
+	return need($ctx, 'sqlite3') unless $loaded;
+	PublicInbox::WwwAltId::sqldump($ctx, $pfx);
+}
+
 sub need {
 	my ($ctx, $extra) = @_;
 	my $msg = <<EOF;
diff --git a/lib/PublicInbox/WwwAltId.pm b/lib/PublicInbox/WwwAltId.pm
new file mode 100644
index 00000000..34641a92
--- /dev/null
+++ b/lib/PublicInbox/WwwAltId.pm
@@ -0,0 +1,94 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# dumps using the ".dump" command of sqlite3(1)
+package PublicInbox::WwwAltId;
+use strict;
+use PublicInbox::Qspawn;
+use PublicInbox::WwwStream;
+use PublicInbox::AltId;
+use PublicInbox::Spawn qw(which);
+our $sqlite3 = $ENV{SQLITE3};
+
+# returns prefix => pathname mapping
+# (pathname is NOT public, but prefix is used for Xapian queries)
+sub altid_map ($) {
+	my ($ibx) = @_;
+	my %fn_for; # altid prefix => pathname
+	for my $spec (@{$ibx->{altid} || []}) { # no altid => empty map
+		my $aid = PublicInbox::AltId->new($ibx, $spec);
+		$fn_for{"$aid->{prefix}"} = $aid->{filename};
+	}
+	\%fn_for;
+}
+
+sub sqlite3_missing ($) { # 501 page for a missing sqlite3(1); $_[0] is $ctx
+	PublicInbox::WwwStream::oneshot($_[0], 501, \<<EOF);
+<pre>sqlite3 not available
+
+The administrator needs to install the sqlite3(1) binary
+to support gzipped sqlite3 dumps.
+</pre>
+EOF
+}
+
+sub check_output { # callback handed to $qsp->psgi_return by sqldump
+	my ($r, $bref, $ctx) = @_;
+	return PublicInbox::WwwStream::oneshot($ctx, 500) if !defined($r);
+	if ($r == 0) { # logged and reported as a premature EOF from sqlite3
+		my $err = eval { $ctx->{env}->{'psgi.errors'} } // \*STDERR;
+		$err->print("unexpected EOF from sqlite3\n");
+		return PublicInbox::WwwStream::oneshot($ctx, 501);
+	} # otherwise: status + headers only, naming the file after the prefix
+	[200, [ qw(Content-Type application/gzip), 'Content-Disposition',
+		"inline; filename=$ctx->{altid_pfx}.sql.gz" ] ]
+}
+
+# POST $INBOX/$prefix.sql.gz
+# we use the sqlite3(1) binary here since that's where the ".dump"
+# command is implemented, not (AFAIK) in the libsqlite3 library
+# and thus not usable from DBD::SQLite.
+sub sqldump ($$) { # handles POST $INBOX/$prefix.sql.gz for one altid prefix
+	my ($ctx, $altid_pfx) = @_;
+	my $ibx = $ctx->{-inbox};
+	my $altid_map = $ibx->{-altid_map} //= altid_map($ibx); # memoized
+	my $fn = $altid_map->{$altid_pfx};
+	unless (defined $fn) { # $altid_pfx is \w+ via the WWW.pm route regexp
+		return PublicInbox::WwwStream::oneshot($ctx, 404, \<<EOF);
+<pre>`$altid_pfx' is not a valid altid for this inbox</pre>
+EOF
+	}
+
+	eval { require PublicInbox::GzipFilter } or
+		return PublicInbox::WwwStream::oneshot($ctx, 501, \<<EOF);
+<pre>gzip output not available
+
+The administrator needs to install the Compress::Raw::Zlib Perl module
+to support gzipped sqlite3 dumps.</pre>
+EOF
+	$sqlite3 //= which('sqlite3'); # $ENV{SQLITE3} wins if it was set
+	if (!defined($sqlite3)) {
+		return PublicInbox::WwwStream::oneshot($ctx, 501, \<<EOF);
+<pre>sqlite3 not available
+
+The administrator needs to install the sqlite3(1) binary
+to support gzipped sqlite3 dumps.
+</pre>
+EOF
+	}
+
+	# setup stdin, POSIX requires writes <= 512 bytes to succeed so
+	# we can close the pipe right away.
+	pipe(my ($r, $w)) or die "pipe: $!";
+	syswrite($w, ".dump\n") == 6 or die "write: $!";
+	close($w) or die "close: $!";
+
+	# TODO: use -readonly if available with newer sqlite3(1)
+	my $qsp = PublicInbox::Qspawn->new([$sqlite3, $fn], undef, { 0 => $r });
+	my $env = $ctx->{env};
+	$ctx->{altid_pfx} = $altid_pfx; # read back by check_output
+	$env->{'qspawn.filter'} = PublicInbox::GzipFilter->new;
+	$qsp->psgi_return($env, undef, \&check_output, $ctx);
+}
+
+1;
diff --git a/t/www_altid.t b/t/www_altid.t
new file mode 100644
index 00000000..a885c389
--- /dev/null
+++ b/t/www_altid.t
@@ -0,0 +1,83 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Inbox;
+use PublicInbox::InboxWritable;
+use PublicInbox::Config;
+use PublicInbox::Spawn qw(which spawn);
+which('sqlite3') or plan skip_all => 'sqlite3 binary missing';
+require_mods(qw(DBD::SQLite HTTP::Request::Common Plack::Test URI::Escape
+	Plack::Builder IO::Uncompress::Gunzip));
+use_ok($_) for qw(Plack::Test HTTP::Request::Common);
+require_ok 'PublicInbox::Msgmap';
+require_ok 'PublicInbox::AltId';
+require_ok 'PublicInbox::WWW';
+my ($inboxdir, $for_destroy) = tmpdir();
+my $aid = 'xyz'; # altid prefix; also the basename requested in $client
+my $spec = "serial:$aid:file=blah.sqlite3"; # value of the "altid" config key
+if ('setup') { # always true; the block only scopes the setup lexicals
+	my $opts = {
+		inboxdir => $inboxdir,
+		name => 'test',
+		-primary_address => 'test@example.com',
+	};
+	my $ibx = PublicInbox::Inbox->new($opts);
+	$ibx = PublicInbox::InboxWritable->new($ibx, 1);
+	my $im = $ibx->importer(0);
+	my $mime = PublicInbox::MIME->new(<<'EOF');
+From: a@example.com
+Message-Id: <a@example.com>
+
+EOF
+	$im->add($mime);
+	$im->done;
+	mkdir "$inboxdir/public-inbox" or die;
+	my $altid = PublicInbox::AltId->new($ibx, $spec, 1); # 1: presumably writable
+	$altid->mm_alt->mid_set(1, 'a@example.com'); # checked via mid_for(1) below
+}
+
+my $cfgpath = "$inboxdir/cfg";
+open my $fh, '>', $cfgpath or die;
+print $fh <<EOF or die;
+[publicinbox "test"]
+	inboxdir = $inboxdir
+	address = test\@example.com
+	altid = $spec
+	url = http://example.com/test
+EOF
+close $fh or die;
+my $cfg = PublicInbox::Config->new($cfgpath);
+my $www = PublicInbox::WWW->new($cfg);
+my $cmpfile = "$inboxdir/cmp.sqlite3"; # scratch DB rebuilt from the dump
+my $client = sub { # shared by the in-process and external-server runs below
+	my ($cb) = @_;
+	my $res = $cb->(POST("/test/$aid.sql.gz"));
+	is($res->code, 200, 'retrieved gzipped dump');
+	IO::Uncompress::Gunzip::gunzip(\($res->content) => \(my $buf));
+	pipe(my ($r, $w)) or die;
+	my $cmd = ['sqlite3', $cmpfile];
+	my $pid = spawn($cmd, undef, { 0 => $r }); # pipe read end given as fd 0
+	print $w $buf or die; # feed the decompressed SQL to sqlite3
+	close $w or die;
+	is(waitpid($pid, 0), $pid, 'sqlite3 exited');
+	is($?, 0, 'sqlite3 loaded dump');
+	my $mm_cmp = PublicInbox::Msgmap->new_file($cmpfile);
+	is($mm_cmp->mid_for(1), 'a@example.com', 'sqlite3 dump valid');
+	$mm_cmp = undef; # drop the Msgmap object before removing its file
+	unlink $cmpfile or die; # so the second run starts from an empty DB
+};
+test_psgi(sub { $www->call(@_) }, $client);
+SKIP: {
+	require_mods(qw(Plack::Test::ExternalServer), 4); # 4: presumably skip count
+	my $env = { PI_CONFIG => $cfgpath };
+	my $sock = tcp_server() or die;
+	my ($out, $err) = map { "$inboxdir/std$_.log" } qw(out err);
+	my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ];
+	my $td = start_script($cmd, $env, { 3 => $sock }); # $sock passed as fd 3
+	my ($h, $p) = ($sock->sockhost, $sock->sockport);
+	local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p";
+	Plack::Test::ExternalServer::test_psgi(client => $client);
+}
+done_testing;

      parent reply index

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-21  2:03 [PATCH 00/11] www: export SQLite " Eric Wong
2020-03-21  2:03 ` [PATCH 01/11] qspawn: reinstate filter support, add gzip filter Eric Wong
2020-03-21  2:03 ` [PATCH 02/11] gzipfilter: lazy allocate the deflate context Eric Wong
2020-03-21  2:03 ` [PATCH 03/11] wwwstream: introduce oneshot API to avoid ->getline Eric Wong
2020-03-21  2:03 ` [PATCH 04/11] extmsg: use WwwResponse::oneshot Eric Wong
2020-03-21  2:03 ` [PATCH 05/11] wwwstream: oneshot sets content-length Eric Wong
2020-03-21  2:03 ` [PATCH 06/11] mbox: need_gzip uses WwwStream::oneshot Eric Wong
2020-03-21  2:03 ` [PATCH 07/11] qspawn: handle ENOENT (and other errors on exec) Eric Wong
2020-03-21  2:03 ` [PATCH 08/11] search: clobber -user_pfx on query parser initialization Eric Wong
2020-03-21  2:03 ` [PATCH 09/11] wwwtext: show thread endpoint w/ indexlevel=basic Eric Wong
2020-03-21  2:03 ` [PATCH 10/11] altid: warn about non-word prefixes Eric Wong
2020-03-21  2:03 ` Eric Wong [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200321020354.9056-12-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git