user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 40/43] wwwattach: support async blob retrievals
Date: Sun,  5 Jul 2020 23:27:56 +0000	[thread overview]
Message-ID: <20200705232759.3161-41-e@yhbt.net> (raw)
In-Reply-To: <20200705232759.3161-1-e@yhbt.net>

We can reuse some of the GzipFilter infrastructure used by other
WWW components to handle slow blob retrieval here.  Unlike the
previous changes, we don't decide on the 200 status code until
we've retrieved the blob and found the attachment.

While we're at it, ensure we can compress text attachment
responses once again, since all text attachments are served
as text/plain.
---
 lib/PublicInbox/WwwAttach.pm |  63 +++++++++++---
 t/psgi_attach.t              | 162 ++++++++++++++++++++---------------
 2 files changed, 144 insertions(+), 81 deletions(-)

diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm
index 7e8496d7a..20417295e 100644
--- a/lib/PublicInbox/WwwAttach.pm
+++ b/lib/PublicInbox/WwwAttach.pm
@@ -4,15 +4,16 @@
 # For retrieving attachments from messages in the WWW interface
 package PublicInbox::WwwAttach; # internal package
 use strict;
-use warnings;
+use parent qw(PublicInbox::GzipFilter);
 use bytes (); # only for bytes::length
 use PublicInbox::EmlContentFoo qw(parse_content_type);
 use PublicInbox::Eml;
 
 sub get_attach_i { # ->each_part callback
 	my ($part, $depth, $idx) = @{$_[0]};
-	my $res = $_[1];
-	return if $idx ne $res->[3]; # [0-9]+(?:\.[0-9]+)+
+	my $ctx = $_[1];
+	return if $idx ne $ctx->{idx}; # [0-9]+(?:\.[0-9]+)+
+	my $res = $ctx->{res};
 	$res->[0] = 200;
 	my $ct = $part->content_type;
 	$ct = parse_content_type($ct) if $ct;
@@ -23,24 +24,64 @@ sub get_attach_i { # ->each_part callback
 		if ($cset && ($cset =~ /\A[a-zA-Z0-9_\-]+\z/)) {
 			$res->[1]->[1] .= qq(; charset=$cset);
 		}
+		$ctx->{gz} = PublicInbox::GzipFilter::gz_or_noop($res->[1],
+								$ctx->{env});
+		$part = $ctx->zflush($part->body);
 	} else { # TODO: allow user to configure safe types
 		$res->[1]->[1] = 'application/octet-stream';
+		$part = $part->body;
 	}
-	$part = $part->body;
 	push @{$res->[1]}, 'Content-Length', bytes::length($part);
 	$res->[2]->[0] = $part;
 }
 
+sub async_eml { # ->{async_eml} for async_blob_cb
+	my ($ctx, $eml) = @_;
+	eval { $eml->each_part(\&get_attach_i, $ctx, 1) };
+	if ($@) {
+		$ctx->{res}->[0] = 500;
+		warn "E: $@";
+	}
+}
+
+sub async_next {
+	my ($http) = @_;
+	my $ctx = $http->{forward} or return; # client aborted
+	# finally, we call the user-supplied callback
+	eval { $ctx->{wcb}->($ctx->{res}) };
+	warn "E: $@" if $@;
+}
+
+sub scan_attach ($) { # public-inbox-httpd only
+	my ($ctx) = @_;
+	$ctx->{env}->{'psgix.io'}->{forward} = $ctx;
+	$ctx->{async_eml} = \&async_eml;
+	$ctx->{async_next} = \&async_next;
+	$ctx->smsg_blob($ctx->{smsg});
+}
+
 # /$LISTNAME/$MESSAGE_ID/$IDX-$FILENAME
 sub get_attach ($$$) {
 	my ($ctx, $idx, $fn) = @_;
-	my $res = [ 404, [ 'Content-Type', 'text/plain' ], [ "Not found\n" ] ];
-	my $mime = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return $res;
-	$mime = PublicInbox::Eml->new($mime);
-	$res->[3] = $idx;
-	$mime->each_part(\&get_attach_i, $res, 1);
-	pop @$res; # cleanup before letting PSGI server see it
-	$res
+	$ctx->{res} = [ 404, [ 'Content-Type' => 'text/plain' ],
+				[ "Not found\n" ] ];
+	$ctx->{idx} = $idx;
+	bless $ctx, __PACKAGE__;
+	my $eml;
+	if ($ctx->{smsg} = $ctx->{-inbox}->smsg_by_mid($ctx->{mid})) {
+		return sub { # public-inbox-httpd-only
+			$ctx->{wcb} = $_[0];
+			scan_attach($ctx);
+		} if $ctx->{env}->{'pi-httpd.async'};
+		# generic PSGI:
+		$eml = $ctx->{-inbox}->smsg_eml($ctx->{smsg});
+	} elsif (!$ctx->{-inbox}->over) {
+		if (my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid})) {
+			$eml = PublicInbox::Eml->new($bref);
+		}
+	}
+	$eml->each_part(\&get_attach_i, $ctx, 1) if $eml;
+	$ctx->{res}
 }
 
 1;
diff --git a/t/psgi_attach.t b/t/psgi_attach.t
index 9a734f813..14d20adb1 100644
--- a/t/psgi_attach.t
+++ b/t/psgi_attach.t
@@ -5,9 +5,8 @@ use warnings;
 use Test::More;
 use PublicInbox::TestCommon;
 my ($tmpdir, $for_destroy) = tmpdir();
-my $maindir = "$tmpdir/main.git";
+my $inboxdir = "$tmpdir/main.git";
 my $addr = 'test-public@example.com';
-my $cfgpfx = "publicinbox.test";
 my @mods = qw(HTTP::Request::Common Plack::Builder Plack::Test URI::Escape);
 require_mods(@mods);
 use_ok $_ foreach @mods;
@@ -17,85 +16,108 @@ use PublicInbox::Git;
 use PublicInbox::Config;
 use PublicInbox::Eml;
 use_ok 'PublicInbox::WwwAttach';
-my $config = PublicInbox::Config->new(\<<EOF);
-$cfgpfx.address=$addr
-$cfgpfx.inboxdir=$maindir
+
+my $cfgpath = "$tmpdir/config";
+open my $fh, '>', $cfgpath or BAIL_OUT $!;
+print $fh <<EOF or BAIL_OUT $!;
+[publicinbox "test"]
+	address = $addr
+	inboxdir = $inboxdir
 EOF
-my $git = PublicInbox::Git->new($maindir);
+close $fh or BAIL_OUT $!;
+my $config = PublicInbox::Config->new($cfgpath);
+my $git = PublicInbox::Git->new($inboxdir);
 my $im = PublicInbox::Import->new($git, 'test', $addr);
 $im->init_bare;
 
-{
-	my $qp = "abcdef=g\n==blah\n";
-	my $b64 = "b64\xde\xad\xbe\xef\n";
-	my $txt = "plain\ntext\npass\nthrough\n";
-	my $dot = "dotfile\n";
-	$im->add(eml_load('t/psgi_attach.eml'));
-	$im->add(eml_load('t/data/message_embed.eml'));
-	$im->done;
+my $qp = "abcdef=g\n==blah\n";
+my $b64 = "b64\xde\xad\xbe\xef\n";
+my $txt = "plain\ntext\npass\nthrough\n";
+my $dot = "dotfile\n";
+$im->add(eml_load('t/psgi_attach.eml'));
+$im->add(eml_load('t/data/message_embed.eml'));
+$im->done;
+
+my $www = PublicInbox::WWW->new($config);
+my $client = sub {
+	my ($cb) = @_;
+	my $res;
+	$res = $cb->(GET('/test/Z%40B/'));
+	my @href = ($res->content =~ /^href="([^"]+)"/gms);
+	@href = grep(/\A[\d\.]+-/, @href);
+	is_deeply([qw(1-queue-pee 2-bayce-sixty-four 3-noop.txt
+			4-a.txt)],
+		\@href, 'attachment links generated');
+
+	$res = $cb->(GET('/test/Z%40B/1-queue-pee'));
+	my $qp_res = $res->content;
+	ok(length($qp_res) >= length($qp), 'QP length is close');
+	like($qp_res, qr/\n\z/s, 'trailing newline exists');
+	# is(index($qp_res, $qp), 0, 'QP trailing newline is there');
+	$qp_res =~ s/\r\n/\n/g;
+	is(index($qp_res, $qp), 0, 'QP trailing newline is there');
+
+	$res = $cb->(GET('/test/Z%40B/2-base-sixty-four'));
+	is(quotemeta($res->content), quotemeta($b64),
+		'Base64 matches exactly');
 
-	my $www = PublicInbox::WWW->new($config);
-	test_psgi(sub { $www->call(@_) }, sub {
-		my ($cb) = @_;
-		my $res;
-		$res = $cb->(GET('/test/Z%40B/'));
-		my @href = ($res->content =~ /^href="([^"]+)"/gms);
-		@href = grep(/\A[\d\.]+-/, @href);
-		is_deeply([qw(1-queue-pee 2-bayce-sixty-four 3-noop.txt
-				4-a.txt)],
-			\@href, 'attachment links generated');
+	$res = $cb->(GET('/test/Z%40B/3-noop.txt'));
+	my $txt_res = $res->content;
+	ok(length($txt_res) >= length($txt),
+		'plain text almost matches');
+	like($txt_res, qr/\n\z/s, 'trailing newline exists in text');
+	is(index($txt_res, $txt), 0, 'plain text not truncated');
 
-		$res = $cb->(GET('/test/Z%40B/1-queue-pee'));
-		my $qp_res = $res->content;
-		ok(length($qp_res) >= length($qp), 'QP length is close');
-		like($qp_res, qr/\n\z/s, 'trailing newline exists');
-		# is(index($qp_res, $qp), 0, 'QP trailing newline is there');
-		$qp_res =~ s/\r\n/\n/g;
-		is(index($qp_res, $qp), 0, 'QP trailing newline is there');
+	$res = $cb->(GET('/test/Z%40B/4-a.txt'));
+	my $dot_res = $res->content;
+	ok(length($dot_res) >= length($dot), 'dot almost matches');
+	$res = $cb->(GET('/test/Z%40B/4-any-filename.txt'));
+	is($res->content, $dot_res, 'user-specified filename is OK');
 
-		$res = $cb->(GET('/test/Z%40B/2-base-sixty-four'));
-		is(quotemeta($res->content), quotemeta($b64),
-			'Base64 matches exactly');
+	my $mid = '20200418222508.GA13918@dcvr';
+	my $irt = '20200418222020.GA2745@dcvr';
+	$res = $cb->(GET("/test/$mid/"));
+	unlike($res->content, qr! multipart/mixed, Size: 0 bytes!,
+		'0-byte download not offered');
+	like($res->content, qr/\bhref="2-embed2x\.eml"/s,
+		'href to message/rfc822 attachment visible');
+	like($res->content, qr/\bhref="2\.1\.2-test\.eml"/s,
+		'href to nested message/rfc822 attachment visible');
 
-		$res = $cb->(GET('/test/Z%40B/3-noop.txt'));
-		my $txt_res = $res->content;
-		ok(length($txt_res) >= length($txt),
-			'plain text almost matches');
-		like($txt_res, qr/\n\z/s, 'trailing newline exists in text');
-		is(index($txt_res, $txt), 0, 'plain text not truncated');
+	$res = $cb->(GET("/test/$mid/2-embed2x.eml"));
+	my $eml = PublicInbox::Eml->new(\($res->content));
+	is_deeply([ $eml->header_raw('Message-ID') ], [ "<$irt>" ],
+		'got attached eml');
+	my @subs = $eml->subparts;
+	is(scalar(@subs), 2, 'attachment had 2 subparts');
+	like($subs[0]->body_str, qr/^testing embedded message\n*\z/sm,
+		'1st attachment is as expected');
+	is($subs[1]->header('Content-Type'), 'message/rfc822',
+		'2nd attachment is as expected');
 
-		$res = $cb->(GET('/test/Z%40B/4-a.txt'));
-		my $dot_res = $res->content;
-		ok(length($dot_res) >= length($dot), 'dot almost matches');
-		$res = $cb->(GET('/test/Z%40B/4-any-filename.txt'));
-		is($res->content, $dot_res, 'user-specified filename is OK');
+	$res = $cb->(GET("/test/$mid/2.1.2-test.eml"));
+	$eml = PublicInbox::Eml->new(\($res->content));
+	is_deeply([ $eml->header_raw('Message-ID') ],
+		[ '<20200418214114.7575-1-e@yhbt.net>' ],
+		'nested eml retrieved');
+};
 
-		my $mid = '20200418222508.GA13918@dcvr';
-		my $irt = '20200418222020.GA2745@dcvr';
-		$res = $cb->(GET("/test/$mid/"));
-		unlike($res->content, qr! multipart/mixed, Size: 0 bytes!,
-			'0-byte download not offered');
-		like($res->content, qr/\bhref="2-embed2x\.eml"/s,
-			'href to message/rfc822 attachment visible');
-		like($res->content, qr/\bhref="2\.1\.2-test\.eml"/s,
-			'href to nested message/rfc822 attachment visible');
+test_psgi(sub { $www->call(@_) }, $client);
+SKIP: {
+	diag 'testing with index indexed';
+	require_mods('DBD::SQLite', 19);
+	my $env = { PI_CONFIG => $cfgpath };
+	ok(run_script(['-index', $inboxdir], $env), 'indexed');
 
-		$res = $cb->(GET("/test/$mid/2-embed2x.eml"));
-		my $eml = PublicInbox::Eml->new(\($res->content));
-		is_deeply([ $eml->header_raw('Message-ID') ], [ "<$irt>" ],
-			'got attached eml');
-		my @subs = $eml->subparts;
-		is(scalar(@subs), 2, 'attachment had 2 subparts');
-		like($subs[0]->body_str, qr/^testing embedded message\n*\z/sm,
-			'1st attachment is as expected');
-		is($subs[1]->header('Content-Type'), 'message/rfc822',
-			'2nd attachment is as expected');
+	test_psgi(sub { $www->call(@_) }, $client);
 
-		$res = $cb->(GET("/test/$mid/2.1.2-test.eml"));
-		$eml = PublicInbox::Eml->new(\($res->content));
-		is_deeply([ $eml->header_raw('Message-ID') ],
-			[ '<20200418214114.7575-1-e@yhbt.net>' ],
-			'nested eml retrieved');
-	});
+	require_mods(qw(Plack::Test::ExternalServer), 18);
+	my $sock = tcp_server() or die;
+	my ($out, $err) = map { "$inboxdir/std$_.log" } qw(out err);
+	my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ];
+	my $td = start_script($cmd, $env, { 3 => $sock });
+	my ($h, $p) = ($sock->sockhost, $sock->sockport);
+	local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p";
+	Plack::Test::ExternalServer::test_psgi(client => $client);
 }
 done_testing();

  parent reply	other threads:[~2020-07-05 23:28 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-05 23:27 [PATCH 00/43] www: async git cat-file w/ -httpd Eric Wong
2020-07-05 23:27 ` [PATCH 01/43] gzipfilter: minor cleanups Eric Wong
2020-07-05 23:27 ` [PATCH 02/43] wwwstream: oneshot: perform gzip without middleware Eric Wong
2020-07-05 23:27 ` [PATCH 03/43] www*stream: gzip ->getline responses Eric Wong
2020-07-05 23:27 ` [PATCH 04/43] wwwtext: gzip text/plain responses, as well Eric Wong
2020-07-05 23:27 ` [PATCH 05/43] wwwtext: switch to html_oneshot Eric Wong
2020-07-05 23:27 ` [PATCH 06/43] www: need: use WwwStream::html_oneshot Eric Wong
2020-07-05 23:27 ` [PATCH 07/43] wwwlisting: use GzipFilter for HTML Eric Wong
2020-07-05 23:27 ` [PATCH 08/43] gzipfilter: replace Compress::Raw::Deflate usages Eric Wong
2020-07-05 23:27 ` [PATCH 09/43] {gzip,noop}filter: ->zmore returns undef, always Eric Wong
2020-07-05 23:27 ` [PATCH 10/43] mbox: remove html_oneshot import Eric Wong
2020-07-05 23:27 ` [PATCH 11/43] wwwstatic: support gzipped directory listings Eric Wong
2020-07-05 23:27 ` [PATCH 12/43] qspawn: learn to gzip streaming responses Eric Wong
2020-07-05 23:27 ` [PATCH 13/43] stop auto-loading Plack::Middleware::Deflater Eric Wong
2020-07-05 23:27 ` [PATCH 14/43] mboxgz: do asynchronous git blob retrievals Eric Wong
2020-07-05 23:27 ` [PATCH 15/43] mboxgz: reduce hash depth Eric Wong
2020-07-05 23:27 ` [PATCH 16/43] mbox: async blob fetch for "single message" raw mboxrd Eric Wong
2020-07-05 23:27 ` [PATCH 17/43] wwwatomstream: simplify feed_update callers Eric Wong
2020-07-05 23:27 ` [PATCH 18/43] wwwatomstream: use PublicInbox::Inbox->modified for feed_updated Eric Wong
2020-07-05 23:27 ` [PATCH 19/43] wwwatomstream: reuse $ctx as $self Eric Wong
2020-07-05 23:27 ` [PATCH 20/43] xt/httpd-async-stream: allow more options Eric Wong
2020-07-05 23:27 ` [PATCH 21/43] wwwatomstream: support async blob fetch Eric Wong
2020-07-05 23:27 ` [PATCH 22/43] wwwstream: reduce object graph depth Eric Wong
2020-07-05 23:27 ` [PATCH 23/43] wwwstream: reduce blob fetch paths for ->getline Eric Wong
2020-07-05 23:27 ` [PATCH 24/43] www: start making gzipfilter the parent response class Eric Wong
2020-07-05 23:27 ` [PATCH 25/43] remove unused/redundant zlib-related imports Eric Wong
2020-07-05 23:27 ` [PATCH 26/43] wwwstream: use parent.pm and no warnings Eric Wong
2020-07-05 23:27 ` [PATCH 27/43] wwwstream: subclass off GzipFilter Eric Wong
2020-07-05 23:27 ` [PATCH 28/43] view: make /$INBOX/$MSGID/ permalink async Eric Wong
2020-07-05 23:27 ` [PATCH 29/43] view: /$INBOX/$MSGID/t/ reads blobs asynchronously Eric Wong
2020-07-05 23:27 ` [PATCH 30/43] view: update /$INBOX/$MSGID/T/ to be async Eric Wong
2020-07-05 23:27 ` [PATCH 31/43] feed: generate_i: eliminate pointless loop Eric Wong
2020-07-05 23:27 ` [PATCH 32/43] feed: /$INBOX/new.html fetches blobs asynchronously Eric Wong
2020-07-05 23:27 ` [PATCH 33/43] ssearchview: /$INBOX/?q=$QUERY&x=t uses async blobs Eric Wong
2020-07-05 23:27 ` [PATCH 34/43] view: eml_entry: reduce parameters Eric Wong
2020-07-05 23:27 ` [PATCH 35/43] view: /$INBOX/$MSGID/t/: avoid extra hash lookup in eml case Eric Wong
2020-07-05 23:27 ` [PATCH 36/43] wwwstream: eliminate ::response, use html_oneshot Eric Wong
2020-07-05 23:27 ` [PATCH 37/43] www: update internal docs Eric Wong
2020-07-05 23:27 ` [PATCH 38/43] view: simplify eml_entry callers further Eric Wong
2020-07-05 23:27 ` [PATCH 39/43] wwwtext: simplify gzf_maybe use Eric Wong
2020-07-05 23:27 ` Eric Wong [this message]
2020-07-05 23:27 ` [PATCH 41/43] gzipfilter: drop HTTP connection on bugs or data corruption Eric Wong
2020-07-05 23:27 ` [PATCH 42/43] daemon: warn on missing blobs Eric Wong
2020-07-05 23:27 ` [PATCH 43/43] gzipfilter: check http->{forward} for client disconnects Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200705232759.3161-41-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    --subject='Re: [PATCH 40/43] wwwattach: support async blob retrievals' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ https://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/inbox.comp.mail.public-inbox.meta
	nntp://ie5yzdi7fg72h7s4sdcztq5evakq23rdt33mfyfcddc5u3ndnw24ogqd.onion/inbox.comp.mail.public-inbox.meta
	nntp://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git