From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 8962D1F956 for ; Sun, 5 Jul 2020 23:28:15 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 40/43] wwwattach: support async blob retrievals Date: Sun, 5 Jul 2020 23:27:56 +0000 Message-Id: <20200705232759.3161-41-e@yhbt.net> In-Reply-To: <20200705232759.3161-1-e@yhbt.net> References: <20200705232759.3161-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We can reuse some of the GzipFilter infrastructure used by other WWW components to handle slow blob retrieval, here. The difference from previous changes is we don't decide on the 200 status code until we've retrieved the blob and found the attachment. While we're at it, ensure we can compress text attachment responses once again, since all text attachments are served as text/plain. --- lib/PublicInbox/WwwAttach.pm | 63 +++++++++++--- t/psgi_attach.t | 162 ++++++++++++++++++++--------------- 2 files changed, 144 insertions(+), 81 deletions(-) diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm index 7e8496d7a..20417295e 100644 --- a/lib/PublicInbox/WwwAttach.pm +++ b/lib/PublicInbox/WwwAttach.pm @@ -4,15 +4,16 @@ # For retrieving attachments from messages in the WWW interface package PublicInbox::WwwAttach; # internal package use strict; -use warnings; +use parent qw(PublicInbox::GzipFilter); use bytes (); # only for bytes::length use PublicInbox::EmlContentFoo qw(parse_content_type); use PublicInbox::Eml; sub get_attach_i { # ->each_part callback my ($part, $depth, $idx) = @{$_[0]}; - my $res = $_[1]; - return if $idx ne $res->[3]; # [0-9]+(?:\.[0-9]+)+ + my $ctx = $_[1]; + return if $idx ne $ctx->{idx}; # [0-9]+(?:\.[0-9]+)+ + my $res = $ctx->{res}; $res->[0] = 200; my $ct = $part->content_type; $ct = parse_content_type($ct) if $ct; @@ -23,24 +24,64 @@ sub get_attach_i { # ->each_part callback if ($cset && ($cset =~ /\A[a-zA-Z0-9_\-]+\z/)) { $res->[1]->[1] .= qq(; charset=$cset); } + $ctx->{gz} = PublicInbox::GzipFilter::gz_or_noop($res->[1], + $ctx->{env}); + $part = $ctx->zflush($part->body); } else { # TODO: allow user to configure safe types $res->[1]->[1] = 'application/octet-stream'; + $part = $part->body; } - $part = $part->body; push @{$res->[1]}, 'Content-Length', bytes::length($part); $res->[2]->[0] = $part; } +sub async_eml { # ->{async_eml} for async_blob_cb + my ($ctx, $eml) = @_; + eval { $eml->each_part(\&get_attach_i, $ctx, 1) }; + if ($@) { + $ctx->{res}->[0] = 500; + warn "E: $@"; + } +} + +sub async_next { + my ($http) = @_; + my $ctx = $http->{forward} or return; # client aborted + # finally, we call the user-supplied callback + eval { $ctx->{wcb}->($ctx->{res}) }; + warn "E: $@" if $@; +} + +sub scan_attach ($) { # public-inbox-httpd only + my ($ctx) = @_; + $ctx->{env}->{'psgix.io'}->{forward} = $ctx; + $ctx->{async_eml} = \&async_eml; + $ctx->{async_next} = \&async_next; + $ctx->smsg_blob($ctx->{smsg}); +} + # /$LISTNAME/$MESSAGE_ID/$IDX-$FILENAME sub get_attach ($$$) { my ($ctx, $idx, $fn) = @_; - my $res = [ 404, [ 'Content-Type', 'text/plain' ], [ "Not found\n" ] ]; - my $mime = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return $res; - $mime = PublicInbox::Eml->new($mime); - $res->[3] = $idx; - $mime->each_part(\&get_attach_i, $res, 1); - pop @$res; # cleanup before letting PSGI server see it - $res + $ctx->{res} = [ 404, [ 'Content-Type' => 'text/plain' ], + [ "Not found\n" ] ]; + $ctx->{idx} = $idx; + bless $ctx, __PACKAGE__; + my $eml; + if ($ctx->{smsg} = $ctx->{-inbox}->smsg_by_mid($ctx->{mid})) { + return sub { # public-inbox-httpd-only + $ctx->{wcb} = $_[0]; + scan_attach($ctx); + } if $ctx->{env}->{'pi-httpd.async'}; + # generic PSGI: + $eml = $ctx->{-inbox}->smsg_eml($ctx->{smsg}); + } elsif (!$ctx->{-inbox}->over) { + if (my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid})) { + $eml = PublicInbox::Eml->new($bref); + } + } + $eml->each_part(\&get_attach_i, $ctx, 1) if $eml; + $ctx->{res} } 1; diff --git a/t/psgi_attach.t b/t/psgi_attach.t index 9a734f813..14d20adb1 100644 --- a/t/psgi_attach.t +++ b/t/psgi_attach.t @@ -5,9 +5,8 @@ use warnings; use Test::More; use PublicInbox::TestCommon; my ($tmpdir, $for_destroy) = tmpdir(); -my $maindir = "$tmpdir/main.git"; +my $inboxdir = "$tmpdir/main.git"; my $addr = 'test-public@example.com'; -my $cfgpfx = "publicinbox.test"; my @mods = qw(HTTP::Request::Common Plack::Builder Plack::Test URI::Escape); require_mods(@mods); use_ok $_ foreach @mods; @@ -17,85 +16,108 @@ use PublicInbox::Git; use PublicInbox::Config; use PublicInbox::Eml; use_ok 'PublicInbox::WwwAttach'; -my $config = PublicInbox::Config->new(\<', $cfgpath or BAIL_OUT $!; +print $fh <new($maindir); +close $fh or BAIL_OUT $!; +my $config = PublicInbox::Config->new($cfgpath); +my $git = PublicInbox::Git->new($inboxdir); my $im = PublicInbox::Import->new($git, 'test', $addr); $im->init_bare; -{ - my $qp = "abcdef=g\n==blah\n"; - my $b64 = "b64\xde\xad\xbe\xef\n"; - my $txt = "plain\ntext\npass\nthrough\n"; - my $dot = "dotfile\n"; - $im->add(eml_load('t/psgi_attach.eml')); - $im->add(eml_load('t/data/message_embed.eml')); - $im->done; +my $qp = "abcdef=g\n==blah\n"; +my $b64 = "b64\xde\xad\xbe\xef\n"; +my $txt = "plain\ntext\npass\nthrough\n"; +my $dot = "dotfile\n"; +$im->add(eml_load('t/psgi_attach.eml')); +$im->add(eml_load('t/data/message_embed.eml')); +$im->done; + +my $www = PublicInbox::WWW->new($config); +my $client = sub { + my ($cb) = @_; + my $res; + $res = $cb->(GET('/test/Z%40B/')); + my @href = ($res->content =~ /^href="([^"]+)"/gms); + @href = grep(/\A[\d\.]+-/, @href); + is_deeply([qw(1-queue-pee 2-bayce-sixty-four 3-noop.txt + 4-a.txt)], + \@href, 'attachment links generated'); + + $res = $cb->(GET('/test/Z%40B/1-queue-pee')); + my $qp_res = $res->content; + ok(length($qp_res) >= length($qp), 'QP length is close'); + like($qp_res, qr/\n\z/s, 'trailing newline exists'); + # is(index($qp_res, $qp), 0, 'QP trailing newline is there'); + $qp_res =~ s/\r\n/\n/g; + is(index($qp_res, $qp), 0, 'QP trailing newline is there'); + + $res = $cb->(GET('/test/Z%40B/2-base-sixty-four')); + is(quotemeta($res->content), quotemeta($b64), + 'Base64 matches exactly'); - my $www = PublicInbox::WWW->new($config); - test_psgi(sub { $www->call(@_) }, sub { - my ($cb) = @_; - my $res; - $res = $cb->(GET('/test/Z%40B/')); - my @href = ($res->content =~ /^href="([^"]+)"/gms); - @href = grep(/\A[\d\.]+-/, @href); - is_deeply([qw(1-queue-pee 2-bayce-sixty-four 3-noop.txt - 4-a.txt)], - \@href, 'attachment links generated'); + $res = $cb->(GET('/test/Z%40B/3-noop.txt')); + my $txt_res = $res->content; + ok(length($txt_res) >= length($txt), + 'plain text almost matches'); + like($txt_res, qr/\n\z/s, 'trailing newline exists in text'); + is(index($txt_res, $txt), 0, 'plain text not truncated'); - $res = $cb->(GET('/test/Z%40B/1-queue-pee')); - my $qp_res = $res->content; - ok(length($qp_res) >= length($qp), 'QP length is close'); - like($qp_res, qr/\n\z/s, 'trailing newline exists'); - # is(index($qp_res, $qp), 0, 'QP trailing newline is there'); - $qp_res =~ s/\r\n/\n/g; - is(index($qp_res, $qp), 0, 'QP trailing newline is there'); + $res = $cb->(GET('/test/Z%40B/4-a.txt')); + my $dot_res = $res->content; + ok(length($dot_res) >= length($dot), 'dot almost matches'); + $res = $cb->(GET('/test/Z%40B/4-any-filename.txt')); + is($res->content, $dot_res, 'user-specified filename is OK'); - $res = $cb->(GET('/test/Z%40B/2-base-sixty-four')); - is(quotemeta($res->content), quotemeta($b64), - 'Base64 matches exactly'); + my $mid = '20200418222508.GA13918@dcvr'; + my $irt = '20200418222020.GA2745@dcvr'; + $res = $cb->(GET("/test/$mid/")); + unlike($res->content, qr! multipart/mixed, Size: 0 bytes!, + '0-byte download not offered'); + like($res->content, qr/\bhref="2-embed2x\.eml"/s, + 'href to message/rfc822 attachment visible'); + like($res->content, qr/\bhref="2\.1\.2-test\.eml"/s, + 'href to nested message/rfc822 attachment visible'); - $res = $cb->(GET('/test/Z%40B/3-noop.txt')); - my $txt_res = $res->content; - ok(length($txt_res) >= length($txt), - 'plain text almost matches'); - like($txt_res, qr/\n\z/s, 'trailing newline exists in text'); - is(index($txt_res, $txt), 0, 'plain text not truncated'); + $res = $cb->(GET("/test/$mid/2-embed2x.eml")); + my $eml = PublicInbox::Eml->new(\($res->content)); + is_deeply([ $eml->header_raw('Message-ID') ], [ "<$irt>" ], + 'got attached eml'); + my @subs = $eml->subparts; + is(scalar(@subs), 2, 'attachment had 2 subparts'); + like($subs[0]->body_str, qr/^testing embedded message\n*\z/sm, + '1st attachment is as expected'); + is($subs[1]->header('Content-Type'), 'message/rfc822', + '2nd attachment is as expected'); - $res = $cb->(GET('/test/Z%40B/4-a.txt')); - my $dot_res = $res->content; - ok(length($dot_res) >= length($dot), 'dot almost matches'); - $res = $cb->(GET('/test/Z%40B/4-any-filename.txt')); - is($res->content, $dot_res, 'user-specified filename is OK'); + $res = $cb->(GET("/test/$mid/2.1.2-test.eml")); + $eml = PublicInbox::Eml->new(\($res->content)); + is_deeply([ $eml->header_raw('Message-ID') ], + [ '<20200418214114.7575-1-e@yhbt.net>' ], + 'nested eml retrieved'); +}; - my $mid = '20200418222508.GA13918@dcvr'; - my $irt = '20200418222020.GA2745@dcvr'; - $res = $cb->(GET("/test/$mid/")); - unlike($res->content, qr! multipart/mixed, Size: 0 bytes!, - '0-byte download not offered'); - like($res->content, qr/\bhref="2-embed2x\.eml"/s, - 'href to message/rfc822 attachment visible'); - like($res->content, qr/\bhref="2\.1\.2-test\.eml"/s, - 'href to nested message/rfc822 attachment visible'); +test_psgi(sub { $www->call(@_) }, $client); +SKIP: { + diag 'testing with index indexed'; + require_mods('DBD::SQLite', 19); + my $env = { PI_CONFIG => $cfgpath }; + ok(run_script(['-index', $inboxdir], $env), 'indexed'); - $res = $cb->(GET("/test/$mid/2-embed2x.eml")); - my $eml = PublicInbox::Eml->new(\($res->content)); - is_deeply([ $eml->header_raw('Message-ID') ], [ "<$irt>" ], - 'got attached eml'); - my @subs = $eml->subparts; - is(scalar(@subs), 2, 'attachment had 2 subparts'); - like($subs[0]->body_str, qr/^testing embedded message\n*\z/sm, - '1st attachment is as expected'); - is($subs[1]->header('Content-Type'), 'message/rfc822', - '2nd attachment is as expected'); + test_psgi(sub { $www->call(@_) }, $client); - $res = $cb->(GET("/test/$mid/2.1.2-test.eml")); - $eml = PublicInbox::Eml->new(\($res->content)); - is_deeply([ $eml->header_raw('Message-ID') ], - [ '<20200418214114.7575-1-e@yhbt.net>' ], - 'nested eml retrieved'); - }); + require_mods(qw(Plack::Test::ExternalServer), 18); + my $sock = tcp_server() or die; + my ($out, $err) = map { "$inboxdir/std$_.log" } qw(out err); + my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ]; + my $td = start_script($cmd, $env, { 3 => $sock }); + my ($h, $p) = ($sock->sockhost, $sock->sockport); + local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p"; + Plack::Test::ExternalServer::test_psgi(client => $client); } done_testing();