From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 0A0271F9FC; Mon, 25 Oct 2021 02:45:54 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Cc: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Subject: [PATCH 2/2] www: $MSGID/raw: set charset in HTTP response Date: Mon, 25 Oct 2021 02:45:53 +0000 Message-Id: <20211025024553.14875-3-e@80x24.org> In-Reply-To: <20211025024553.14875-1-e@80x24.org> References: <20211025024553.14875-1-e@80x24.org> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit List-Id: By using the charset specified in the message, web browsers are more likely to display the raw text properly for human readers. Inspired by a patch by Thomas Weißschuh: https://public-inbox.org/meta/20211024214337.161779-3-thomas@t-8ch.de/ Cc: Thomas Weißschuh --- lib/PublicInbox/GzipFilter.pm | 19 +++++++++++++------ lib/PublicInbox/Mbox.pm | 24 +++++++++++++----------- t/plack.t | 26 +++++++++++++++++++++++--- t/psgi_v2.t | 5 ++++- 4 files changed, 53 insertions(+), 21 deletions(-) diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm index c4858a971495..e37f1f76bd4a 100644 --- a/lib/PublicInbox/GzipFilter.pm +++ b/lib/PublicInbox/GzipFilter.pm @@ -46,11 +46,10 @@ sub gz_or_noop { sub gzf_maybe ($$) { bless { gz => gz_or_noop(@_) }, __PACKAGE__ } sub psgi_response { + # $code may be an HTTP response code (e.g. 200) or a CODE ref (mbox_hdr) my ($self, $code, $res_hdr) = @_; - my $env = $self->{env}; - $self->{gz} //= gz_or_noop($res_hdr, $env); - if ($env->{'pi-httpd.async'}) { - my $http = $env->{'psgix.io'}; # PublicInbox::HTTP + if ($self->{env}->{'pi-httpd.async'}) { + my $http = $self->{env}->{'psgix.io'}; # PublicInbox::HTTP $http->{forward} = $self; sub { my ($wcb) = @_; # -httpd provided write callback @@ -58,6 +57,9 @@ sub psgi_response { $self->can('async_next')->($http); # start stepping }; } else { # generic PSGI code path + ref($code) eq 'CODE' and + ($code, $res_hdr) = @{$code->($self)}; + $self->{gz} //= gz_or_noop($res_hdr, $self->{env}); [ $code, $res_hdr, $self ]; } } @@ -116,9 +118,13 @@ sub translate ($$) { sub http_out ($) { my ($self) = @_; - $self->{http_out} //= do { + $self->{http_out} // do { my $args = delete $self->{wcb_args} // return undef; - pop(@$args)->($args); # $wcb->([$code, $hdr_ary]) + my $wcb = pop @$args; # from PublicInbox:HTTP async + # $args->[0] may be \&mbox_hdr or similar + $args = $args->[0]->($self) if ref($args->[0]) eq 'CODE'; + $self->{gz} //= gz_or_noop($args->[1], $self->{env}); + $self->{http_out} = $wcb->($args); # $wcb->([$code, $hdr_ary]) }; } @@ -131,6 +137,7 @@ sub write { # more data to buffer after this sub zmore { my $self = $_[0]; # $_[1] => input + http_out($self); my $err = $self->{gz}->deflate($_[1], $self->{zbuf}); die "gzip->deflate: $err" if $err != Z_OK; undef; diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 4f84eea6745d..b977308d0541 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -18,7 +18,7 @@ sub getline { my ($ctx) = @_; # ctx my $smsg = $ctx->{smsg} or return; my $ibx = $ctx->{ibx}; - my $eml = $ibx->smsg_eml($smsg) or return; + my $eml = delete($ctx->{eml}) // $ibx->smsg_eml($smsg) // return; my $n = $ctx->{smsg} = $ibx->over->next_by_mid(@{$ctx->{next_arg}}); $ctx->zmore(msg_hdr($ctx, $eml)); if ($n) { @@ -45,14 +45,15 @@ sub async_eml { # for async_blob_cb my $smsg = delete $ctx->{smsg}; # next message $ctx->{smsg} = $ctx->{ibx}->over->next_by_mid(@{$ctx->{next_arg}}); - + local $ctx->{eml} = $eml; # for mbox_hdr $ctx->zmore(msg_hdr($ctx, $eml)); $ctx->write(msg_body($eml)); } -sub res_hdr ($$) { - my ($ctx, $subject) = @_; - my $fn = $subject // ''; +sub mbox_hdr ($) { + my ($ctx) = @_; + my $eml = $ctx->{eml} //= $ctx->{ibx}->smsg_eml($ctx->{smsg}); + my $fn = $eml->header_str('Subject') // ''; $fn =~ s/^re:\s+//i; $fn = to_filename($fn) // 'no-subject'; my @hdr = ('Content-Type'); @@ -64,17 +65,19 @@ sub res_hdr ($$) { push @hdr, 'text/plain'; $fn .= '.txt'; } + my $cs = $ctx->{eml}->ct->{attributes}->{charset} // 'UTF-8'; + $cs = 'UTF-8' if $cs =~ /[^a-zA-Z0-9\-\_]/; # avoid header injection + $hdr[-1] .= "; charset=$cs"; push @hdr, 'Content-Disposition', "inline; filename=$fn"; - \@hdr; + [ 200, \@hdr ]; } # for rare cases where v1 inboxes aren't indexed w/ ->over at all sub no_over_raw ($) { my ($ctx) = @_; my $mref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return; - my $eml = PublicInbox::Eml->new($mref); - [ 200, res_hdr($ctx, $eml->header_str('Subject')), - [ msg_hdr($ctx, $eml) . msg_body($eml) ] ] + my $eml = $ctx->{eml} = PublicInbox::Eml->new($mref); + [ @{mbox_hdr($ctx)}, [ msg_hdr($ctx, $eml) . msg_body($eml) ] ] } # /$INBOX/$MESSAGE_ID/raw @@ -85,9 +88,8 @@ sub emit_raw { my ($id, $prev); my $mip = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; my $smsg = $ctx->{smsg} = $over->next_by_mid(@$mip) or return; - my $res_hdr = res_hdr($ctx, $smsg->{subject}); bless $ctx, __PACKAGE__; - $ctx->psgi_response(200, $res_hdr); + $ctx->psgi_response(\&mbox_hdr); } sub msg_hdr ($$) { diff --git a/t/plack.t b/t/plack.t index 40ff2baa7273..e4dedce6a844 100644 --- a/t/plack.t +++ b/t/plack.t @@ -10,17 +10,24 @@ require_mods(@mods); foreach my $mod (@mods) { use_ok $mod; } ok(-f $psgi, "psgi example file found"); my $pfx = 'http://example.com/test'; -# ensure successful message delivery -my $ibx = create_inbox('test', sub { +my $eml = eml_load('t/iso-2202-jp.eml'); +# ensure successful message deliveries +my $ibx = create_inbox('test-1', sub { my ($im, $ibx) = @_; my $addr = $ibx->{-primary_address}; - $im->add(PublicInbox::Eml->new(<add'; + $im->add($eml) or xbail '->add'; + $eml->header_set('Content-Type', + "text/plain; charset=\rso\rb\0gus\rithurts"); + $eml->header_set('Message-ID', ''); + $im->add($eml) or xbail '->add'; + $im->add(PublicInbox::Eml->new(<add'; From: Me To: You Cc: $addr Message-Id: Subject: hihi Date: Fri, 02 Oct 1993 00:00:00 +0000 +Content-Type: text/plain; charset=iso-8859-1 > quoted text zzzzzz @@ -195,6 +202,19 @@ test_psgi($app, sub { my $res = $cb->(GET($pfx . '/blah@example.com/raw')); is(200, $res->code, 'success response received for /*/raw'); like($res->content, qr!^From !sm, "mbox returned"); + is($res->header('Content-Type'), 'text/plain; charset=iso-8859-1', + 'charset from message used'); + + $res = $cb->(GET($pfx . '/broken@example.com/raw')); + is($res->header('Content-Type'), 'text/plain; charset=UTF-8', + 'broken charset ignored'); + + $res = $cb->(GET($pfx . '/199707281508.AAA24167@hoyogw.example/raw')); + is($res->header('Content-Type'), 'text/plain; charset=ISO-2022-JP', + 'ISO-2002-JP returned'); + chomp(my $body = $res->content); + my $raw = PublicInbox::Eml->new(\$body); + is($raw->body_raw, $eml->body_raw, 'ISO-2022-JP body unmodified'); $res = $cb->(GET($pfx . '/blah@example.com/t.mbox.gz')); is(501, $res->code, '501 when overview missing'); diff --git a/t/psgi_v2.t b/t/psgi_v2.t index 64c1a8d38a0a..7d73b606dbef 100644 --- a/t/psgi_v2.t +++ b/t/psgi_v2.t @@ -20,11 +20,12 @@ To: test@example.com Subject: this is a subject Message-ID: Date: Fri, 02 Oct 1993 00:00:00 +0000 +Content-Type: text/plain; charset=iso-8859-1 hello world EOF my $new_mid; -my $ibx = create_inbox 'v2', version => 2, indexlevel => 'medium', +my $ibx = create_inbox 'v2-1', version => 2, indexlevel => 'medium', tmpdir => "$tmpdir/v2", sub { my ($im, $ibx) = @_; $im->add($eml) or BAIL_OUT; @@ -68,6 +69,8 @@ my $client0 = sub { like($res->content, qr!\$INBOX_DIR/description missing!, 'got v2 description missing message'); $res = $cb->(GET('/v2test/a-mid@b/raw')); + is($res->header('Content-Type'), 'text/plain; charset=iso-8859-1', + 'charset from message used'); $raw = $res->content; unlike($raw, qr/^From oldbug/sm, 'buggy "From_" line omitted'); like($raw, qr/^hello world$/m, 'got first message');