From e60231148eb604a379033c69e8c4494eb1753783 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 16 May 2020 10:03:22 +0000 Subject: descend into message/(rfc822|news|global) parts Email::MIME never supported this properly, but there are real instances of forwarded messages as message/rfc822 attachments. message/news is a legacy thing which we'll see in archives, and message/global appears to be the new thing. gmime also supports message/rfc2822, so we'll support it anyways despite lacking other evidence of its existence. Existing attachments remain downloadable as a whole message, but individual attachments of subparts are now downloadable and can be displayed in HTML, too. Furthermore, ensure Xapian can now search for common headers inside those messages as well as the message bodies. --- lib/PublicInbox/Eml.pm | 37 ++++++++++++++++++++++++++++------ lib/PublicInbox/MsgIter.pm | 6 +++++- lib/PublicInbox/SearchIdx.pm | 47 +++++++++++++++++++++++++++----------------- lib/PublicInbox/View.pm | 30 +++++++++++++++++++++++++--- 4 files changed, 92 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index ef401141..6f6874cd 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -60,6 +60,14 @@ my %DECODE_FULL = ( our %STR_TYPE = (text => 1); our %STR_SUBTYPE = (plain => 1, html => 1); +# message/* subtypes we descend into +our %MESSAGE_DESCEND = ( + news => 1, # RFC 1849 (obsolete, but archives are forever) + rfc822 => 1, # RFC 2046 + rfc2822 => 1, # gmime handles this (but not rfc5322) + global => 1, # RFC 6532 +); + my %re_memo; sub re_memo ($) { my ($k) = @_; @@ -149,13 +157,25 @@ sub ct ($) { } # returns a queue of sub-parts iff it's worth descending into -# TODO: descend into message/rfc822 parts (Email::MIME didn't) sub mp_descend ($$) { my ($self, $nr) = @_; # or $once for top-level - my $bnd = ct($self)->{attributes}->{boundary} // return; # single-part + my $ct = ct($self); + my $type = 
lc($ct->{type}); + if ($type eq 'message' && $MESSAGE_DESCEND{lc($ct->{subtype})}) { + my $nxt = new(undef, body_raw($self)); + $self->{-call_cb} = $nxt->{is_submsg} = 1; + return [ $nxt ]; + } + return if $type ne 'multipart'; + my $bnd = $ct->{attributes}->{boundary} // return; # single-part return if $bnd eq '' || length($bnd) >= $mime_boundary_length_limit; $bnd = quotemeta($bnd); + # this is a multipart message that didn't get descended into in + # public-inbox <= 1.5.0, so ensure we call the user callback for + # this part to not break PSGI downloads. + $self->{-call_cb} = $self->{is_submsg}; + # "multipart" messages can exist w/o a body my $bdy = ($nr ? delete($self->{bdy}) : \(body_raw($self))) or return; @@ -189,14 +209,15 @@ sub mp_descend ($$) { # compatibility with Email::MIME $parts[-1] =~ s/\n\r?\n\z/\n/s if $epilogue_missing; - @parts = grep /[^ \t\r\n]/s, @parts; # ignore empty parts + # ignore empty parts + @parts = map { new_sub(undef, \$_) } grep /[^ \t\r\n]/s, @parts; # Keep "From: someone..." 
from preamble in old, # buggy versions of git-send-email, otherwise drop it # There's also a case where quoted text showed up in the # preamble # <20060515162817.65F0F1BBAE@citi.umich.edu> - unshift(@parts, $pre) if $pre =~ /:/s; + unshift(@parts, new_sub(undef, \$pre)) if $pre =~ /:/s; return \@parts; } # "multipart", but no boundary found, treat as single part @@ -217,6 +238,9 @@ sub each_part { my ($self, $cb, $arg, $once) = @_; my $p = mp_descend($self, $once // 0) or return $cb->([$self, 0, 0], $arg); + + $cb->([$self, 0, 0], $arg) if $self->{-call_cb}; # rare + $p = [ $p, 0 ]; my @s; # our virtual stack my $nr = 0; @@ -226,11 +250,12 @@ sub each_part { my (undef, @idx) = @$p; @idx = (join('.', @idx)); my $depth = ($idx[0] =~ tr/././) + 1; - my $sub = new_sub(undef, \(shift @{$p->[0]})); + my $sub = shift @{$p->[0]}; if ($depth < $mime_nesting_limit && (my $nxt = mp_descend($sub, $nr))) { push(@s, $p) if scalar @{$p->[0]}; $p = [ $nxt, @idx, 0 ]; + $cb->([$sub, $depth, @idx], $arg) if $sub->{-call_cb}; } else { # a leaf node $cb->([$sub, $depth, @idx], $arg); } @@ -270,7 +295,7 @@ sub subparts { if ($$bdy =~ /^--\Q$bnd\E--[ \t]*\r?\n(.+)\z/sm) { $self->{epilogue} = $1; } - map { new_sub(undef, \$_) } @$parts; + @$parts; } sub parts_set { diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm index 7c28d019..5ec2a4d9 100644 --- a/lib/PublicInbox/MsgIter.pm +++ b/lib/PublicInbox/MsgIter.pm @@ -64,8 +64,12 @@ sub msg_part_text ($$) { # times when it should not have been: # <87llgalspt.fsf@free.fr> # <200308111450.h7BEoOu20077@mail.osdl.org> + # But also do not try this with ->{is_submsg} (message/rfc822), + # since a broken multipart/mixed inside a message/rfc822 part + # has not been seen in the wild, yet... 
if ($err && ($ct =~ m!\btext/\b!i || - $ct =~ m!\bmultipart/mixed\b!i)) { + (!$part->{is_submsg} && + $ct =~ m!\bmultipart/mixed\b!i) ) ) { my $cte = $part->header_raw('Content-Transfer-Encoding'); if (defined($cte) && $cte =~ /\b7bit\b/i) { $s = $part->body; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 4bdd69f5..5f5ae895 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -284,6 +284,13 @@ sub index_xapian { # msg_iter callback if (defined $fn && $fn ne '') { index_text($self, $fn, 1, 'XFN'); } + if ($part->{is_submsg}) { + my $mids = mids_for_index($part); + index_ids($self, $doc, $part, $mids); + my $smsg = PublicInbox::Smsg->new($part); + index_users($self, $smsg); + index_text($self, $smsg->subject, 1, 'S') if $smsg->subject; + } my ($s, undef) = msg_part_text($part, $ct); defined $s or return; @@ -307,6 +314,27 @@ sub index_xapian { # msg_iter callback } } +sub index_ids ($$$$) { + my ($self, $doc, $hdr, $mids) = @_; + for my $mid (@$mids) { + index_text($self, $mid, 1, 'XM'); + + # because too many Message-IDs are prefixed with + # "Pine.LNX."... + if ($mid =~ /\w{12,}/) { + my @long = ($mid =~ /(\w{3,}+)/g); + index_text($self, join(' ', @long), 1, 'XM'); + } + } + $doc->add_boolean_term('Q' . $_) for @$mids; + for my $l ($hdr->header_raw('List-Id')) { + $l =~ /<([^>]+)>/ or next; + my $lid = $1; + $doc->add_boolean_term('G' . $lid); + index_text($self, $lid, 1, 'XL'); # probabilistic + } +} + sub add_xapian ($$$$) { my ($self, $mime, $smsg, $mids) = @_; $smsg->{mime} = $mime; # XXX dangerous @@ -321,22 +349,12 @@ sub add_xapian ($$$$) { add_val($doc, PublicInbox::Search::DT(), $dt); my $tg = term_generator($self); - $tg->set_document($doc); index_text($self, $subj, 1, 'S') if $subj; index_users($self, $smsg); msg_iter($mime, \&index_xapian, [ $self, $doc ]); - foreach my $mid (@$mids) { - index_text($self, $mid, 1, 'XM'); - - # because too many Message-IDs are prefixed with - # "Pine.LNX."... 
- if ($mid =~ /\w{12,}/) { - my @long = ($mid =~ /(\w{3,}+)/g); - index_text($self, join(' ', @long), 1, 'XM'); - } - } + index_ids($self, $doc, $hdr, $mids); $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids); my $data = $smsg->to_doc_data; @@ -351,13 +369,6 @@ sub add_xapian ($$$$) { } } } - $doc->add_boolean_term('Q' . $_) foreach @$mids; - for my $l ($hdr->header_raw('List-Id')) { - $l =~ /<([^>]+)>/ or next; - my $lid = $1; - $doc->add_boolean_term('G' . $lid); - index_text($self, $lid, 1, 'XL'); # probabilistic - } $self->{xdb}->replace_document($smsg->{num}, $doc); } diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 7264f4b6..93a5b329 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -17,6 +17,7 @@ use PublicInbox::Address; use PublicInbox::WwwStream; use PublicInbox::Reply; use PublicInbox::ViewDiff qw(flush_diff); +use PublicInbox::Eml; use POSIX qw(strftime); use Time::Local qw(timegm); use PublicInbox::Smsg qw(subject_normalized); @@ -480,6 +481,21 @@ sub multipart_text_as_html { $_[0]->each_part(\&add_text_body, $_[1], 1); } +sub submsg_hdr ($$) { + my ($ctx, $eml) = @_; + my $obfs_ibx = $ctx->{-obfs_ibx}; + my $rv = $ctx->{obuf}; + $$rv .= "\n"; + for my $h (qw(From To Cc Subject Date Message-ID X-Alt-Message-ID)) { + my @v = $eml->header($h); + for my $v (@v) { + obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; + $v = ascii_html($v); + $$rv .= "$h: $v\n"; + } + } +} + sub attach_link ($$$$;$) { my ($ctx, $ct, $p, $fn, $err) = @_; my ($part, $depth, $idx) = @$p; @@ -511,6 +527,9 @@ EOF $desc = ascii_html($desc); $$rv .= ($desc eq '') ? 
"$ts --]" : "$desc --]\n[-- $ts --]"; $$rv .= "\n"; + + submsg_hdr($ctx, $part) if $part->{is_submsg}; + undef; } @@ -518,6 +537,7 @@ sub add_text_body { # callback for each_part my ($p, $ctx) = @_; my $upfx = $ctx->{mhref}; my $ibx = $ctx->{-inbox}; + my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; # $p - from each_part: [ Email::MIME-like, depth, $idx ] my ($part, $depth, $idx) = @$p; my $ct = $part->content_type || 'text/plain'; @@ -525,6 +545,12 @@ sub add_text_body { # callback for each_part my ($s, $err) = msg_part_text($part, $ct); return attach_link($ctx, $ct, $p, $fn) unless defined $s; + my $rv = $ctx->{obuf}; + if ($part->{is_submsg}) { + submsg_hdr($ctx, $part); + $$rv .= "\n"; + } + # makes no difference to browsers, and don't screw up filename # link generation in diffs with the extra '%0D' $s =~ s/\r\n/\n/sg; @@ -571,13 +597,11 @@ sub add_text_body { # callback for each_part # split off quoted and unquoted blocks: my @sections = PublicInbox::MsgIter::split_quotes($s); undef $s; # free memory - my $rv = $ctx->{obuf}; - if (defined($fn) || $depth > 0 || $err) { + if (defined($fn) || ($depth > 0 && !$part->{is_submsg}) || $err) { # badly-encoded message with $err? tell the world about it! attach_link($ctx, $ct, $p, $fn, $err); $$rv .= "\n"; } - my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; foreach my $cur (@sections) { if ($cur =~ /\A>/) { # we use a here to allow users to specify -- cgit v1.2.3-24-ge0c7