diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/PublicInbox/Eml.pm | 37 | ||||
-rw-r--r-- | lib/PublicInbox/MsgIter.pm | 6 | ||||
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 47 | ||||
-rw-r--r-- | lib/PublicInbox/View.pm | 30 |
4 files changed, 92 insertions, 28 deletions
diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index ef401141..6f6874cd 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -60,6 +60,14 @@ my %DECODE_FULL = ( our %STR_TYPE = (text => 1); our %STR_SUBTYPE = (plain => 1, html => 1); +# message/* subtypes we descend into +our %MESSAGE_DESCEND = ( + news => 1, # RFC 1849 (obsolete, but archives are forever) + rfc822 => 1, # RFC 2046 + rfc2822 => 1, # gmime handles this (but not rfc5322) + global => 1, # RFC 6532 +); + my %re_memo; sub re_memo ($) { my ($k) = @_; @@ -149,13 +157,25 @@ sub ct ($) { } # returns a queue of sub-parts iff it's worth descending into -# TODO: descend into message/rfc822 parts (Email::MIME didn't) sub mp_descend ($$) { my ($self, $nr) = @_; # or $once for top-level - my $bnd = ct($self)->{attributes}->{boundary} // return; # single-part + my $ct = ct($self); + my $type = lc($ct->{type}); + if ($type eq 'message' && $MESSAGE_DESCEND{lc($ct->{subtype})}) { + my $nxt = new(undef, body_raw($self)); + $self->{-call_cb} = $nxt->{is_submsg} = 1; + return [ $nxt ]; + } + return if $type ne 'multipart'; + my $bnd = $ct->{attributes}->{boundary} // return; # single-part return if $bnd eq '' || length($bnd) >= $mime_boundary_length_limit; $bnd = quotemeta($bnd); + # this is a multipart message that didn't get descended into in + # public-inbox <= 1.5.0, so ensure we call the user callback for + # this part to not break PSGI downloads. + $self->{-call_cb} = $self->{is_submsg}; + # "multipart" messages can exist w/o a body my $bdy = ($nr ? delete($self->{bdy}) : \(body_raw($self))) or return; @@ -189,14 +209,15 @@ sub mp_descend ($$) { # compatibility with Email::MIME $parts[-1] =~ s/\n\r?\n\z/\n/s if $epilogue_missing; - @parts = grep /[^ \t\r\n]/s, @parts; # ignore empty parts + # ignore empty parts + @parts = map { new_sub(undef, \$_) } grep /[^ \t\r\n]/s, @parts; # Keep "From: someone..." from preamble in old, # buggy versions of git-send-email, otherwise drop it # There's also a case where quoted text showed up in the # preamble # <20060515162817.65F0F1BBAE@citi.umich.edu> - unshift(@parts, $pre) if $pre =~ /:/s; + unshift(@parts, new_sub(undef, \$pre)) if $pre =~ /:/s; return \@parts; } # "multipart", but no boundary found, treat as single part @@ -217,6 +238,9 @@ sub each_part { my ($self, $cb, $arg, $once) = @_; my $p = mp_descend($self, $once // 0) or return $cb->([$self, 0, 0], $arg); + + $cb->([$self, 0, 0], $arg) if $self->{-call_cb}; # rare + $p = [ $p, 0 ]; my @s; # our virtual stack my $nr = 0; @@ -226,11 +250,12 @@ sub each_part { my (undef, @idx) = @$p; @idx = (join('.', @idx)); my $depth = ($idx[0] =~ tr/././) + 1; - my $sub = new_sub(undef, \(shift @{$p->[0]})); + my $sub = shift @{$p->[0]}; if ($depth < $mime_nesting_limit && (my $nxt = mp_descend($sub, $nr))) { push(@s, $p) if scalar @{$p->[0]}; $p = [ $nxt, @idx, 0 ]; + $cb->([$sub, $depth, @idx], $arg) if $sub->{-call_cb}; } else { # a leaf node $cb->([$sub, $depth, @idx], $arg); } @@ -270,7 +295,7 @@ sub subparts { if ($$bdy =~ /^--\Q$bnd\E--[ \t]*\r?\n(.+)\z/sm) { $self->{epilogue} = $1; } - map { new_sub(undef, \$_) } @$parts; + @$parts; } sub parts_set { diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm index 7c28d019..5ec2a4d9 100644 --- a/lib/PublicInbox/MsgIter.pm +++ b/lib/PublicInbox/MsgIter.pm @@ -64,8 +64,12 @@ sub msg_part_text ($$) { # times when it should not have been: # <87llgalspt.fsf@free.fr> # <200308111450.h7BEoOu20077@mail.osdl.org> + # But also do not try this with ->{is_submsg} (message/rfc822), + # since a broken multipart/mixed inside a message/rfc822 part + # has not been seen in the wild, yet... if ($err && ($ct =~ m!\btext/\b!i || - $ct =~ m!\bmultipart/mixed\b!i)) { + (!$part->{is_submsg} && + $ct =~ m!\bmultipart/mixed\b!i) ) ) { my $cte = $part->header_raw('Content-Transfer-Encoding'); if (defined($cte) && $cte =~ /\b7bit\b/i) { $s = $part->body; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 4bdd69f5..5f5ae895 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -284,6 +284,13 @@ sub index_xapian { # msg_iter callback if (defined $fn && $fn ne '') { index_text($self, $fn, 1, 'XFN'); } + if ($part->{is_submsg}) { + my $mids = mids_for_index($part); + index_ids($self, $doc, $part, $mids); + my $smsg = PublicInbox::Smsg->new($part); + index_users($self, $smsg); + index_text($self, $smsg->subject, 1, 'S') if $smsg->subject; + } my ($s, undef) = msg_part_text($part, $ct); defined $s or return; @@ -307,6 +314,27 @@ sub index_xapian { # msg_iter callback } } +sub index_ids ($$$$) { + my ($self, $doc, $hdr, $mids) = @_; + for my $mid (@$mids) { + index_text($self, $mid, 1, 'XM'); + + # because too many Message-IDs are prefixed with + # "Pine.LNX."... + if ($mid =~ /\w{12,}/) { + my @long = ($mid =~ /(\w{3,}+)/g); + index_text($self, join(' ', @long), 1, 'XM'); + } + } + $doc->add_boolean_term('Q' . $_) for @$mids; + for my $l ($hdr->header_raw('List-Id')) { + $l =~ /<([^>]+)>/ or next; + my $lid = $1; + $doc->add_boolean_term('G' . $lid); + index_text($self, $lid, 1, 'XL'); # probabilistic + } +} + sub add_xapian ($$$$) { my ($self, $mime, $smsg, $mids) = @_; $smsg->{mime} = $mime; # XXX dangerous @@ -321,22 +349,12 @@ sub add_xapian ($$$$) { add_val($doc, PublicInbox::Search::DT(), $dt); my $tg = term_generator($self); - $tg->set_document($doc); index_text($self, $subj, 1, 'S') if $subj; index_users($self, $smsg); msg_iter($mime, \&index_xapian, [ $self, $doc ]); - foreach my $mid (@$mids) { - index_text($self, $mid, 1, 'XM'); - - # because too many Message-IDs are prefixed with - # "Pine.LNX."... - if ($mid =~ /\w{12,}/) { - my @long = ($mid =~ /(\w{3,}+)/g); - index_text($self, join(' ', @long), 1, 'XM'); - } - } + index_ids($self, $doc, $hdr, $mids); $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids); my $data = $smsg->to_doc_data; @@ -351,13 +369,6 @@ sub add_xapian ($$$$) { } } } - $doc->add_boolean_term('Q' . $_) foreach @$mids; - for my $l ($hdr->header_raw('List-Id')) { - $l =~ /<([^>]+)>/ or next; - my $lid = $1; - $doc->add_boolean_term('G' . $lid); - index_text($self, $lid, 1, 'XL'); # probabilistic - } $self->{xdb}->replace_document($smsg->{num}, $doc); } diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 7264f4b6..93a5b329 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -17,6 +17,7 @@ use PublicInbox::Address; use PublicInbox::WwwStream; use PublicInbox::Reply; use PublicInbox::ViewDiff qw(flush_diff); +use PublicInbox::Eml; use POSIX qw(strftime); use Time::Local qw(timegm); use PublicInbox::Smsg qw(subject_normalized); @@ -480,6 +481,21 @@ sub multipart_text_as_html { $_[0]->each_part(\&add_text_body, $_[1], 1); } +sub submsg_hdr ($$) { + my ($ctx, $eml) = @_; + my $obfs_ibx = $ctx->{-obfs_ibx}; + my $rv = $ctx->{obuf}; + $$rv .= "\n"; + for my $h (qw(From To Cc Subject Date Message-ID X-Alt-Message-ID)) { + my @v = $eml->header($h); + for my $v (@v) { + obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; + $v = ascii_html($v); + $$rv .= "$h: $v\n"; + } + } +} + sub attach_link ($$$$;$) { my ($ctx, $ct, $p, $fn, $err) = @_; my ($part, $depth, $idx) = @$p; @@ -511,6 +527,9 @@ EOF $desc = ascii_html($desc); $$rv .= ($desc eq '') ? "$ts --]" : "$desc --]\n[-- $ts --]"; $$rv .= "</a>\n"; + + submsg_hdr($ctx, $part) if $part->{is_submsg}; + undef; } @@ -518,6 +537,7 @@ sub add_text_body { # callback for each_part my ($p, $ctx) = @_; my $upfx = $ctx->{mhref}; my $ibx = $ctx->{-inbox}; + my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; # $p - from each_part: [ Email::MIME-like, depth, $idx ] my ($part, $depth, $idx) = @$p; my $ct = $part->content_type || 'text/plain'; @@ -525,6 +545,12 @@ sub add_text_body { # callback for each_part my ($s, $err) = msg_part_text($part, $ct); return attach_link($ctx, $ct, $p, $fn) unless defined $s; + my $rv = $ctx->{obuf}; + if ($part->{is_submsg}) { + submsg_hdr($ctx, $part); + $$rv .= "\n"; + } + # makes no difference to browsers, and don't screw up filename # link generation in diffs with the extra '%0D' $s =~ s/\r\n/\n/sg; @@ -571,13 +597,11 @@ sub add_text_body { # callback for each_part # split off quoted and unquoted blocks: my @sections = PublicInbox::MsgIter::split_quotes($s); undef $s; # free memory - my $rv = $ctx->{obuf}; - if (defined($fn) || $depth > 0 || $err) { + if (defined($fn) || ($depth > 0 && !$part->{is_submsg}) || $err) { # badly-encoded message with $err? tell the world about it! attach_link($ctx, $ct, $p, $fn, $err); $$rv .= "\n"; } - my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; foreach my $cur (@sections) { if ($cur =~ /\A>/) { # we use a <span> here to allow users to specify |