about summary refs log tree commit homepage
path: root/lib/PublicInbox
diff options
context:
space:
mode:
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r-- lib/PublicInbox/Eml.pm 37
-rw-r--r-- lib/PublicInbox/MsgIter.pm 6
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 47
-rw-r--r-- lib/PublicInbox/View.pm 30
4 files changed, 92 insertions, 28 deletions
diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm
index ef401141..6f6874cd 100644
--- a/lib/PublicInbox/Eml.pm
+++ b/lib/PublicInbox/Eml.pm
@@ -60,6 +60,14 @@ my %DECODE_FULL = (
 our %STR_TYPE = (text => 1);
 our %STR_SUBTYPE = (plain => 1, html => 1);
 
+# message/* subtypes we descend into
+our %MESSAGE_DESCEND = (
+        news => 1, # RFC 1849 (obsolete, but archives are forever)
+        rfc822 => 1, # RFC 2046
+        rfc2822 => 1, # gmime handles this (but not rfc5322)
+        global => 1, # RFC 6532
+);
+
 my %re_memo;
 sub re_memo ($) {
         my ($k) = @_;
@@ -149,13 +157,25 @@ sub ct ($) {
 }
 
 # returns a queue of sub-parts iff it's worth descending into
-# TODO: descend into message/rfc822 parts (Email::MIME didn't)
 sub mp_descend ($$) {
         my ($self, $nr) = @_; # or $once for top-level
-        my $bnd = ct($self)->{attributes}->{boundary} // return; # single-part
+        my $ct = ct($self);
+        my $type = lc($ct->{type});
+        if ($type eq 'message' && $MESSAGE_DESCEND{lc($ct->{subtype})}) {
+                my $nxt = new(undef, body_raw($self));
+                $self->{-call_cb} = $nxt->{is_submsg} = 1;
+                return [ $nxt ];
+        }
+        return if $type ne 'multipart';
+        my $bnd = $ct->{attributes}->{boundary} // return; # single-part
         return if $bnd eq '' || length($bnd) >= $mime_boundary_length_limit;
         $bnd = quotemeta($bnd);
 
+        # this is a multipart message that didn't get descended into in
+        # public-inbox <= 1.5.0, so ensure we call the user callback for
+        # this part to not break PSGI downloads.
+        $self->{-call_cb} = $self->{is_submsg};
+
         # "multipart" messages can exist w/o a body
         my $bdy = ($nr ? delete($self->{bdy}) : \(body_raw($self))) or return;
 
@@ -189,14 +209,15 @@ sub mp_descend ($$) {
                 # compatibility with Email::MIME
                 $parts[-1] =~ s/\n\r?\n\z/\n/s if $epilogue_missing;
 
-                @parts = grep /[^ \t\r\n]/s, @parts; # ignore empty parts
+                # ignore empty parts
+                @parts = map { new_sub(undef, \$_) } grep /[^ \t\r\n]/s, @parts;
 
                 # Keep "From: someone..." from preamble in old,
                 # buggy versions of git-send-email, otherwise drop it
                 # There's also a case where quoted text showed up in the
                 # preamble
                 # <20060515162817.65F0F1BBAE@citi.umich.edu>
-                unshift(@parts, $pre) if $pre =~ /:/s;
+                unshift(@parts, new_sub(undef, \$pre)) if $pre =~ /:/s;
                 return \@parts;
         }
         # "multipart", but no boundary found, treat as single part
@@ -217,6 +238,9 @@ sub each_part {
         my ($self, $cb, $arg, $once) = @_;
         my $p = mp_descend($self, $once // 0) or
                                         return $cb->([$self, 0, 0], $arg);
+
+        $cb->([$self, 0, 0], $arg) if $self->{-call_cb}; # rare
+
         $p = [ $p, 0 ];
         my @s; # our virtual stack
         my $nr = 0;
@@ -226,11 +250,12 @@ sub each_part {
                 my (undef, @idx) = @$p;
                 @idx = (join('.', @idx));
                 my $depth = ($idx[0] =~ tr/././) + 1;
-                my $sub = new_sub(undef, \(shift @{$p->[0]}));
+                my $sub = shift @{$p->[0]};
                 if ($depth < $mime_nesting_limit &&
                                 (my $nxt = mp_descend($sub, $nr))) {
                         push(@s, $p) if scalar @{$p->[0]};
                         $p = [ $nxt, @idx, 0 ];
+                        $cb->([$sub, $depth, @idx], $arg) if $sub->{-call_cb};
                 } else { # a leaf node
                         $cb->([$sub, $depth, @idx], $arg);
                 }
@@ -270,7 +295,7 @@ sub subparts {
         if ($$bdy =~ /^--\Q$bnd\E--[ \t]*\r?\n(.+)\z/sm) {
                 $self->{epilogue} = $1;
         }
-        map { new_sub(undef, \$_) } @$parts;
+        @$parts;
 }
 
 sub parts_set {
diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm
index 7c28d019..5ec2a4d9 100644
--- a/lib/PublicInbox/MsgIter.pm
+++ b/lib/PublicInbox/MsgIter.pm
@@ -64,8 +64,12 @@ sub msg_part_text ($$) {
         # times when it should not have been:
         #   <87llgalspt.fsf@free.fr>
         #   <200308111450.h7BEoOu20077@mail.osdl.org>
+        # But also do not try this with ->{is_submsg} (message/rfc822),
+        # since a broken multipart/mixed inside a message/rfc822 part
+        # has not been seen in the wild, yet...
         if ($err && ($ct =~ m!\btext/\b!i ||
-                        $ct =~ m!\bmultipart/mixed\b!i)) {
+                        (!$part->{is_submsg} &&
+                                $ct =~ m!\bmultipart/mixed\b!i) ) ) {
                 my $cte = $part->header_raw('Content-Transfer-Encoding');
                 if (defined($cte) && $cte =~ /\b7bit\b/i) {
                         $s = $part->body;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 4bdd69f5..5f5ae895 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -284,6 +284,13 @@ sub index_xapian { # msg_iter callback
         if (defined $fn && $fn ne '') {
                 index_text($self, $fn, 1, 'XFN');
         }
+        if ($part->{is_submsg}) {
+                my $mids = mids_for_index($part);
+                index_ids($self, $doc, $part, $mids);
+                my $smsg = PublicInbox::Smsg->new($part);
+                index_users($self, $smsg);
+                index_text($self, $smsg->subject, 1, 'S') if $smsg->subject;
+        }
 
         my ($s, undef) = msg_part_text($part, $ct);
         defined $s or return;
@@ -307,6 +314,27 @@ sub index_xapian { # msg_iter callback
         }
 }
 
+sub index_ids ($$$$) {
+        my ($self, $doc, $hdr, $mids) = @_;
+        for my $mid (@$mids) {
+                index_text($self, $mid, 1, 'XM');
+
+                # because too many Message-IDs are prefixed with
+                # "Pine.LNX."...
+                if ($mid =~ /\w{12,}/) {
+                        my @long = ($mid =~ /(\w{3,}+)/g);
+                        index_text($self, join(' ', @long), 1, 'XM');
+                }
+        }
+        $doc->add_boolean_term('Q' . $_) for @$mids;
+        for my $l ($hdr->header_raw('List-Id')) {
+                $l =~ /<([^>]+)>/ or next;
+                my $lid = $1;
+                $doc->add_boolean_term('G' . $lid);
+                index_text($self, $lid, 1, 'XL'); # probabilistic
+        }
+}
+
 sub add_xapian ($$$$) {
         my ($self, $mime, $smsg, $mids) = @_;
         $smsg->{mime} = $mime; # XXX dangerous
@@ -321,22 +349,12 @@ sub add_xapian ($$$$) {
         add_val($doc, PublicInbox::Search::DT(), $dt);
 
         my $tg = term_generator($self);
-
         $tg->set_document($doc);
         index_text($self, $subj, 1, 'S') if $subj;
         index_users($self, $smsg);
 
         msg_iter($mime, \&index_xapian, [ $self, $doc ]);
-        foreach my $mid (@$mids) {
-                index_text($self, $mid, 1, 'XM');
-
-                # because too many Message-IDs are prefixed with
-                # "Pine.LNX."...
-                if ($mid =~ /\w{12,}/) {
-                        my @long = ($mid =~ /(\w{3,}+)/g);
-                        index_text($self, join(' ', @long), 1, 'XM');
-                }
-        }
+        index_ids($self, $doc, $hdr, $mids);
         $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP
         PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids);
         my $data = $smsg->to_doc_data;
@@ -351,13 +369,6 @@ sub add_xapian ($$$$) {
                         }
                 }
         }
-        $doc->add_boolean_term('Q' . $_) foreach @$mids;
-        for my $l ($hdr->header_raw('List-Id')) {
-                $l =~ /<([^>]+)>/ or next;
-                my $lid = $1;
-                $doc->add_boolean_term('G' . $lid);
-                index_text($self, $lid, 1, 'XL'); # probabilistic
-        }
         $self->{xdb}->replace_document($smsg->{num}, $doc);
 }
 
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 7264f4b6..93a5b329 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -17,6 +17,7 @@ use PublicInbox::Address;
 use PublicInbox::WwwStream;
 use PublicInbox::Reply;
 use PublicInbox::ViewDiff qw(flush_diff);
+use PublicInbox::Eml;
 use POSIX qw(strftime);
 use Time::Local qw(timegm);
 use PublicInbox::Smsg qw(subject_normalized);
@@ -480,6 +481,21 @@ sub multipart_text_as_html {
         $_[0]->each_part(\&add_text_body, $_[1], 1);
 }
 
+sub submsg_hdr ($$) {
+        my ($ctx, $eml) = @_;
+        my $obfs_ibx = $ctx->{-obfs_ibx};
+        my $rv = $ctx->{obuf};
+        $$rv .= "\n";
+        for my $h (qw(From To Cc Subject Date Message-ID X-Alt-Message-ID)) {
+                my @v = $eml->header($h);
+                for my $v (@v) {
+                        obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx;
+                        $v = ascii_html($v);
+                        $$rv .= "$h: $v\n";
+                }
+        }
+}
+
 sub attach_link ($$$$;$) {
         my ($ctx, $ct, $p, $fn, $err) = @_;
         my ($part, $depth, $idx) = @$p;
@@ -511,6 +527,9 @@ EOF
         $desc = ascii_html($desc);
         $$rv .= ($desc eq '') ? "$ts --]" : "$desc --]\n[-- $ts --]";
         $$rv .= "</a>\n";
+
+        submsg_hdr($ctx, $part) if $part->{is_submsg};
+
         undef;
 }
 
@@ -518,6 +537,7 @@ sub add_text_body { # callback for each_part
         my ($p, $ctx) = @_;
         my $upfx = $ctx->{mhref};
         my $ibx = $ctx->{-inbox};
+        my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
         # $p - from each_part: [ Email::MIME-like, depth, $idx ]
         my ($part, $depth, $idx) = @$p;
         my $ct = $part->content_type || 'text/plain';
@@ -525,6 +545,12 @@ sub add_text_body { # callback for each_part
         my ($s, $err) = msg_part_text($part, $ct);
         return attach_link($ctx, $ct, $p, $fn) unless defined $s;
 
+        my $rv = $ctx->{obuf};
+        if ($part->{is_submsg}) {
+                submsg_hdr($ctx, $part);
+                $$rv .= "\n";
+        }
+
         # makes no difference to browsers, and don't screw up filename
         # link generation in diffs with the extra '%0D'
         $s =~ s/\r\n/\n/sg;
@@ -571,13 +597,11 @@ sub add_text_body { # callback for each_part
         # split off quoted and unquoted blocks:
         my @sections = PublicInbox::MsgIter::split_quotes($s);
         undef $s; # free memory
-        my $rv = $ctx->{obuf};
-        if (defined($fn) || $depth > 0 || $err) {
+        if (defined($fn) || ($depth > 0 && !$part->{is_submsg}) || $err) {
                 # badly-encoded message with $err? tell the world about it!
                 attach_link($ctx, $ct, $p, $fn, $err);
                 $$rv .= "\n";
         }
-        my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
         foreach my $cur (@sections) {
                 if ($cur =~ /\A>/) {
                         # we use a <span> here to allow users to specify