about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--lib/PublicInbox/Eml.pm37
-rw-r--r--lib/PublicInbox/MsgIter.pm6
-rw-r--r--lib/PublicInbox/SearchIdx.pm47
-rw-r--r--lib/PublicInbox/View.pm30
-rw-r--r--t/eml.t28
-rw-r--r--t/psgi_attach.t9
-rw-r--r--t/search.t25
7 files changed, 154 insertions, 28 deletions
diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm
index ef401141..6f6874cd 100644
--- a/lib/PublicInbox/Eml.pm
+++ b/lib/PublicInbox/Eml.pm
@@ -60,6 +60,14 @@ my %DECODE_FULL = (
 our %STR_TYPE = (text => 1);
 our %STR_SUBTYPE = (plain => 1, html => 1);
 
+# message/* subtypes we descend into (lowercase keys; mp_descend lc()s the subtype before lookup)
+our %MESSAGE_DESCEND = (
+        news => 1, # RFC 1849 (obsolete, but archives are forever)
+        rfc822 => 1, # RFC 2046
+        rfc2822 => 1, # gmime handles this (but not rfc5322)
+        global => 1, # RFC 6532
+);
+
 my %re_memo;
 sub re_memo ($) {
         my ($k) = @_;
@@ -149,13 +157,25 @@ sub ct ($) {
 }
 
 # returns a queue of sub-parts iff it's worth descending into
-# TODO: descend into message/rfc822 parts (Email::MIME didn't)
 sub mp_descend ($$) {
         my ($self, $nr) = @_; # or $once for top-level
-        my $bnd = ct($self)->{attributes}->{boundary} // return; # single-part
+        my $ct = ct($self);
+        my $type = lc($ct->{type});
+        if ($type eq 'message' && $MESSAGE_DESCEND{lc($ct->{subtype})}) {
+                my $nxt = new(undef, body_raw($self));
+                $self->{-call_cb} = $nxt->{is_submsg} = 1;
+                return [ $nxt ];
+        }
+        return if $type ne 'multipart';
+        my $bnd = $ct->{attributes}->{boundary} // return; # single-part
         return if $bnd eq '' || length($bnd) >= $mime_boundary_length_limit;
         $bnd = quotemeta($bnd);
 
+        # this is a multipart message that didn't get descended into in
+        # public-inbox <= 1.5.0, so ensure we call the user callback for
+        # this part to not break PSGI downloads.
+        $self->{-call_cb} = $self->{is_submsg};
+
         # "multipart" messages can exist w/o a body
         my $bdy = ($nr ? delete($self->{bdy}) : \(body_raw($self))) or return;
 
@@ -189,14 +209,15 @@ sub mp_descend ($$) {
                 # compatibility with Email::MIME
                 $parts[-1] =~ s/\n\r?\n\z/\n/s if $epilogue_missing;
 
-                @parts = grep /[^ \t\r\n]/s, @parts; # ignore empty parts
+                # ignore empty parts
+                @parts = map { new_sub(undef, \$_) } grep /[^ \t\r\n]/s, @parts;
 
                 # Keep "From: someone..." from preamble in old,
                 # buggy versions of git-send-email, otherwise drop it
                 # There's also a case where quoted text showed up in the
                 # preamble
                 # <20060515162817.65F0F1BBAE@citi.umich.edu>
-                unshift(@parts, $pre) if $pre =~ /:/s;
+                unshift(@parts, new_sub(undef, \$pre)) if $pre =~ /:/s;
                 return \@parts;
         }
         # "multipart", but no boundary found, treat as single part
@@ -217,6 +238,9 @@ sub each_part {
         my ($self, $cb, $arg, $once) = @_;
         my $p = mp_descend($self, $once // 0) or
                                         return $cb->([$self, 0, 0], $arg);
+
+        $cb->([$self, 0, 0], $arg) if $self->{-call_cb}; # rare
+
         $p = [ $p, 0 ];
         my @s; # our virtual stack
         my $nr = 0;
@@ -226,11 +250,12 @@ sub each_part {
                 my (undef, @idx) = @$p;
                 @idx = (join('.', @idx));
                 my $depth = ($idx[0] =~ tr/././) + 1;
-                my $sub = new_sub(undef, \(shift @{$p->[0]}));
+                my $sub = shift @{$p->[0]};
                 if ($depth < $mime_nesting_limit &&
                                 (my $nxt = mp_descend($sub, $nr))) {
                         push(@s, $p) if scalar @{$p->[0]};
                         $p = [ $nxt, @idx, 0 ];
+                        $cb->([$sub, $depth, @idx], $arg) if $sub->{-call_cb};
                 } else { # a leaf node
                         $cb->([$sub, $depth, @idx], $arg);
                 }
@@ -270,7 +295,7 @@ sub subparts {
         if ($$bdy =~ /^--\Q$bnd\E--[ \t]*\r?\n(.+)\z/sm) {
                 $self->{epilogue} = $1;
         }
-        map { new_sub(undef, \$_) } @$parts;
+        @$parts;
 }
 
 sub parts_set {
diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm
index 7c28d019..5ec2a4d9 100644
--- a/lib/PublicInbox/MsgIter.pm
+++ b/lib/PublicInbox/MsgIter.pm
@@ -64,8 +64,12 @@ sub msg_part_text ($$) {
         # times when it should not have been:
         #   <87llgalspt.fsf@free.fr>
         #   <200308111450.h7BEoOu20077@mail.osdl.org>
+        # But also do not try this with ->{is_submsg} (message/rfc822),
+        # since a broken multipart/mixed inside a message/rfc822 part
+        # has not been seen in the wild, yet...
         if ($err && ($ct =~ m!\btext/\b!i ||
-                        $ct =~ m!\bmultipart/mixed\b!i)) {
+                        (!$part->{is_submsg} &&
+                                $ct =~ m!\bmultipart/mixed\b!i) ) ) {
                 my $cte = $part->header_raw('Content-Transfer-Encoding');
                 if (defined($cte) && $cte =~ /\b7bit\b/i) {
                         $s = $part->body;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 4bdd69f5..5f5ae895 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -284,6 +284,13 @@ sub index_xapian { # msg_iter callback
         if (defined $fn && $fn ne '') {
                 index_text($self, $fn, 1, 'XFN');
         }
+        if ($part->{is_submsg}) {
+                my $mids = mids_for_index($part);
+                index_ids($self, $doc, $part, $mids);
+                my $smsg = PublicInbox::Smsg->new($part);
+                index_users($self, $smsg);
+                index_text($self, $smsg->subject, 1, 'S') if $smsg->subject;
+        }
 
         my ($s, undef) = msg_part_text($part, $ct);
         defined $s or return;
@@ -307,6 +314,27 @@ sub index_xapian { # msg_iter callback
         }
 }
 
+sub index_ids ($$$$) { # index every Message-ID in @$mids and every List-Id header of $hdr into $doc
+        my ($self, $doc, $hdr, $mids) = @_;
+        for my $mid (@$mids) {
+                index_text($self, $mid, 1, 'XM'); # the full Message-ID, under the XM prefix
+
+                # because too many Message-IDs are prefixed with
+                # "Pine.LNX."...
+                if ($mid =~ /\w{12,}/) { # only IDs containing a run of 12+ word characters
+                        my @long = ($mid =~ /(\w{3,}+)/g); # every run of 3+ word characters
+                        index_text($self, join(' ', @long), 1, 'XM'); # indexed as one space-joined string
+                }
+        }
+        $doc->add_boolean_term('Q' . $_) for @$mids; # one 'Q'-prefixed boolean term per Message-ID
+        for my $l ($hdr->header_raw('List-Id')) {
+                $l =~ /<([^>]+)>/ or next; # skip header values without an angle-bracketed part
+                my $lid = $1;
+                $doc->add_boolean_term('G' . $lid); # 'G'-prefixed boolean term for the bracketed list ID
+                index_text($self, $lid, 1, 'XL'); # probabilistic
+        }
+}
+
 sub add_xapian ($$$$) {
         my ($self, $mime, $smsg, $mids) = @_;
         $smsg->{mime} = $mime; # XXX dangerous
@@ -321,22 +349,12 @@ sub add_xapian ($$$$) {
         add_val($doc, PublicInbox::Search::DT(), $dt);
 
         my $tg = term_generator($self);
-
         $tg->set_document($doc);
         index_text($self, $subj, 1, 'S') if $subj;
         index_users($self, $smsg);
 
         msg_iter($mime, \&index_xapian, [ $self, $doc ]);
-        foreach my $mid (@$mids) {
-                index_text($self, $mid, 1, 'XM');
-
-                # because too many Message-IDs are prefixed with
-                # "Pine.LNX."...
-                if ($mid =~ /\w{12,}/) {
-                        my @long = ($mid =~ /(\w{3,}+)/g);
-                        index_text($self, join(' ', @long), 1, 'XM');
-                }
-        }
+        index_ids($self, $doc, $hdr, $mids);
         $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP
         PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids);
         my $data = $smsg->to_doc_data;
@@ -351,13 +369,6 @@ sub add_xapian ($$$$) {
                         }
                 }
         }
-        $doc->add_boolean_term('Q' . $_) foreach @$mids;
-        for my $l ($hdr->header_raw('List-Id')) {
-                $l =~ /<([^>]+)>/ or next;
-                my $lid = $1;
-                $doc->add_boolean_term('G' . $lid);
-                index_text($self, $lid, 1, 'XL'); # probabilistic
-        }
         $self->{xdb}->replace_document($smsg->{num}, $doc);
 }
 
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 7264f4b6..93a5b329 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -17,6 +17,7 @@ use PublicInbox::Address;
 use PublicInbox::WwwStream;
 use PublicInbox::Reply;
 use PublicInbox::ViewDiff qw(flush_diff);
+use PublicInbox::Eml;
 use POSIX qw(strftime);
 use Time::Local qw(timegm);
 use PublicInbox::Smsg qw(subject_normalized);
@@ -480,6 +481,21 @@ sub multipart_text_as_html {
         $_[0]->each_part(\&add_text_body, $_[1], 1);
 }
 
+sub submsg_hdr ($$) { # append selected headers of $eml, HTML-escaped, to the buffer referenced by $ctx->{obuf}
+        my ($ctx, $eml) = @_;
+        my $obfs_ibx = $ctx->{-obfs_ibx}; # address obfuscation is skipped when this is false
+        my $rv = $ctx->{obuf};
+        $$rv .= "\n"; # blank line ahead of the header block
+        for my $h (qw(From To Cc Subject Date Message-ID X-Alt-Message-ID)) {
+                my @v = $eml->header($h); # list context: every value returned for $h
+                for my $v (@v) {
+                        obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; # NOTE(review): runs before ascii_html(); confirm its output survives escaping
+                        $v = ascii_html($v);
+                        $$rv .= "$h: $v\n"; # one "Name: value" line per value
+                }
+        }
+}
+
 sub attach_link ($$$$;$) {
         my ($ctx, $ct, $p, $fn, $err) = @_;
         my ($part, $depth, $idx) = @$p;
@@ -511,6 +527,9 @@ EOF
         $desc = ascii_html($desc);
         $$rv .= ($desc eq '') ? "$ts --]" : "$desc --]\n[-- $ts --]";
         $$rv .= "</a>\n";
+
+        submsg_hdr($ctx, $part) if $part->{is_submsg};
+
         undef;
 }
 
@@ -518,6 +537,7 @@ sub add_text_body { # callback for each_part
         my ($p, $ctx) = @_;
         my $upfx = $ctx->{mhref};
         my $ibx = $ctx->{-inbox};
+        my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
         # $p - from each_part: [ Email::MIME-like, depth, $idx ]
         my ($part, $depth, $idx) = @$p;
         my $ct = $part->content_type || 'text/plain';
@@ -525,6 +545,12 @@ sub add_text_body { # callback for each_part
         my ($s, $err) = msg_part_text($part, $ct);
         return attach_link($ctx, $ct, $p, $fn) unless defined $s;
 
+        my $rv = $ctx->{obuf};
+        if ($part->{is_submsg}) {
+                submsg_hdr($ctx, $part);
+                $$rv .= "\n";
+        }
+
         # makes no difference to browsers, and don't screw up filename
         # link generation in diffs with the extra '%0D'
         $s =~ s/\r\n/\n/sg;
@@ -571,13 +597,11 @@ sub add_text_body { # callback for each_part
         # split off quoted and unquoted blocks:
         my @sections = PublicInbox::MsgIter::split_quotes($s);
         undef $s; # free memory
-        my $rv = $ctx->{obuf};
-        if (defined($fn) || $depth > 0 || $err) {
+        if (defined($fn) || ($depth > 0 && !$part->{is_submsg}) || $err) {
                 # badly-encoded message with $err? tell the world about it!
                 attach_link($ctx, $ct, $p, $fn, $err);
                 $$rv .= "\n";
         }
-        my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
         foreach my $cur (@sections) {
                 if ($cur =~ /\A>/) {
                         # we use a <span> here to allow users to specify
diff --git a/t/eml.t b/t/eml.t
index c91deb3a..b7f58ac7 100644
--- a/t/eml.t
+++ b/t/eml.t
@@ -117,6 +117,34 @@ EOF
                 '', 'each_part can clobber body');
 }
 
+if ('descend into message/rfc822') { # a non-empty string is always true; it only labels this block
+        my $eml = eml_load 't/data/message_embed.eml';
+        my @parts;
+        $eml->each_part(sub {
+                my ($part, $level, @ex) = @{$_[0]}; # first callback argument is an arrayref: part, depth, then the dotted index
+                push @parts, [ $part, $level, @ex ];
+        });
+        is(scalar(@parts), 6, 'got all parts');
+        like($parts[0]->[0]->body, qr/^testing embedded message harder\n/sm,
+                'first part found');
+        is_deeply([ @{$parts[0]}[1..2] ], [ 1, '1' ],
+                'got expected depth and level for part #0');
+        is($parts[1]->[0]->filename, 'embed2x.eml',
+                'attachment filename found');
+        is_deeply([ @{$parts[1]}[1..2] ], [ 1, '2' ],
+                'got expected depth and level for part #1');
+        is_deeply([ @{$parts[2]}[1..2] ], [ 2, '2.1' ],
+                'got expected depth and level for part #2');
+        is_deeply([ @{$parts[3]}[1..2] ], [ 3, '2.1.1' ],
+                'got expected depth and level for part #3');
+        is_deeply([ @{$parts[4]}[1..2] ], [ 3, '2.1.2' ],
+                'got expected depth and level for part #4');
+        is($parts[4]->[0]->filename, 'test.eml',
+                'another attachment filename found');
+        is_deeply([ @{$parts[5]}[1..2] ], [ 4, '2.1.2.1' ],
+                'got expected depth and level for part #5');
+}
+
 # body-less, boundary-less
 for my $cls (@classes) {
         my $call = 0;
diff --git a/t/psgi_attach.t b/t/psgi_attach.t
index 12f9e6ee..c6f8072f 100644
--- a/t/psgi_attach.t
+++ b/t/psgi_attach.t
@@ -75,6 +75,9 @@ $im->init_bare;
                 $res = $cb->(GET("/test/$mid/"));
                 like($res->content, qr/\bhref="2-embed2x\.eml"/s,
                         'href to message/rfc822 attachment visible');
+                like($res->content, qr/\bhref="2\.1\.2-test\.eml"/s,
+                        'href to nested message/rfc822 attachment visible');
+
                 $res = $cb->(GET("/test/$mid/2-embed2x.eml"));
                 my $eml = PublicInbox::Eml->new(\($res->content));
                 is_deeply([ $eml->header_raw('Message-ID') ], [ "<$irt>" ],
@@ -85,6 +88,12 @@ $im->init_bare;
                         '1st attachment is as expected');
                 is($subs[1]->header('Content-Type'), 'message/rfc822',
                         '2nd attachment is as expected');
+
+                $res = $cb->(GET("/test/$mid/2.1.2-test.eml"));
+                $eml = PublicInbox::Eml->new(\($res->content));
+                is_deeply([ $eml->header_raw('Message-ID') ],
+                        [ '<20200418214114.7575-1-e@yhbt.net>' ],
+                        'nested eml retrieved');
         });
 }
 done_testing();
diff --git a/t/search.t b/t/search.t
index 6dd50474..9d74f5e0 100644
--- a/t/search.t
+++ b/t/search.t
@@ -479,6 +479,31 @@ EOF
         is_deeply($found, [], 'matched on phrase with l:');
 }
 
+$ibx->with_umask(sub { # index t/data/message_embed.eml, then search for content found inside it
+        $rw_commit->();
+        my $doc_id = $rw->add_message(eml_load('t/data/message_embed.eml'));
+        ok($doc_id > 0, 'messages within messages');
+        $rw->commit_txn_lazy;
+        $ro->reopen; # presumably so the read-only handle sees the commit above; confirm
+        my $n_test_eml = $ro->query('n:test.eml');
+        is(scalar(@$n_test_eml), 1, 'got a result');
+        my $n_embed2x_eml = $ro->query('n:embed2x.eml');
+        is_deeply($n_test_eml, $n_embed2x_eml, '.eml filenames searchable'); # both filename queries must return the same result list
+        for my $m (qw(20200418222508.GA13918@dcvr 20200418222020.GA2745@dcvr
+                        20200418214114.7575-1-e@yhbt.net)) {
+                is($ro->query("m:$m")->[0]->{mid}, # whichever ID is queried, the first hit must carry the first ID in the list
+                        '20200418222508.GA13918@dcvr', 'probabilistic m:'.$m);
+                is($ro->query("mid:$m")->[0]->{mid},
+                        '20200418222508.GA13918@dcvr', 'boolean mid:'.$m);
+        }
+        is($ro->query('dfpost:4dc62c50')->[0]->{mid},
+                '20200418222508.GA13918@dcvr',
+                'diff search reaches inside message/rfc822');
+        is($ro->query('s:"mail header experiments"')->[0]->{mid},
+                '20200418222508.GA13918@dcvr',
+                'Subject search reaches inside message/rfc822');
+});
+
 done_testing();
 
 1;