From 0b4b51e8ba8cf62c8da9312666d4866ff2403d6e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 28 Mar 2014 08:22:45 +0000 Subject: filter: use file(1) to detect mime type if octet-stream Some mailers do not correctly detect/set the Content-Type header; so attempt to keep messages based on our server-detected MIME type if application/octet-stream was specified. --- lib/PublicInbox/Filter.pm | 36 +++++++++++++++++++++++++++++++++--- t/filter.t | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm index a83ecc83..64c31e8e 100644 --- a/lib/PublicInbox/Filter.pm +++ b/lib/PublicInbox/Filter.pm @@ -15,6 +15,8 @@ our $VERSION = '0.0.1'; # start with the same defaults as mailman our $BAD_EXT = qr/\.(?:exe|bat|cmd|com|pif|scr|vbs|cpl)\z/i; +our $MIME_HTML = qr!\btext/html\b!i; +our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i; # this is highly opinionated delivery # returns 0 only if there is nothing to deliver @@ -35,7 +37,7 @@ sub run { if ($content_type =~ m!\btext/plain\b!i) { return 1; # yay, nothing to do - } elsif ($content_type =~ m!\btext/html\b!i) { + } elsif ($content_type =~ $MIME_HTML) { # HTML-only, non-multipart my $body = $simple->body; my $ct_parsed = parse_content_type($content_type); @@ -129,14 +131,23 @@ sub strip_multipart { my $part_type = $part->content_type; if ($part_type =~ m!\btext/plain\b!i) { push @keep, $part; - } elsif ($part_type =~ m!\btext/html\b!i) { + } elsif ($part_type =~ $MIME_HTML) { push @html, $part; - } elsif ($part_type =~ m!\btext/[a-z0-9\+\._-]+\b!i) { + } elsif ($part_type =~ $MIME_TEXT_ANY) { # Give other text attachments the benefit of the doubt, # here? Could be source code or script the user wants # help with. push @keep, $part; + } elsif ($part_type =~ m!\Aapplication/octet-stream\z!i) { + # unfortunately, some mailers don't set correct types, + # let messages of unknown type through but do not + # change the sender-specified type + if (recheck_type_ok($part)) { + push @keep, $part; + } else { + $rejected++; + } } else { # reject everything else # @@ -216,4 +227,23 @@ sub replace_body { mark_changed($simple); } +# run the file(1) command to detect mime type +# Not using File::MMagic for now since that requires extra configuration +# Note: we do not rewrite the message with the detected mime type +sub recheck_type_ok { + my ($part) = @_; + my $cmd = "file --mime-type -b -"; + my $pid = open2(my $out, my $in, $cmd); + print $in $part->body; + close $in; + my $type = eval { + local $/; + <$out>; + }; + waitpid($pid, 0); + chomp $type; + + (($type =~ $MIME_TEXT_ANY) && ($type !~ $MIME_HTML)) +} + 1; diff --git a/t/filter.t b/t/filter.t index 12f4ed6f..0aa26a5f 100644 --- a/t/filter.t +++ b/t/filter.t @@ -278,5 +278,40 @@ sub count_body_parts { is(undef, $f->simple->header("Mail-Followup-To"), "mft stripped"); } +# multi-part with application/octet-stream +{ + my $os = 'application/octet-stream'; + my $parts = [ + Email::MIME->create( + attributes => { content_type => $os }, + body => < +int main(void) +{ + printf("Hello world\\n"); + return 0; +} +EOF + ), + Email::MIME->create( + attributes => { + filename => 'zero.data', + encoding => 'base64', + content_type => $os, + }, + body => ("\0" x 4096), + ) + ]; + my $email = Email::MIME->create( + header_str => [ From => 'a@example.com', Subject => 'blah' ], + parts => $parts, + ); + my $f = Email::Filter->new(data => $email->as_string); + is(1, PublicInbox::Filter->run($f->simple), "run was a success"); + my $parsed = Email::MIME->new($f->simple->as_string); + is(scalar $parsed->parts, 1, "only one remaining part"); + like($f->simple->header("X-Content-Filtered-By"), + qr/PublicInbox::Filter/, "XCFB header added"); +} done_testing(); -- cgit v1.2.3-24-ge0c7