From 975a1b174d40fd957ed869e9ff68f3c6dd6f37df Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 15 Jun 2016 00:14:29 +0000 Subject: mda: hook up new filter functionality This removes the Email::Filter dependency as well as the signature-breaking scrubber code. We now prefer to reject unacceptable messages and grudgingly (and blindly) mirror messages we're not the primary endpoint for. --- lib/PublicInbox/Filter.pm | 232 ---------------------------------------------- 1 file changed, 232 deletions(-) delete mode 100644 lib/PublicInbox/Filter.pm (limited to 'lib') diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm deleted file mode 100644 index 8b78a441..00000000 --- a/lib/PublicInbox/Filter.pm +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright (C) 2013-2015 all contributors -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -# -# Used to filter incoming mail for -mda and importers -# This only exposes one function: run -# Note: the settings here are highly opinionated. Obviously, this is -# Free Software (AGPLv3), so you may change it if you host yourself. -package PublicInbox::Filter; -use strict; -use warnings; -use Email::MIME; -use Email::MIME::ContentType qw/parse_content_type/; -use Email::Filter; -use IPC::Run; -our $VERSION = '0.0.1'; -use constant NO_HTML => '*** We only accept plain-text email, no HTML ***'; -use constant TEXT_ONLY => '*** We only accept plain-text email ***'; - -# start with the same defaults as mailman -our $BAD_EXT = qr/\.(exe|bat|cmd|com|pif|scr|vbs|cpl|zip)\s*\z/i; -our $MIME_HTML = qr!\btext/x?html\b!i; -our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i; - -# this is highly opinionated delivery -# returns 0 only if there is nothing to deliver -sub run { - my ($class, $mime, $filter) = @_; - - my $content_type = $mime->header('Content-Type') || 'text/plain'; - - if ($content_type =~ m!\btext/plain\b!i) { - return 1; # yay, nothing to do - } elsif ($content_type =~ $MIME_HTML) { - $filter->reject(NO_HTML) if $filter; - # HTML-only, non-multipart - my $body = $mime->body; - my $ct_parsed = parse_content_type($content_type); - dump_html(\$body, $ct_parsed->{attributes}->{charset}); - replace_body($mime, $body); - return 1; - } elsif ($content_type =~ m!\bmultipart/!i) { - return strip_multipart($mime, $content_type, $filter); - } else { - $filter->reject(TEXT_ONLY) if $filter; - replace_body($mime, "$content_type message scrubbed"); - return 0; - } -} - -sub replace_part { - my ($mime, $part, $type) = ($_[0], $_[1], $_[3]); - # don't copy $_[2], that's the body (it may be huge) - - # Email::MIME insists on setting Date:, so just set it consistently - # to avoid conflicts to avoid git merge conflicts in a split brain - # situation. - unless (defined $part->header('Date')) { - my $date = $mime->header('Date') || - 'Thu, 01 Jan 1970 00:00:00 +0000'; - $part->header_set('Date', $date); - } - - $part->charset_set(undef); - $part->name_set(undef); - $part->filename_set(undef); - $part->format_set(undef); - $part->encoding_set('8bit'); - $part->disposition_set(undef); - $part->content_type_set($type); - $part->body_set($_[2]); -} - -# converts one part of a multipart message to text -sub html_part_to_text { - my ($mime, $part) = @_; - my $body = $part->body; - my $ct_parsed = parse_content_type($part->content_type); - dump_html(\$body, $ct_parsed->{attributes}->{charset}); - replace_part($mime, $part, $body, 'text/plain'); -} - -# modifies $_[0] in place -sub dump_html { - my ($body, $charset) = @_; - $charset ||= 'US-ASCII'; - my @cmd = qw(lynx -stdin -stderr -dump); - my $out = ""; - my $err = ""; - - # be careful about remote command injection! - if ($charset =~ /\A([A-Za-z0-9\-]+)\z/) { - push @cmd, "-assume_charset=$charset"; - } - if (IPC::Run::run(\@cmd, $body, \$out, \$err)) { - $out =~ s/\r\n/\n/sg; - $$body = $out; - } else { - # give them an ugly version: - $$body = "public-inbox HTML conversion failed: $err\n" . - $$body . "\n"; - } -} - -# this is to correct old archives during import. -sub strip_multipart { - my ($mime, $content_type, $filter) = @_; - - my (@html, @keep); - my $rejected = 0; - my $ok = 1; - - # scan through all parts once - $mime->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - - # some extensions are just bad, reject them outright - my $fn = $part->filename; - if (defined($fn) && $fn =~ $BAD_EXT) { - $filter->reject("Bad file type: $1") if $filter; - $rejected++; - return; - } - - my $part_type = $part->content_type || ''; - if ($part_type =~ m!\btext/plain\b!i) { - push @keep, $part; - } elsif ($part_type =~ $MIME_HTML) { - $filter->reject(NO_HTML) if $filter; - push @html, $part; - } elsif ($part_type =~ $MIME_TEXT_ANY) { - # Give other text attachments the benefit of the doubt, - # here? Could be source code or script the user wants - # help with. - - push @keep, $part; - } elsif ($part_type eq '' || - $part_type =~ m!\bapplication/octet-stream\b!i) { - # unfortunately, some mailers don't set correct types, - # let messages of unknown type through but do not - # change the sender-specified type - if (recheck_type_ok($part)) { - push @keep, $part; - } elsif ($filter) { - $filter->reject("Bad attachment: $part_type ". - TEXT_ONLY); - } else { - $rejected++; - } - } elsif ($part_type =~ m!\bapplication/pgp-signature\b!i) { - # PGP signatures are not huge, we may keep them. - # They can only be valid if it's the last element, - # so we keep them iff the message is unmodified: - if ($rejected == 0 && !@html) { - push @keep, $part; - } - } elsif ($filter) { - $filter->reject("unacceptable mime-type: $part_type ". - TEXT_ONLY); - } else { - # reject everything else, including non-PGP signatures - $rejected++; - } - }); - - if ($content_type =~ m!\bmultipart/alternative\b!i) { - if (scalar @keep == 1) { - return collapse($mime, $keep[0]); - } - } else { # convert HTML parts to plain text - foreach my $part (@html) { - html_part_to_text($mime, $part); - push @keep, $part; - } - } - - if (@keep == 0) { - @keep = (Email::MIME->create( - attributes => { - content_type => 'text/plain', - charset => 'US-ASCII', - encoding => '8bit', - }, - body_str => 'all attachments scrubbed by '. __PACKAGE__ - )); - $ok = 0; - } - if (scalar(@html) || $rejected) { - $mime->parts_set(\@keep); - $mime->body_set($mime->body_raw); - mark_changed($mime); - } # else: no changes - - return $ok; -} - -sub mark_changed { - my ($mime) = @_; - $mime->header_set('X-Content-Filtered-By', __PACKAGE__ ." $VERSION"); -} - -sub collapse { - my ($mime, $part) = @_; - $mime->header_set('Content-Type', $part->content_type); - $mime->body_set($part->body_raw); - my $cte = $part->header('Content-Transfer-Encoding'); - if (defined($cte) && $cte ne '') { - $mime->header_set('Content-Transfer-Encoding', $cte); - } - mark_changed($mime); - return 1; -} - -sub replace_body { - my $mime = $_[0]; - $mime->body_set($_[1]); - $mime->header_set('Content-Type', 'text/plain'); - if ($mime->header('Content-Transfer-Encoding')) { - $mime->header_set('Content-Transfer-Encoding', undef); - } - mark_changed($mime); -} - -# Check for display-able text, no messed up binaries -# Note: we can not rewrite the message with the detected mime type -sub recheck_type_ok { - my ($part) = @_; - my $s = $part->body; - ((length($s) < 0x10000) && ($s =~ /\A([[:print:]\s]+)\z/s)); -} - -1; -- cgit v1.2.3-24-ge0c7