about summary refs log tree commit homepage
path: root/lib/PublicInbox/Filter.pm
diff options
context:
space:
mode:
authorEric Wong <normalperson@yhbt.net>2014-01-09 23:13:37 +0000
committerEric Wong <e@80x24.org>2014-01-09 22:37:54 +0000
commit3e96cf129ba5fc2834b691314c504aa363fd5cf4 (patch)
treedd5c42532049bc5dd2a420126edb9f07e9a5b9a0 /lib/PublicInbox/Filter.pm
downloadpublic-inbox-3e96cf129ba5fc2834b691314c504aa363fd5cf4.tar.gz
Diffstat (limited to 'lib/PublicInbox/Filter.pm')
-rw-r--r--lib/PublicInbox/Filter.pm216
1 files changed, 216 insertions, 0 deletions
diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm
new file mode 100644
index 00000000..6cccd930
--- /dev/null
+++ b/lib/PublicInbox/Filter.pm
@@ -0,0 +1,216 @@
+# Copyright (C) 2013, Eric Wong <normalperson@yhbt.net> and all contributors
+# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
+#
+# This only exposes one function: run
+# Note: the settings here are highly opinionated.  Obviously, this is
+# Free Software (AGPLv3), so you may change it if you host yourself.
+package PublicInbox::Filter;
+use strict;
+use warnings;
+use Email::MIME;
+use Email::MIME::ContentType qw/parse_content_type/;
+use Email::Filter;
+use IPC::Open2;
+our $VERSION = '0.0.1';
+
+# start with the same defaults as mailman
+our $BAD_EXT = qr/\.(?:exe|bat|cmd|com|pif|scr|vbs|cpl)\z/i;
+
+# this is highly opinionated delivery
+# returns 0 only if there is nothing to deliver
+sub run {
+        my ($class, $simple) = @_;
+
+        my $content_type = $simple->header("Content-Type") || "text/plain";
+
+        # kill potentially bad/confusing headers
+        # Note: ssoma already does this, but since we mangle the message,
+        # we should do this before it gets to ssoma.
+        foreach my $d (qw(status lines content-length)) {
+                $simple->header_set($d);
+        }
+
+        if ($content_type =~ m!\btext/plain\b!i) {
+                return 1; # yay, nothing to do
+        } elsif ($content_type =~ m!\btext/html\b!i) {
+                # HTML-only, non-multipart
+                my $body = $simple->body;
+                my $ct_parsed = parse_content_type($content_type);
+                dump_html($body, $ct_parsed->{attributes}->{charset});
+                replace_body($simple, $body);
+                return 1;
+        } elsif ($content_type =~ m!\bmultipart/!i) {
+                return strip_multipart($simple, $content_type);
+        } else {
+                replace_body($simple, "$content_type message scrubbed");
+                return 0;
+        }
+}
+
+sub replace_part {
+        my ($simple, $part, $type) = ($_[0], $_[1], $_[3]);
+        # don't copy $_[2], that's the body (it may be huge)
+
+        # Email::MIME insists on setting Date:, so just set it consistently
+        # to avoid conflicts to avoid git merge conflicts in a split brain
+        # situation.
+        unless (defined $part->header('Date')) {
+                my $date = $simple->header('Date') ||
+                           'Thu, 01 Jan 1970 00:00:00 +0000';
+                $part->header_set('Date', $date);
+        }
+
+        $part->charset_set(undef);
+        $part->name_set(undef);
+        $part->filename_set(undef);
+        $part->format_set(undef);
+        $part->encoding_set('8bit');
+        $part->disposition_set(undef);
+        $part->content_type_set($type);
+        $part->body_set($_[2]);
+}
+
+# converts one part of a multipart message to text
+sub html_part_to_text {
+        my ($simple, $part) = @_;
+        my $body = $part->body;
+        my $ct_parsed = parse_content_type($part->content_type);
+        dump_html($body, $ct_parsed->{attributes}->{charset});
+        replace_part($simple, $part, $body, 'text/plain');
+}
+
+# modifies $_[0] in place
+sub dump_html {
+        my $charset = $_[1] || 'US-ASCII';
+        my $cmd = "lynx -stdin -dump";
+
+        # be careful about remote command injection!
+        if ($charset =~ /\A[A-Za-z0-9\-]+\z/) {
+                $cmd .= " -assume_charset=$charset";
+        }
+
+        my $pid = open2(my $out, my $in, $cmd);
+        print $in $_[0];
+        close $in;
+        {
+                local $/;
+                $_[0] = <$out>;
+        }
+        waitpid($pid, 0);
+}
+
+# this is to correct user errors and not expected to cover all corner cases
+# if users don't want to hit this, they should be sending text/plain messages
+# unfortunately, too many people send HTML mail and we'll attempt to convert
+# it to something safer, smaller and harder-to-track.
+sub strip_multipart {
+        my ($simple, $content_type) = @_;
+        my $mime = Email::MIME->new($simple->as_string);
+
+        my (@html, @keep);
+        my $rejected = 0;
+        my $ok = 1;
+
+        # scan through all parts once
+        $mime->walk_parts(sub {
+                my ($part) = @_;
+                return if $part->subparts; # walk_parts already recurses
+
+                # some extensions are just bad, reject them outright
+                my $fn = $part->filename;
+                if (defined($fn) && $fn =~ $BAD_EXT) {
+                        $rejected++;
+                        return;
+                }
+
+                my $part_type = $part->content_type;
+                if ($part_type =~ m!\btext/plain\b!i) {
+                        push @keep, $part;
+                } elsif ($part_type =~ m!\btext/html\b!i) {
+                        push @html, $part;
+                } elsif ($part_type =~ m!\btext/[a-z0-9\+\._-]+\b!i) {
+                        # Give other text attachments the benefit of the doubt,
+                        # here?  Could be source code or script the user wants
+                        # help with.
+
+                        push @keep, $part;
+                } else {
+                        # reject everything else
+                        #
+                        # Yes, we drop GPG/PGP signatures because:
+                        # * hardly anybody bothers to verify signatures
+                        # * we strip/convert HTML parts, which could invalidate
+                        #   the signature
+                        # * they increase the size of messages greatly
+                        #   (especially short ones)
+                        # * they do not compress well
+                        #
+                        # Instead, rely on soft verification measures:
+                        # * content of the message is most important
+                        # * we encourage Cc: all replies, so replies go to
+                        #   the original sender
+                        # * Received, User-Agent, and similar headers
+                        #   (this is also to encourage using self-hosted mail
+                        #   servers (using 100% Free Software, of course :)
+                        #
+                        # Furthermore, identity theft is uncommon in Free/Open
+                        # Source, even in communities where signatures are rare.
+                        $rejected++;
+                }
+        });
+
+        if ($content_type =~ m!\bmultipart/alternative\b!i) {
+                if (scalar @keep == 1) {
+                        return collapse($simple, $keep[0]);
+                }
+        } else { # convert HTML parts to plain text
+                foreach my $part (@html) {
+                        html_part_to_text($simple, $part);
+                        push @keep, $part;
+                }
+        }
+
+        if (@keep == 0) {
+                @keep = (Email::MIME->create(
+                        attributes => {
+                                content_type => 'text/plain',
+                                charset => 'US-ASCII',
+                                encoding => '8bit',
+                        },
+                        body_str => 'all attachments scrubbed by '. __PACKAGE__
+                ));
+                $ok = 0;
+        }
+        if (scalar(@html) || $rejected) {
+                $mime->parts_set(\@keep);
+                $simple->body_set($mime->body_raw);
+                mark_changed($simple);
+        } # else: no changes
+
+        return $ok;
+}
+
+sub mark_changed {
+        my ($simple) = @_;
+        $simple->header_set("X-Content-Filtered-By", __PACKAGE__ ." $VERSION");
+}
+
+sub collapse {
+        my ($simple, $part) = @_;
+        $simple->header_set("Content-Type", $part->content_type);
+        $simple->body_set($part->body_raw);
+        mark_changed($simple);
+        return 1;
+}
+
+sub replace_body {
+        my $simple = $_[0];
+        $simple->body_set($_[1]);
+        $simple->header_set("Content-Type", "text/plain");
+        if ($simple->header("Content-Transfer-Encoding")) {
+                $simple->header_set("Content-Transfer-Encoding", undef);
+        }
+        mark_changed($simple);
+}
+
+1;